Benjamin Harris 2 miesiące temu
rodzic
commit
b5b53bc297
1 zmieniony plik z 19 dodaniami i 7 usunięciami
  1. 19 7
      scrapers/burnie.rb

+ 19 - 7
scrapers/burnie.rb

@@ -109,6 +109,9 @@ def http_get_with_cookies(url, jar:, headers: {}, referer: nil, site_fetch: "non
   body  = ""
 
   while limit > 0
+    limit -= 1
+    redirect_to = nil
+
     req = Net::HTTP::Get.new(uri, hdrs)
     Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
       resp = http.request(req)
@@ -118,14 +121,19 @@ def http_get_with_cookies(url, jar:, headers: {}, referer: nil, site_fetch: "non
       code = resp.code.to_i
 
       if [301, 302, 303, 307, 308].include?(code) && resp["location"]
-        uri = URI.join(uri, resp["location"])
-        limit -= 1
-        next
+        # Flag the redirect so the while loop can retry; `next` here only
+        # exits the Net::HTTP.start block, not the while loop.
+        redirect_to = URI.join(uri, resp["location"])
+      else
+        # For HTML we decompress; for PDF we only requested gzip/deflate off,
+        # so this remains identity unless server forces it (we still handle).
+        body = decompress(resp.body.to_s, enc)
       end
+    end
 
-      # For HTML we decompress; for PDF we only requested gzip/deflate off,
-      # so this remains identity unless server forces it (we still handle).
-      body = decompress(resp.body.to_s, enc)
+    if redirect_to
+      uri = redirect_to
+      next
     end
     break
   end
@@ -175,7 +183,11 @@ def first_pdf_on_detail(detail_url, jar)
   a = doc.at_css(".hyperlink-button-container a.ext-pdf") ||
       doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")
   return "" unless a
-  URI.join(detail_url, a["href"].to_s).to_s
+  href = a["href"].to_s
+  # Percent-encode non-ASCII characters (e.g. en-dash in filename) so URI.join
+  # doesn't raise URI::InvalidURIError. ASCII-safe characters are left as-is.
+  href = href.gsub(/[^\x00-\x7F]/) { |c| URI::DEFAULT_PARSER.escape(c) }
+  URI.join(detail_url, href).to_s
 rescue StandardError => e
   Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
   ""