|
|
@@ -109,6 +109,9 @@ def http_get_with_cookies(url, jar:, headers: {}, referer: nil, site_fetch: "non
|
|
|
body = ""
|
|
|
|
|
|
while limit > 0
|
|
|
+ limit -= 1
|
|
|
+ redirect_to = nil
|
|
|
+
|
|
|
req = Net::HTTP::Get.new(uri, hdrs)
|
|
|
Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
|
|
|
resp = http.request(req)
|
|
|
@@ -118,14 +121,19 @@ def http_get_with_cookies(url, jar:, headers: {}, referer: nil, site_fetch: "non
|
|
|
code = resp.code.to_i
|
|
|
|
|
|
if [301, 302, 303, 307, 308].include?(code) && resp["location"]
|
|
|
- uri = URI.join(uri, resp["location"])
|
|
|
- limit -= 1
|
|
|
- next
|
|
|
+ # Flag the redirect so the while loop can retry; `next` here only
|
|
|
+ # exits the Net::HTTP.start block, not the while loop.
|
|
|
+ redirect_to = URI.join(uri, resp["location"])
|
|
|
+ else
|
|
|
+ # For HTML we decompress; for PDF we only requested gzip/deflate off,
|
|
|
+ # so this remains identity unless server forces it (we still handle).
|
|
|
+ body = decompress(resp.body.to_s, enc)
|
|
|
end
|
|
|
+ end
|
|
|
|
|
|
- # For HTML we decompress; for PDF we only requested gzip/deflate off,
|
|
|
- # so this remains identity unless server forces it (we still handle).
|
|
|
- body = decompress(resp.body.to_s, enc)
|
|
|
+ if redirect_to
|
|
|
+ uri = redirect_to
|
|
|
+ next
|
|
|
end
|
|
|
break
|
|
|
end
|
|
|
@@ -175,7 +183,11 @@ def first_pdf_on_detail(detail_url, jar)
|
|
|
a = doc.at_css(".hyperlink-button-container a.ext-pdf") ||
|
|
|
doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")
|
|
|
return "" unless a
|
|
|
- URI.join(detail_url, a["href"].to_s).to_s
|
|
|
+ href = a["href"].to_s
|
|
|
+ # Percent-encode non-ASCII characters (e.g. en-dash in filename) so URI.join
|
|
|
+ # doesn't raise URI::InvalidURIError. ASCII-safe characters are left as-is.
|
|
|
+ href = href.gsub(/[^\x00-\x7F]/) { |c| URI::DEFAULT_PARSER.escape(c) }
|
|
|
+ URI.join(detail_url, href).to_s
|
|
|
rescue StandardError => e
|
|
|
Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
|
|
|
""
|