Sfoglia il codice sorgente

launceston updates

Benjamin Harris 2 mesi fa
parent
commit
3fc874c
1 file modificato con 12 aggiunte e 7 eliminazioni
  1. 12 7
      scrapers/launcestoncity.rb

+ 12 - 7
scrapers/launcestoncity.rb

@@ -202,8 +202,13 @@ def probe_common_docs(base_url:, key:, danum:, referer:)
     "Onsite Notice ",  # trailing space variant seen on this site
   ]
 
-  prefix = "#{BASE_URL}/eProperty/Publicnotices/#{key}/#{danum_slug} - "
-  candidates = names.map { |n| "#{prefix}#{n}.pdf" }
+  # Build candidates with percent-encoded filenames (spaces → %20).
+  # URI.parse rejects bare spaces, so the filename portion must be encoded.
+  candidates = names.map do |n|
+    filename = "#{danum_slug} - #{n}.pdf"
+    encoded  = filename.gsub(" ", "%20")
+    "#{BASE_URL}/eProperty/Publicnotices/#{key}/#{encoded}"
+  end
 
   found = []
   candidates.each do |pdf_url|
@@ -330,6 +335,7 @@ tables.each do |t|
 			documents = [] if documents.nil?
 			anchors_added = 0
 			used_url = nil
+			probe_done = false  # ensure probe_common_docs fires at most once per DA
 
 			referers = [
 			  info_url, # details page
@@ -373,10 +379,9 @@ tables.each do |t|
 					anchors_added += 1
 				  end
 
-				  # Regex fallback
-				  if anchors_added == 0
-				  # Final fallback: probe known filenames directly
-				  # Extract KEY and DANUM from the original doc_list_url
+				  # Final fallback: probe known filenames directly (runs at most once per DA)
+				  if anchors_added == 0 && !probe_done
+					probe_done = true
 				  begin
 					u = URI.parse(doc_list_url)
 					q = URI.decode_www_form(u.query || "").to_h
@@ -387,7 +392,7 @@ tables.each do |t|
 						  base_url: BASE_URL,
 						  key: key,
 						  danum: danum,
-						  referer: doc_list_url # better context for this server
+						  referer: doc_list_url
 						)
 					  documents.concat(probed)
 					  anchors_added = probed.size if probed.any?