Benjamin Harris před 2 měsíci
rodič
revize
94d2bbaa79
1 změněný soubor, 6 přidání a 4 odebrání
  1. 6 4
      scrapers/westtamar.rb

+ 6 - 4
scrapers/westtamar.rb

@@ -44,7 +44,7 @@ def download_pdf(url, council_reference)
   fname = "document.pdf" if fname.empty?
   path  = File.join(dir, fname)
 
-  body = Http.get(url)
+  body = Http.get(url, headers: { "Accept" => "application/pdf,*/*", "Referer" => URL })
   File.binwrite(path, body)
   puts "  saved #{fname} (#{body.bytesize} bytes)"
 
@@ -95,9 +95,11 @@ doc.css("h2").each do |h2|
   ul_node  = sibling_nodes.find { |n| n.name == "ul" }
   ul_text  = ul_node&.text.to_s.gsub(/\u00a0|\s+/, " ")
 
-  # Find the <p> with a PDF link
-  pdf_p    = sibling_nodes.find { |n| n.name == "p" && n.at_css("a[href]") }
-  pdf_link = pdf_p&.at_css("a[href]")
+  # PDF link lives inside a <li> within the <ul>
+  pdf_link = ul_node&.css("li a[href]")&.find { |a| a["href"].to_s =~ /\.pdf/i }
+  # Fallback: first <a> nested inside any sibling node whose href contains ".pdf"
+  pdf_link ||= sibling_nodes.flat_map { |n| n.css("a[href]").to_a }
+                             .find { |a| a["href"].to_s =~ /\.pdf/i }
 
   # --- Reference: "PA NO: 2025065" from ul, or filename ---
   ref = nil