|
|
@@ -44,7 +44,7 @@ def download_pdf(url, council_reference)
|
|
|
fname = "document.pdf" if fname.empty?
|
|
|
path = File.join(dir, fname)
|
|
|
|
|
|
- body = Http.get(url)
|
|
|
+ body = Http.get(url, headers: { "Accept" => "application/pdf,*/*", "Referer" => URL })
|
|
|
File.binwrite(path, body)
|
|
|
puts " saved #{fname} (#{body.bytesize} bytes)"
|
|
|
|
|
|
@@ -95,9 +95,11 @@ doc.css("h2").each do |h2|
|
|
|
ul_node = sibling_nodes.find { |n| n.name == "ul" }
|
|
|
ul_text = ul_node&.text.to_s.gsub(/\u00a0|\s+/, " ")
|
|
|
|
|
|
- # Find the <p> with a PDF link
|
|
|
- pdf_p = sibling_nodes.find { |n| n.name == "p" && n.at_css("a[href]") }
|
|
|
- pdf_link = pdf_p&.at_css("a[href]")
|
|
|
+ # PDF link lives inside a <li> within the <ul>
|
|
|
+ pdf_link = ul_node&.css("li a[href]")&.find { |a| a["href"].to_s =~ /\.pdf/i }
|
|
|
+ # Fallback: any <a> nested in the section's nodes whose href contains ".pdf" (case-insensitive)
|
|
|
+ pdf_link ||= sibling_nodes.flat_map { |n| n.css("a[href]").to_a }
|
|
|
+ .find { |a| a["href"].to_s =~ /\.pdf/i }
|
|
|
|
|
|
# --- Reference: "PA NO: 2025065" from ul, or filename ---
|
|
|
ref = nil
|