Kaynağa Gözat

WestTamar Fix

Benjamin Harris 2 ay önce
ebeveyn
işleme
20a738a15a
1 değiştirilmiş dosya ile 130 ekleme ve 107 silme
  1. 130 107
      scrapers/westtamar.rb

+ 130 - 107
scrapers/westtamar.rb

@@ -1,144 +1,167 @@
 # West Tamar Council — Advertised Planning Applications
+#
+# Source: https://www.wtc.tas.gov.au/advertised-planning-applications/
+#
+# Page structure — all entries on one page, grouped by h2 headings:
+#
+#   <h2>92 Sunset Boulevard, Clarence Point</h2>
+#   <p>
+#     <strong>APPLICANT:</strong> J & E West<br>
+#     <strong>PROPOSAL:</strong> Residential - Dwelling & Outbuilding<br>
+#     <strong>LOCATION:</strong> 92 Sunset Boulevard, Clarence Point<br>
+#     <strong>CLOSES:</strong> 5pm on 16 April 2026
+#   </p>
+#   <ul>
+#     <li>Application Number: PA NO: 2025065</li>
+#     <li>Closes 16 April 2026</li>
+#   </ul>
+#   <p><a href="https://assets.wtc.tas.gov.au/...PA2025065...pdf">Proposal description</a></p>
 
 require "nokogiri"
+require "uri"
+require "fileutils"
 
 require_relative "../lib/scraper_helpers"
 require_relative "../lib/util"
 require_relative "../lib/log"
-TABLE    = ENV.fetch("TABLE_NAME")   # run_all.sh -> da_westtamar
-URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
+
+TABLE                = ENV.fetch("TABLE_NAME")
+URL                  = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
+DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
+DOWNLOAD_DIR         = ENV["DOWNLOAD_DIR"] || "/app/downloads"
 
 DB.ensure_table!(TABLE)
 
-REF_RX_SLASH  = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
-REF_RX_HYPHEN = %r{\bDA\s*(\d{1,4})\s*-\s*(20\d{2})\b}i
-def extract_ref(text)
-  s = text.to_s
-  if (m = s.match(REF_RX_SLASH))
-    return "DA #{m[1]} / #{m[2]}"
-  end
-  if (m = s.match(REF_RX_HYPHEN))
-    return "DA #{m[2]} / #{m[1]}"
-  end
-  if (m = s.match(/\bDA(20\d{2})(\d{3,})\b/i))
-    return "DA #{m[1]} / #{m[2]}"
-  end
+def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")  # each run of chars other than word chars, "-" and "." becomes one "_"
+
+def download_pdf(url, council_reference)  # saves url under DOWNLOAD_DIR/westtamar/<ref>/; returns a "/files/..." path string, or nil
+  return nil if url.to_s.strip.empty?  # nothing to download for a nil/blank url
+
+  dir = File.join(DOWNLOAD_DIR, "westtamar", safe_name(council_reference))
+  FileUtils.mkdir_p(dir)
+
+  fname = safe_name(File.basename(URI.parse(url).path))  # file name comes from the URL path, sanitised
+  fname = "document.pdf" if fname.empty?
+  path  = File.join(dir, fname)
+
+  body = Http.get(url)
+  File.binwrite(path, body)
+  puts "  saved #{fname} (#{body.bytesize} bytes)"
+
+  "/files/westtamar/#{safe_name(council_reference)}/#{fname}"  # return value on success
+rescue StandardError => e  # best-effort: any StandardError is logged and the method returns nil
+  Log.warn "westtamar", "Download failed for #{url}: #{e.class} #{e.message}"
   nil
 end
 
-def extract_date_like(str)
-  s = str.to_s
-  return $1 if s =~ /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/
-  return $1 if s =~ /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/
-  return $1 if s =~ /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
-  ""
+# Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node into a Hash of label => value ({} when the node is nil)
+def parse_strong_labels(p_node)
+  kv = {}
+  return kv unless p_node
+
+  # Replace <br> with newlines so each label/value pair ends up on its own line
+  html = p_node.inner_html.gsub(/<br\s*\/?>/i, "\n")
+  Nokogiri::HTML.fragment(html).text.split("\n").each do |line|
+    line = line.gsub(/\u00a0|\s+/, " ").strip  # each NBSP and each whitespace run becomes a single space
+    next if line.empty?
+    if (m = line.match(/\A([A-Z][A-Z\s]{1,20}):\s*(.+)\z/))  # label: 2-21 upper-case letters/whitespace before ":"; lower-case labels are skipped
+      kv[m[1].strip.upcase] = m[2].strip
+    end
+  end
+  kv
 end
 
-def extract_on_notice_raw(text)
-  s = text.to_s.gsub(/\s+/, " ")
-  if s =~ /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
-    d = extract_date_like($2)
-    return d unless d.empty?
-  end
-  if s =~ /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
-    d = extract_date_like($2)
-    return d unless d.empty?
+html     = Http.get(URL)
+doc      = Nokogiri::HTML(html)
+items    = []
+
+# Walk h2 elements; collect each one's following element siblings (text nodes are skipped) up to the next h2
+doc.css("h2").each do |h2|
+  sibling_nodes = []
+  sib = h2.next_sibling
+  while sib
+    break if sib.element? && sib.name == "h2"
+    sibling_nodes << sib if sib.element?
+    sib = sib.next_sibling
   end
-  extract_date_like(s)
-end
 
-def parse_detail(url)
-  html = Http.get(url)
-  doc  = Nokogiri::HTML(html)
+  next if sibling_nodes.empty?
 
-  # Try two-column detail table first
-  kv = {}
-  doc.css("table tr").each do |tr|
-    cells = tr.css("th, td")
-    next unless cells.length >= 2
-    key = cells[0].text.strip
-    val = cells[1].text.strip
-    kv[key] = val unless key.empty?
-  end
+  # Find the <p> containing APPLICANT/PROPOSAL/LOCATION/CLOSES labels
+  label_p  = sibling_nodes.find { |n| n.name == "p" && n.text =~ /APPLICANT|PROPOSAL|LOCATION|CLOSES/i }
+  kv       = parse_strong_labels(label_p)
 
-  find = ->(rx) { kv.find { |k,_| k =~ rx }&.last.to_s.strip }
+  # Find the first <ul> in the section; it is expected to hold the application number
+  ul_node  = sibling_nodes.find { |n| n.name == "ul" }
+  ul_text  = ul_node&.text.to_s.gsub(/\u00a0|\s+/, " ")
 
-  council_reference = find.call(/(Application\s*(No|Number|ID)|Reference)/i)
-  address           = find.call(/(Address|Location|Property)/i)
-  description       = find.call(/(Proposal|Description)/i)
-  on_notice_raw     = find.call(/(On\s*Notice\s*(until|to)|Closing\s*Date|Closes)/i)
-  on_notice         = Util.parse_aus_date(on_notice_raw)
-  title_reference   = doc.at_css("h1, .entry-title")&.text&.strip.to_s
+  # Find the first <p> holding any link (a[href]); it is expected to be the PDF link
+  pdf_p    = sibling_nodes.find { |n| n.name == "p" && n.at_css("a[href]") }
+  pdf_link = pdf_p&.at_css("a[href]")
 
-  # Fallbacks from page text if labels are missing
-  if council_reference.empty?
-    council_reference = extract_ref(title_reference) || extract_ref(doc.text)
+  # --- Reference: "PA NO: 2025065" from the ul text, or "PA<digits>" in the link href ---
+  ref = nil
+  if (m = ul_text.to_s.match(/PA\s*(?:NO:?)?\s*(\d{5,})/i))
+    ref = "PA #{m[1]}"
   end
-  address = title_reference if address.empty?
-  description = "Development Application" if description.to_s.strip.empty?
-  if on_notice.nil?
-    guess = extract_on_notice_raw(doc.text)
-    on_notice = Util.parse_aus_date(guess)
-    on_notice_raw = guess if on_notice
+  if ref.nil? && pdf_link
+    href = pdf_link["href"].to_s
+    ref  = href.match(/PA(\d{5,})/i)&.then { |mm| "PA #{mm[1]}" }
   end
-
-  pdf = doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
-  document_url = pdf ? abs_url(url, pdf) : ""
-
-  return nil if council_reference.empty? || address.empty?
-
-  {
-    council_reference: council_reference,
-    address: address,
-    description: description,
-    date_received: on_notice,
-    date_received_raw: on_notice_raw.to_s,
-    document_url: document_url,
-    title_reference: title_reference
+  next unless ref
+
+  # --- Address from LOCATION label, fallback to h2 text ---
+  address = kv["LOCATION"] || kv["ADDRESS"] || h2.text.gsub(/\u00a0|\s+/, " ").strip
+  next if address.empty?
+
+  # --- Other fields ---
+  applicant   = kv["APPLICANT"].to_s
+  description = kv["PROPOSAL"].to_s
+  description = "Development Application" if description.empty?
+
+  closes_raw  = kv["CLOSES"].to_s
+  # Strip time prefix: "5pm on 16 April 2026" → "16 April 2026"
+  closes_raw  = closes_raw.sub(/\A.*?\bon\s+/i, "").strip
+  # Fall back to the list item ("Closes 16 April 2026") only when CLOSES gave nothing
+  if closes_raw.empty? && (m = ul_text.match(/Closes?\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})/i))
+    closes_raw = m[1]
+  end
+  on_notice_to = Util.parse_aus_date(closes_raw)
+
+  document_url = pdf_link ? abs_url(URL, pdf_link["href"].to_s) : ""
+
+  items << {
+    council_reference: ref,
+    address:           address,
+    description:       description,
+    applicant:         applicant,
+    on_notice_to:      on_notice_to,
+    on_notice_to_raw:  closes_raw,
+    document_url:      document_url
   }
 end
 
-list_html = Http.get(URL)
-list_doc  = Nokogiri::HTML(list_html)
-
-detail_links = list_doc.css("article h2 a, .entry-content a").map { |a|
-  href = a["href"].to_s
-  next if href.strip.empty? || href.start_with?("#")
-  abs_url(URL, href)
-}.compact.uniq
-
-puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
+puts "Found #{items.length} item(s) for #{TABLE}"
 
-saved = 0
-
-detail_links.each do |u|
-  begin
-    item = parse_detail(u)
-  rescue StandardError => e
-    Log.warn "scraper", "Skip #{u}: #{e.class} #{e.message}"
-    next
-  end
-  next unless item
+items.each do |r|
+  local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil
 
   upsert_and_enrich!(
     table: TABLE,
     row: {
-      description: item[:description],
-      date_received: item[:date_received],
-      date_received_raw: item[:date_received_raw],
-      address: item[:address],
-      council_reference: item[:council_reference],
-      applicant: "",
-      owner: ""
+      council_reference: r[:council_reference],
+      address:           r[:address],
+      description:       r[:description],
+      applicant:         r[:applicant],
+      on_notice_to:      r[:on_notice_to],
+      on_notice_to_raw:  r[:on_notice_to_raw],
+      owner:             ""
     },
     extras: {
-      document_url:    item[:document_url],
-      on_notice_to:    item[:date_received],
-      on_notice_to_raw: item[:date_received_raw],
-      title_reference: item[:title_reference]
+      document_url:       r[:document_url],
+      local_document_url: local_url
     }
   )
-  saved += 1
 end
 
-puts "Done #{TABLE}. Saved #{saved} item(s)."
+puts "Done #{TABLE}. Saved #{items.length} item(s)."