2 ay önce · 20a738a15a
--- a/scrapers/westtamar.rb
+++ b/scrapers/westtamar.rb
@@ -1,144 +1,167 @@
 
				 # West Tamar Council — Advertised Planning Applications
			
 
				+#
			
 
				+# Source: https://www.wtc.tas.gov.au/advertised-planning-applications/
			
 
				+#
			
 
				+# Page structure — all entries on one page, grouped by h2 headings:
			
 
				+#
			
 
				+#   <h2>92 Sunset Boulevard, Clarence Point</h2>
			
 
				+#   <p>
			
 
				+#     <strong>APPLICANT:</strong> J & E West<br>
			
 
				+#     <strong>PROPOSAL:</strong> Residential - Dwelling & Outbuilding<br>
			
 
				+#     <strong>LOCATION:</strong> 92 Sunset Boulevard, Clarence Point<br>
			
 
				+#     <strong>CLOSES:</strong> 5pm on 16 April 2026
			
 
				+#   </p>
			
 
				+#   <ul>
			
 
				+#     <li>Application Number: PA NO: 2025065</li>
			
 
				+#     <li>Closes 16 April 2026</li>
			
 
				+#   </ul>
			
 
				+#   <p><a href="https://assets.wtc.tas.gov.au/...PA2025065...pdf">Proposal description</a></p>
			
 
				 
			
 
				 require "nokogiri"
			
 
				+require "uri"
			
 
				+require "fileutils"
			
 
				 
			
 
				 require_relative "../lib/scraper_helpers"
			
 
				 require_relative "../lib/util"
			
 
				 require_relative "../lib/log"
			
 
				-TABLE    = ENV.fetch("TABLE_NAME")   # run_all.sh -> da_westtamar
			
 
				-URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
			
 
				+
			
 
				+TABLE                = ENV.fetch("TABLE_NAME")
			
 
				+URL                  = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
			
 
				+DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
			
 
				+DOWNLOAD_DIR         = ENV["DOWNLOAD_DIR"] || "/app/downloads"
			
 
				 
			
 
				 DB.ensure_table!(TABLE)
			
 
				 
			
 
				-REF_RX_SLASH  = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
			
 
				-REF_RX_HYPHEN = %r{\bDA\s*(\d{1,4})\s*-\s*(20\d{2})\b}i
			
 
				-def extract_ref(text)
			
 
				-  s = text.to_s
			
 
				-  if (m = s.match(REF_RX_SLASH))
			
 
				-    return "DA #{m[1]} / #{m[2]}"
			
 
				-  end
			
 
				-  if (m = s.match(REF_RX_HYPHEN))
			
 
				-    return "DA #{m[2]} / #{m[1]}"
			
 
				-  end
			
 
				-  if (m = s.match(/\bDA(20\d{2})(\d{3,})\b/i))
			
 
				-    return "DA #{m[1]} / #{m[2]}"
			
 
				-  end
			
 
				# Turn arbitrary text into a string usable as a single path component.
				#
				# Every run of characters other than word characters, "-" and "." is
				# collapsed into one "_". A result consisting only of dots ("." or "..")
				# would act as a directory reference when joined into a path, so it is
				# replaced by "_".
				#
				# @param s [#to_s] raw text (nil is treated as "")
				# @return [String] sanitised name; "" when the input is empty
				def safe_name(s)
				  cleaned = s.to_s.gsub(/[^\w\-.]+/, "_")
				  cleaned.match?(/\A\.+\z/) ? "_" : cleaned
				end
			
 
				+
			
 
				# Download the document at +url+ and store it under
				# DOWNLOAD_DIR/westtamar/<sanitised reference>/<sanitised file name>.
				#
				# The file name comes from the last segment of the URL path; when the path
				# has no usable segment ("", "/", "." or "..") "document.pdf" is used.
				# The response is fetched before the directory is created so a failed
				# download does not leave an empty directory behind, and an empty response
				# is treated as a failure rather than written as a zero-byte file.
				#
				# @param url [String, nil] absolute URL of the document
				# @param council_reference [String] reference used as the sub-directory name
				# @return [String, nil] the "/files/westtamar/..." path of the saved file,
				#   or nil when +url+ is blank or anything goes wrong (failures are logged)
				def download_pdf(url, council_reference)
				  return nil if url.to_s.strip.empty?

				  ref_dir = safe_name(council_reference)
				  base    = File.basename(URI.parse(url).path.to_s)
				  # "/" would sanitise to "_", and "."/".." are directory references.
				  fname   = %w[/ . ..].include?(base) ? "" : safe_name(base)
				  fname   = "document.pdf" if fname.empty?

				  body = Http.get(url)
				  if body.to_s.empty?
				    Log.warn "westtamar", "Download failed for #{url}: empty response"
				    return nil
				  end

				  dir = File.join(DOWNLOAD_DIR, "westtamar", ref_dir)
				  FileUtils.mkdir_p(dir)
				  File.binwrite(File.join(dir, fname), body)
				  puts "  saved #{fname} (#{body.bytesize} bytes)"

				  "/files/westtamar/#{ref_dir}/#{fname}"
				rescue StandardError => e
				  # Best effort: a failed attachment must not abort the scrape.
				  Log.warn "westtamar", "Download failed for #{url}: #{e.class} #{e.message}"
				  nil
				end
			
 
				 
			
 
				-def extract_date_like(str)
			
 
				-  s = str.to_s
			
 
				-  return $1 if s =~ /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/
			
 
				-  return $1 if s =~ /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/
			
 
				-  return $1 if s =~ /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
			
 
				-  ""
			
 
				+# Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node
			
 
				# Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node.
				#
				# Each <br> is treated as a line break. Every resulting line of the form
				# "LABEL: value" (label = 2-21 letters/spaces) yields one entry. Labels are
				# matched regardless of case and stored upper-cased, so "Applicant:" and
				# "APPLICANT:" both end up under "APPLICANT". Lines that do not look like a
				# label/value pair are ignored; a repeated label keeps the last value.
				#
				# @param p_node [#inner_html, nil] the paragraph node, or nil
				# @return [Hash{String => String}] upper-cased label => trimmed value
				#   (empty when +p_node+ is nil)
				def parse_strong_labels(p_node)
				  kv = {}
				  return kv unless p_node

				  # Replace <br> with newlines so we can split cleanly
				  html = p_node.inner_html.gsub(/<br\s*\/?>/i, "\n")
				  Nokogiri::HTML.fragment(html).text.split("\n").each do |line|
				    # Collapse any mix of NBSP and whitespace into a single space.
				    line = line.gsub(/[\u00a0\s]+/, " ").strip
				    next if line.empty?

				    m = line.match(/\A([A-Z][A-Z\s]{1,20}):\s*(.+)\z/i)
				    kv[m[1].strip.upcase] = m[2].strip if m
				  end
				  kv
				end
			
 
				 
			
 
				-def extract_on_notice_raw(text)
			
 
				-  s = text.to_s.gsub(/\s+/, " ")
			
 
				-  if s =~ /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
			
 
				-    d = extract_date_like($2)
			
 
				-    return d unless d.empty?
			
 
				-  end
			
 
				-  if s =~ /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
			
 
				-    d = extract_date_like($2)
			
 
				-    return d unless d.empty?
			
 
				# --- Scrape -------------------------------------------------------------
				# Fetch the single listing page and build one item per <h2> section.
				html  = Http.get(URL)
				doc   = Nokogiri::HTML(html)
				items = []

				# Walk h2 elements; collect their following element siblings until the next h2
				doc.css("h2").each do |h2|
				  sibling_nodes = []
				  sib = h2.next_sibling
				  while sib
				    break if sib.element? && sib.name == "h2"
				    sibling_nodes << sib if sib.element?
				    sib = sib.next_sibling
				  end

				  next if sibling_nodes.empty?

				  paragraphs = sibling_nodes.select { |n| n.name == "p" }

				  # Find the <p> containing APPLICANT/PROPOSAL/LOCATION/CLOSES labels
				  label_p = paragraphs.find { |n| n.text =~ /APPLICANT|PROPOSAL|LOCATION|CLOSES/i }
				  kv      = parse_strong_labels(label_p)

				  # Find the <ul> containing the application number
				  ul_node = sibling_nodes.find { |n| n.name == "ul" }
				  ul_text = ul_node&.text.to_s.gsub(/\u00a0|\s+/, " ")

				  # Document link: prefer a link that points at a PDF; otherwise fall back to
				  # the first link found in any paragraph of the section.
				  pdf_link = paragraphs.filter_map { |n| n.at_css("a[href*='.pdf']") }.first ||
				             paragraphs.filter_map { |n| n.at_css("a[href]") }.first

				  # --- Reference: "PA NO: 2025065" from ul, or filename ---
				  ref = nil
				  if (m = ul_text.match(/PA\s*(?:NO:?)?\s*(\d{5,})/i))
				    ref = "PA #{m[1]}"
				  end
				  if ref.nil? && pdf_link
				    href = pdf_link["href"].to_s
				    ref  = href.match(/PA(\d{5,})/i)&.then { |mm| "PA #{mm[1]}" }
				  end
				  next unless ref

				  # --- Address from LOCATION label, fallback to h2 text ---
				  address = kv["LOCATION"] || kv["ADDRESS"] || h2.text.gsub(/\u00a0|\s+/, " ").strip
				  next if address.empty?

				  # --- Other fields ---
				  applicant   = kv["APPLICANT"].to_s
				  description = kv["PROPOSAL"].to_s
				  description = "Development Application" if description.empty?

				  # Strip time prefix: "5pm on 16 April 2026" → "16 April 2026"
				  closes_raw = kv["CLOSES"].to_s.sub(/\A.*?\bon\s+/i, "").strip
				  # List item form: "Closes 16 April 2026"
				  ul_closes  = ul_text[/Closes?\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})/i, 1]
				  closes_raw = ul_closes if closes_raw.empty? && ul_closes

				  on_notice_to = Util.parse_aus_date(closes_raw)
				  # If the label text did not parse, retry with the list-item date.
				  # NOTE(review): assumes Util.parse_aus_date returns nil when it cannot
				  # parse its input — confirm against lib/util.
				  if on_notice_to.nil? && ul_closes && ul_closes != closes_raw
				    ul_date = Util.parse_aus_date(ul_closes)
				    closes_raw, on_notice_to = ul_closes, ul_date if ul_date
				  end

				  document_url = pdf_link ? abs_url(URL, pdf_link["href"].to_s) : ""

				  items << {
				    council_reference: ref,
				    address:           address,
				    description:       description,
				    applicant:         applicant,
				    on_notice_to:      on_notice_to,
				    on_notice_to_raw:  closes_raw,
				    document_url:      document_url
				  }
				end

				puts "Found #{items.length} item(s) for #{TABLE}"

				# --- Persist ------------------------------------------------------------
				# Optionally download each attachment, then upsert the row.
				items.each do |r|
				  local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil

				  upsert_and_enrich!(
				    table: TABLE,
				    row: {
				      council_reference: r[:council_reference],
				      address:           r[:address],
				      description:       r[:description],
				      applicant:         r[:applicant],
				      on_notice_to:      r[:on_notice_to],
				      on_notice_to_raw:  r[:on_notice_to_raw],
				      owner:             ""
				    },
				    extras: {
				      document_url:       r[:document_url],
				      local_document_url: local_url
				    }
				  )
				end

				puts "Done #{TABLE}. Saved #{items.length} item(s)."