2 months ago · 20a738a15a
--- a/scrapers/westtamar.rb
+++ b/scrapers/westtamar.rb
@@ -1,144 +1,167 @@
 
															 # West Tamar Council — Advertised Planning Applications
														
 
															+#
														
 
															+# Source: https://www.wtc.tas.gov.au/advertised-planning-applications/
														
 
															+#
														
 
															+# Page structure — all entries on one page, grouped by h2 headings:
														
 
															+#
														
 
															+#   <h2>92 Sunset Boulevard, Clarence Point</h2>
														
 
															+#   <p>
														
 
															+#     <strong>APPLICANT:</strong> J & E West<br>
														
 
															+#     <strong>PROPOSAL:</strong> Residential - Dwelling & Outbuilding<br>
														
 
															+#     <strong>LOCATION:</strong> 92 Sunset Boulevard, Clarence Point<br>
														
 
															+#     <strong>CLOSES:</strong> 5pm on 16 April 2026
														
 
															+#   </p>
														
 
															+#   <ul>
														
 
															+#     <li>Application Number: PA NO: 2025065</li>
														
 
															+#     <li>Closes 16 April 2026</li>
														
 
															+#   </ul>
														
 
															+#   <p><a href="https://assets.wtc.tas.gov.au/...PA2025065...pdf">Proposal description</a></p>
														
 
															 require "nokogiri"
														
 
															+require "uri"
														
 
															+require "fileutils"
														
 
															 require_relative "../lib/scraper_helpers"
														
 
															 require_relative "../lib/util"
														
 
															 require_relative "../lib/log"
														
 
															-TABLE    = ENV.fetch("TABLE_NAME")   # run_all.sh -> da_westtamar
														
 
															-URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
														
 
															+
														
 
															+TABLE                = ENV.fetch("TABLE_NAME")
														
 
															+URL                  = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
														
 
															+DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
														
 
															+DOWNLOAD_DIR         = ENV["DOWNLOAD_DIR"] || "/app/downloads"
														
 
															 DB.ensure_table!(TABLE)
														
 
															-REF_RX_SLASH  = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
														
 
															-REF_RX_HYPHEN = %r{\bDA\s*(\d{1,4})\s*-\s*(20\d{2})\b}i
														
 
															-def extract_ref(text)
														
 
															-  s = text.to_s
														
 
															-  if (m = s.match(REF_RX_SLASH))
														
 
															-    return "DA #{m[1]} / #{m[2]}"
														
 
															-  end
														
 
															-  if (m = s.match(REF_RX_HYPHEN))
														
 
															-    return "DA #{m[2]} / #{m[1]}"
														
 
															-  end
														
 
															-  if (m = s.match(/\bDA(20\d{2})(\d{3,})\b/i))
														
 
															-    return "DA #{m[1]} / #{m[2]}"
														
 
															-  end
														
 
															+def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
														
 
															+
														
 
															+def download_pdf(url, council_reference)
														
 
															+  return nil if url.to_s.strip.empty?
														
 
															+
														
 
															+  dir = File.join(DOWNLOAD_DIR, "westtamar", safe_name(council_reference))
														
 
															+  FileUtils.mkdir_p(dir)
														
 
															+
														
 
															+  fname = safe_name(File.basename(URI.parse(url).path))
														
 
															+  fname = "document.pdf" if fname.empty?
														
 
															+  path  = File.join(dir, fname)
														
 
															+
														
 
															+  body = Http.get(url)
														
 
															+  File.binwrite(path, body)
														
 
															+  puts "  saved #{fname} (#{body.bytesize} bytes)"
														
 
															+
														
 
															+  "/files/westtamar/#{safe_name(council_reference)}/#{fname}"
														
 
															+rescue StandardError => e
														
 
															+  Log.warn "westtamar", "Download failed for #{url}: #{e.class} #{e.message}"
														
 
															   nil
														
 
															 end
														
 
															-def extract_date_like(str)
														
 
															-  s = str.to_s
														
 
															-  return $1 if s =~ /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/
														
 
															-  return $1 if s =~ /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/
														
 
															-  return $1 if s =~ /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
														
 
															-  ""
														
 
															+# Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node
														
 
															+def parse_strong_labels(p_node)
														
 
															+  kv = {}
														
 
															+  return kv unless p_node
														
 
															+
														
 
															+  # Replace <br> with newlines so we can split cleanly
														
 
															+  html = p_node.inner_html.gsub(/<br\s*\/?>/i, "\n")
														
 
															+  Nokogiri::HTML.fragment(html).text.split("\n").each do |line|
														
 
															+    line = line.gsub(/\u00a0|\s+/, " ").strip
														
 
															+    next if line.empty?
														
 
															+    if (m = line.match(/\A([A-Z][A-Z\s]{1,20}):\s*(.+)\z/))
														
 
															+      kv[m[1].strip.upcase] = m[2].strip
														
 
															+    end
														
 
															+  end
														
 
															+  kv
														
 
															 end
														
 
															-def extract_on_notice_raw(text)
														
 
															-  s = text.to_s.gsub(/\s+/, " ")
														
 
															-  if s =~ /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
														
 
															-    d = extract_date_like($2)
														
 
															-    return d unless d.empty?
														
 
															-  end
														
 
															-  if s =~ /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
														
 
															-    d = extract_date_like($2)
														
 
															-    return d unless d.empty?
														
 
															+html     = Http.get(URL)
														
 
															+doc      = Nokogiri::HTML(html)
														
 
															+items    = []
														
 
															+
														
 
															+# Walk h2 elements; collect their following siblings until the next h2
														
 
															+doc.css("h2").each do |h2|
														
 
															+  sibling_nodes = []
														
 
															+  sib = h2.next_sibling
														
 
															+  while sib
														
 
															+    break if sib.element? && sib.name == "h2"
														
 
															+    sibling_nodes << sib if sib.element?
														
 
															+    sib = sib.next_sibling
														
 
															   end
														
 
															-  extract_date_like(s)
														
 
															-end
														
 
															-def parse_detail(url)
														
 
															-  html = Http.get(url)
														
 
															-  doc  = Nokogiri::HTML(html)
														
 
															+  next if sibling_nodes.empty?
														
 
															-  # Try two-column detail table first
														
 
															-  kv = {}
														
 
															-  doc.css("table tr").each do |tr|
														
 
															-    cells = tr.css("th, td")
														
 
															-    next unless cells.length >= 2
														
 
															-    key = cells[0].text.strip
														
 
															-    val = cells[1].text.strip
														
 
															-    kv[key] = val unless key.empty?
														
 
															-  end
														
 
															+  # Find the <p> containing APPLICANT/PROPOSAL/LOCATION/CLOSES labels
														
 
															+  label_p  = sibling_nodes.find { |n| n.name == "p" && n.text =~ /APPLICANT|PROPOSAL|LOCATION|CLOSES/i }
														
 
															+  kv       = parse_strong_labels(label_p)
														
 
															-  find = ->(rx) { kv.find { |k,_| k =~ rx }&.last.to_s.strip }
														
 
															+  # Find the <ul> containing the application number
														
 
															+  ul_node  = sibling_nodes.find { |n| n.name == "ul" }
														
 
															+  ul_text  = ul_node&.text.to_s.gsub(/\u00a0|\s+/, " ")
														
 
															-  council_reference = find.call(/(Application\s*(No|Number|ID)|Reference)/i)
														
 
															-  address           = find.call(/(Address|Location|Property)/i)
														
 
															-  description       = find.call(/(Proposal|Description)/i)
														
 
															-  on_notice_raw     = find.call(/(On\s*Notice\s*(until|to)|Closing\s*Date|Closes)/i)
														
 
															-  on_notice         = Util.parse_aus_date(on_notice_raw)
														
 
															-  title_reference   = doc.at_css("h1, .entry-title")&.text&.strip.to_s
														
 
															+  # Find the <p> with a PDF link
														
 
															+  pdf_p    = sibling_nodes.find { |n| n.name == "p" && n.at_css("a[href]") }
														
 
															+  pdf_link = pdf_p&.at_css("a[href]")
														
 
															-  # Fallbacks from page text if labels are missing
														
 
															-  if council_reference.empty?
														
 
															-    council_reference = extract_ref(title_reference) || extract_ref(doc.text)
														
 
															+  # --- Reference: "PA NO: 2025065" from ul, or filename ---
														
 
															+  ref = nil
														
 
															+  if (m = ul_text.to_s.match(/PA\s*(?:NO:?)?\s*(\d{5,})/i))
														
 
															+    ref = "PA #{m[1]}"
														
 
															   end
														
 
															-  address = title_reference if address.empty?
														
 
															-  description = "Development Application" if description.to_s.strip.empty?
														
 
															-  if on_notice.nil?
														
 
															-    guess = extract_on_notice_raw(doc.text)
														
 
															-    on_notice = Util.parse_aus_date(guess)
														
 
															-    on_notice_raw = guess if on_notice
														
 
															+  if ref.nil? && pdf_link
														
 
															+    href = pdf_link["href"].to_s
														
 
															+    ref  = href.match(/PA(\d{5,})/i)&.then { |mm| "PA #{mm[1]}" }
														
 
															   end
														
 
															-
														
 
															-  pdf = doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
														
 
															-  document_url = pdf ? abs_url(url, pdf) : ""
														
 
															-
														
 
															-  return nil if council_reference.empty? || address.empty?
														
 
															-
														
 
															-  {
														
 
															-    council_reference: council_reference,
														
 
															-    address: address,
														
 
															-    description: description,
														
 
															-    date_received: on_notice,
														
 
															-    date_received_raw: on_notice_raw.to_s,
														
 
															-    document_url: document_url,
														
 
															-    title_reference: title_reference
														
 
															+  next unless ref
														
 
															+
														
 
															+  # --- Address from LOCATION label, fallback to h2 text ---
														
 
															+  address = kv["LOCATION"] || kv["ADDRESS"] || h2.text.gsub(/\u00a0|\s+/, " ").strip
														
 
															+  next if address.empty?
														
 
															+
														
 
															+  # --- Other fields ---
														
 
															+  applicant   = kv["APPLICANT"].to_s
														
 
															+  description = kv["PROPOSAL"].to_s
														
 
															+  description = "Development Application" if description.empty?
														
 
															+
														
 
															+  closes_raw  = kv["CLOSES"].to_s
														
 
															+  # Strip time prefix: "5pm on 16 April 2026" → "16 April 2026"
														
 
															+  closes_raw  = closes_raw.sub(/\A.*?\bon\s+/i, "").strip
														
 
															+  # Also try list item: "Closes 16 April 2026"
														
 
															+  if closes_raw.empty? && (m = ul_text.match(/Closes?\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})/i))
														
 
															+    closes_raw = m[1]
														
 
															+  end
														
 
															+  on_notice_to = Util.parse_aus_date(closes_raw)
														
 
															+
														
 
															+  document_url = pdf_link ? abs_url(URL, pdf_link["href"].to_s) : ""
														
 
															+
														
 
															+  items << {
														
 
															+    council_reference: ref,
														
 
															+    address:           address,
														
 
															+    description:       description,
														
 
															+    applicant:         applicant,
														
 
															+    on_notice_to:      on_notice_to,
														
 
															+    on_notice_to_raw:  closes_raw,
														
 
															+    document_url:      document_url
														
 
															   }
														
 
															 end
														
 
															-list_html = Http.get(URL)
														
 
															-list_doc  = Nokogiri::HTML(list_html)
														
 
															-
														
 
															-detail_links = list_doc.css("article h2 a, .entry-content a").map { |a|
														
 
															-  href = a["href"].to_s
														
 
															-  next if href.strip.empty? || href.start_with?("#")
														
 
															-  abs_url(URL, href)
														
 
															-}.compact.uniq
														
 
															-
														
 
															-puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
														
 
															+puts "Found #{items.length} item(s) for #{TABLE}"
														
 
															-saved = 0
														
 
															-
														
 
															-detail_links.each do |u|
														
 
															-  begin
														
 
															-    item = parse_detail(u)
														
 
															-  rescue StandardError => e
														
 
															-    Log.warn "scraper", "Skip #{u}: #{e.class} #{e.message}"
														
 
															-    next
														
 
															-  end
														
 
															-  next unless item
														
 
															+items.each do |r|
														
 
															+  local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil
														
 
															   upsert_and_enrich!(
														
 
															     table: TABLE,
														
 
															     row: {
														
 
															-      description: item[:description],
														
 
															-      date_received: item[:date_received],
														
 
															-      date_received_raw: item[:date_received_raw],
														
 
															-      address: item[:address],
														
 
															-      council_reference: item[:council_reference],
														
 
															-      applicant: "",
														
 
															-      owner: ""
														
 
															+      council_reference: r[:council_reference],
														
 
															+      address:           r[:address],
														
 
															+      description:       r[:description],
														
 
															+      applicant:         r[:applicant],
														
 
															+      on_notice_to:      r[:on_notice_to],
														
 
															+      on_notice_to_raw:  r[:on_notice_to_raw],
														
 
															+      owner:             ""
														
 
															     },
														
 
															     extras: {
														
 
															-      document_url:    item[:document_url],
														
 
															-      on_notice_to:    item[:date_received],
														
 
															-      on_notice_to_raw: item[:date_received_raw],
														
 
															-      title_reference: item[:title_reference]
														
 
															+      document_url:       r[:document_url],
														
 
															+      local_document_url: local_url
														
 
															     }
														
 
															   )
														
 
															-  saved += 1
														
 
															 end
														
 
															-puts "Done #{TABLE}. Saved #{saved} item(s)."
														
 
															+puts "Done #{TABLE}. Saved #{items.length} item(s)."