Ver código fonte

updated scrapers

Benjamin Harris 2 meses atrás
pai
commit
cee1e18745

+ 7 - 1
.claude/settings.local.json

@@ -22,7 +22,13 @@
       "Bash(wc -l /f/GIT_REPO/tas_councils/lib/*.rb /f/GIT_REPO/tas_councils/scrapers/*.rb)",
       "Bash(grep -n \"def.*$\" /f/GIT_REPO/tas_councils/lib/util.rb)",
       "Bash(grep -l \"require.*log\\\\b\" f:/GIT_REPO/tas_councils/scrapers/*.rb)",
-      "Bash(grep -l \"require.*scraper_helpers\\\\b\" f:/GIT_REPO/tas_councils/scrapers/*.rb)"
+      "Bash(grep -l \"require.*scraper_helpers\\\\b\" f:/GIT_REPO/tas_councils/scrapers/*.rb)",
+      "Bash(grep -l \"Log\\\\.\" *.rb)",
+      "Bash(grep -l 'require_relative.*lib/log' *.rb)",
+      "Bash(grep -l \"upsert_and_enrich!\" *.rb)",
+      "Bash(grep -l \"def abs_url\" *.rb)",
+      "WebFetch(domain:www.southernmidlands.tas.gov.au)",
+      "Bash(python3 -)"
     ]
   }
 }

+ 29 - 0
VERSIONS.md

@@ -5,6 +5,35 @@ Entries are grouped by push/session in reverse-chronological order.
 
 ---
 
+## 2026-04-13 — Scraper Fixes & Audit
+
+**`scrapers/planbuild.rb`** — rewrote to fix crash on first item:
+- Added missing `require "zlib"`, `require "stringio"`, `require_relative "../lib/log"`
+- `fetch_detail` now always returns a Hash (`parsed.is_a?(Hash) ? parsed : {}`); bare `rescue {}` replaced with `rescue JSON::ParserError, Zlib::Error`
+- Removed debug `puts` — replaced with `Log.debug`/`Log.info`
+- `local_document_url` now passes `nil` (not `""`) when there are no downloads — prevents COALESCE from overwriting an existing URL with an empty string
+- Per-item rescue so one bad reference skips and logs rather than killing the run
+
+**`scrapers/southernmidlands.rb`** — rewrote detail page parser:
+- Detail pages use `Location: / Proposal:` paragraph format, not table rows — old `table tr th/td` selector found nothing, causing 0 saves
+- New parser splits `<br>`-separated lines per paragraph, extracts Location/Proposal fields, handles multiple DAs per item page
+- Removed redundant `ALTER TABLE` block (columns already in `DB.ensure_table!`)
+- Added explicit `require_relative "../lib/http"`, `"../lib/db"`, `"../lib/util"`
+
+**Missing `require_relative "../lib/log"` — 20 scrapers fixed:**
+- `break_oday`, `brighton`, `burnie`, `centralcoast`, `circularhead`, `clarence`, `derwentvalley`, `devonportcity`, `dorset`, `flinders_council`, `glenorchy`, `huonvalley`, `kentish`, `launcestoncity`, `meandervalley`, `northernmidlands`, `southernmidlands`, `waratah_wynyard`, `westcoast`, `westtamar`
+- `Log.warn` is called in rescue blocks in all of these — without the require, the first error would raise `NameError: uninitialized constant Log` instead of being logged
+
+**`enrich_after_upsert!` variable scope bugs — 4 scrapers fixed:**
+- `flinders_council.rb`: `council_reference` (undefined) → `ref`; folded separate `UPDATE document_url` into `DB.upsert`; removed redundant `ALTER TABLE`
+- `huonvalley.rb`: `council_reference`/`address` (undefined) → `r[:council_reference]`/`r[:address]`; folded `UPDATE document_url` into upsert; removed redundant `ALTER TABLE`
+- `kentish.rb`: `council_reference`/`address` (undefined) → `r[:council_reference]`/`r[:address]`; folded extras UPDATE into upsert
+- `westcoast.rb`: `address` (undefined) → `item[:address]`; fixed upsert field names (`on_notice` → `on_notice_to`, `on_notice_raw` → `on_notice_to_raw`); fixed values referencing non-existent item keys; folded extras UPDATE into upsert
+
+**Redundant `ALTER TABLE` blocks removed** from `circularhead.rb` and `waratah_wynyard.rb` — all columns already created by `DB.ensure_table!`
+
+---
+
 ## 2026-04-13 — Code Quality Pass 3
 
 **Logging**

+ 2 - 5
scrapers/break_oday.rb

@@ -4,12 +4,9 @@
 require "nokogiri"
 require "cgi"
 require "uri"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_break_oday
 URL   = "https://www.bodc.tas.gov.au/council/advertised-development-applications/"
 

+ 2 - 5
scrapers/brighton.rb

@@ -3,12 +3,9 @@ require "date"
 require "nokogiri"
 require "cgi"
 require "fileutils"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets this from filename: da_brighton
 URL   = "https://www.brighton.tas.gov.au/planning/advertised-development-applications/"
 

+ 2 - 5
scrapers/burnie.rb

@@ -11,12 +11,9 @@ require "stringio"
 require "base64"
 require "securerandom"
 
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE    = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_burnie
 BASE_URL = "https://www.burnie.tas.gov.au"
 URL      = "#{BASE_URL}/Development/Planning/Permit-applications-on-exhibition"

+ 2 - 5
scrapers/centralcoast.rb

@@ -5,12 +5,9 @@ require "date"
 require "nokogiri"
 require "cgi"
 require "fileutils"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_centralcoast
 URL   = "https://www.centralcoast.tas.gov.au/current-planning-applications/"
 

+ 2 - 12
scrapers/circularhead.rb

@@ -1,24 +1,14 @@
 # Circular Head Council — Planning page list scraper
 
 require "nokogiri"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_circularhead
 URL   = "https://www.circularhead.tas.gov.au/council-services/development/planning"
 
 DB.ensure_table!(TABLE)
 
-# Optional columns for extras
-begin
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url VARCHAR(1024) NULL")
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS title_reference TEXT NULL")
-rescue StandardError => e
-  Log.warn "scraper", "Optional column add skipped: #{e.class} #{e.message}"
-end
-
 def abs_url(base, href)
   return "" if href.to_s.strip.empty?
   URI.join(base, href).to_s rescue href.to_s

+ 2 - 5
scrapers/clarence.rb

@@ -6,12 +6,9 @@ require "cgi"
 require "uri"
 require "date"
 require "fileutils"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_clarence
 URL   = "https://www.ccc.tas.gov.au/development/advertised-plans/"
 

+ 2 - 3
scrapers/derwentvalley.rb

@@ -3,10 +3,9 @@
 # Fallback list (Public Notice posts): https://www.derwentvalley.tas.gov.au/home/latest-news?f.News+category%7CnewsCategory=Public+Notice
 
 require "nokogiri"
-require_relative "../lib/http"
-require_relative "../lib/util"
-require_relative "../lib/scraper_helpers"
 
+require_relative "../lib/scraper_helpers"
+require_relative "../lib/log"
 TABLE        = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_derwentvalley
 LIST_URL     = "https://www.derwentvalley.tas.gov.au/home/card-listing/development-applications"
 NEWS_URL     = "https://www.derwentvalley.tas.gov.au/home/latest-news?f.News+category%7CnewsCategory=Public+Notice"

+ 2 - 5
scrapers/devonportcity.rb

@@ -5,12 +5,9 @@ require "fileutils"
 require "net/http"
 require "uri"
 
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_devonportcity
 URL   = "https://www.devonport.tas.gov.au/building-development/planning/advertised-planning-permit-applications/"
 

+ 2 - 5
scrapers/dorset.rb

@@ -3,12 +3,9 @@ require "date"
 require "nokogiri"
 require "uri"
 require "fileutils"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")
 BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
 BASE_HTTP  = "http://eservices.dorset.tas.gov.au"

+ 12 - 28
scrapers/flinders_council.rb

@@ -2,23 +2,14 @@
 
 require "nokogiri"
 require "cgi"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets this from filename: da_flinders
 URL   = "https://www.flinders.tas.gov.au/current-advertising"
 
 DB.ensure_table!(TABLE)
 
-# Optional column to keep the PDF link
-begin
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url VARCHAR(1024) NULL")
-rescue StandardError => e
-  Log.warn "scraper", "document_url add skipped: #{e.class} #{e.message}"
-end
-
 def abs_url(base, href)
   return "" if href.to_s.strip.empty?
   URI.join(base, href).to_s rescue href.to_s
@@ -70,28 +61,21 @@ links.each do |a|
   next if address.empty? || ref.nil?
 
   DB.upsert(TABLE, {
-    description: description,
-    date_received: date_received,
+    description:       description,
+    date_received:     date_received,
     date_received_raw: date_received_raw,
-    address: address,
+    address:           address,
     council_reference: ref,
-    applicant: "",
-    owner: ""
+    document_url:      pdf,
+    applicant:         "",
+    owner:             ""
   })
-  
+
   enrich_after_upsert!(
-    table: TABLE,
-    council_reference: council_reference,
-    address: address
+    table:             TABLE,
+    council_reference: ref,
+    address:           address
   )
-  
-
-  begin
-    upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?")
-    upd.execute(pdf, ref, address)
-  rescue Mysql2::Error => e
-    Log.warn "scraper", "[flinders] db update skipped for #{ref}: #{e.message}"
-  end
 
   puts "Upserted #{ref} -> #{address}"
   saved += 1

+ 2 - 4
scrapers/glenorchy.rb

@@ -2,12 +2,10 @@
 
 require "nokogiri"
 require "date"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/enrich"
 
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_glenorchy
 URL   = "https://www.gcc.tas.gov.au/services/planning-and-building/planning-and-development/planning-applications/"
 

+ 14 - 29
scrapers/huonvalley.rb

@@ -3,23 +3,14 @@
 
 require "nokogiri"
 require "cgi"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_huonvalley
 START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"
 
 DB.ensure_table!(TABLE)
 
-# Optional: keep the SharePoint link
-begin
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL")
-rescue StandardError => e
-  Log.warn "scraper", "document_url add skipped: #{e.class} #{e.message}"
-end
-
 REF_RX = %r{\bDA[-\s]?\d{1,4}/20\d{2}\b}i
 
 def abs_url(base, href)
@@ -115,27 +106,21 @@ loop do
     seen_refs[[r[:council_reference], r[:address]]] = true
 
     DB.upsert(TABLE, {
-      description: r[:description],
-      date_received: r[:date_received],
+      description:       r[:description],
+      date_received:     r[:date_received],
       date_received_raw: r[:date_received_raw],
-      address: r[:address],
+      address:           r[:address],
       council_reference: r[:council_reference],
-      applicant: "",
-      owner: ""
+      document_url:      r[:document_url],
+      applicant:         "",
+      owner:             ""
     })
-	
-	enrich_after_upsert!(
-    table: TABLE,
-    council_reference: council_reference,
-    address: address
-  )
-
-    begin
-      upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?")
-      upd.execute(r[:document_url], r[:council_reference], r[:address])
-    rescue Mysql2::Error => e
-      Log.warn "scraper", "[huonvalley] db update skipped for #{r[:council_reference]}: #{e.message}"
-    end
+
+    enrich_after_upsert!(
+      table:             TABLE,
+      council_reference: r[:council_reference],
+      address:           r[:address]
+    )
 
     puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
     saved += 1

+ 14 - 20
scrapers/kentish.rb

@@ -3,11 +3,9 @@
 require "nokogiri"
 require "uri"
 require "cgi"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")              # run_all.sh -> da_kentish
 # Set this to the exact page you use for Kentish (from your original file)
 URL   = "https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp"
@@ -142,28 +140,24 @@ puts "Found #{items.length} item(s) for #{TABLE}"
 
 items.each do |r|
   DB.upsert(TABLE, {
-    description: r[:description],
-    date_received: r[:date_received],
+    description:       r[:description],
+    date_received:     r[:date_received],
     date_received_raw: r[:date_received_raw],
-    address: r[:address],
+    on_notice_to:      r[:date_received],
+    on_notice_to_raw:  r[:date_received_raw],
+    address:           r[:address],
     council_reference: r[:council_reference],
-    applicant: "",
-    owner: ""
+    document_url:      r[:document_url],
+    applicant:         "",
+    owner:             ""
   })
-  
+
   enrich_after_upsert!(
-    table: TABLE,
-    council_reference: council_reference,
-    address: address
+    table:             TABLE,
+    council_reference: r[:council_reference],
+    address:           r[:address]
   )
 
-  begin
-    upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ? WHERE council_reference = ? AND address = ?")
-    upd.execute(r[:document_url], r[:date_received], r[:date_received_raw], r[:council_reference], r[:address])
-  rescue StandardError => e
-    Log.warn "scraper", "Extras update skipped for #{r[:council_reference]}: #{e.class} #{e.message}"
-  end
-
   puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
 end
 

+ 2 - 4
scrapers/launcestoncity.rb

@@ -4,11 +4,9 @@ require "uri"
 require "fileutils"
 require "json"
 require "cgi"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE        = ENV.fetch("TABLE_NAME")
 BASE_URL     = "https://onlineservice.launceston.tas.gov.au"
 URL          = ENV.fetch(

+ 2 - 5
scrapers/meandervalley.rb

@@ -5,13 +5,10 @@ require "nokogiri"
 require "uri"
 require "cgi"
 require "date"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
 require "json"
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 DEBUG  = ENV["DEBUG"] == "1"
 DRY_RUN = ENV["DRY_RUN"] == "1"
 

+ 2 - 3
scrapers/northernmidlands.rb

@@ -3,10 +3,9 @@
 require "nokogiri"
 require "uri"
 require "cgi"
-require_relative "../lib/http"
-require_relative "../lib/util"
-require_relative "../lib/scraper_helpers"
 
+require_relative "../lib/scraper_helpers"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")                    # run_all.sh -> da_northernmidlands
 URL   = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
 

+ 78 - 154
scrapers/southernmidlands.rb

@@ -1,4 +1,6 @@
 # Southern Midlands Council — Advertised Development Applications
+# Detail pages use paragraph format: "Location: <addr>\nProposal: DA<ref> - <desc>"
+# One item page may contain multiple DA entries.
 
 require "nokogiri"
 require "uri"
@@ -7,86 +9,28 @@ require_relative "../lib/http"
 require_relative "../lib/db"
 require_relative "../lib/util"
 require_relative "../lib/enrich"
+require_relative "../lib/log"
 
-TABLE = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_southernmidlands
+TABLE    = ENV.fetch("TABLE_NAME")  # da_southernmidlands
 LIST_URL = "https://www.southernmidlands.tas.gov.au/advertised-development-applications/"
 
 DB.ensure_table!(TABLE)
 
-# Optional extras used on this site
-begin
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL")
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to DATE NULL")
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to_raw VARCHAR(80) NULL")
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS title_reference TEXT NULL")
-rescue StandardError => e
-  Log.warn "scraper", "Optional column add skipped: #{e.class} #{e.message}"
-end
-
 def abs_url(base, href)
-  return "" if href.to_s.strip.empty?
-  URI.join(base, href).to_s rescue href.to_s
-end
-
-# Reference forms like "DA 2025/00123", "DA2025/00123"
-REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i
-REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i
-
-def extract_ref(text)
-  s = text.to_s
-  if (m = s.match(REF_RX1))
-    return "DA #{m[1]} / #{m[2]}"
-  end
-  if (m = s.match(REF_RX2))
-    return "DA #{m[1]} / #{m[2]}"
-  end
-  nil
-end
-
-def extract_date_like(str)
-  s = str.to_s
-  return $1 if s =~ /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/
-  return $1 if s =~ /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/
-  return $1 if s =~ /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
-  ""
-end
-
-def extract_on_notice_raw(text)
-  s = text.to_s.gsub(/\s+/, " ")
-  if s =~ /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
-    t = $2
-    d = extract_date_like(t)
-    return d unless d.empty?
-  end
-  if s =~ /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
-    t = $2
-    d = extract_date_like(t)
-    return d unless d.empty?
-  end
-  extract_date_like(s)
-end
-
-def first_nonempty_text_after(node, max_hops: 12)
-  sib = node
-  max_hops.times do
-    sib = sib.next_element
-    break if sib.nil?
-    t = sib.text.to_s.strip.gsub(/\s+/, " ")
-    return t unless t.empty?
-  end
-  ""
+    return "" if href.to_s.strip.empty?
+    URI.join(base, href).to_s
+rescue URI::InvalidURIError
+    href.to_s
 end
 
-# Get all application detail links from the list page
+# ---- fetch list page and collect item links ----
 list_html = Http.get(LIST_URL)
 list_doc  = Nokogiri::HTML(list_html)
 
-# Southern Midlands lists items as articles or grouped blocks. Collect obvious links.
-detail_links = list_doc.css("article .content h2 a, article h2 a, .entry-content a").map { |a|
-  href = a["href"].to_s
-  next if href.strip.empty?
-  next if href.start_with?("#")
-  abs_url(LIST_URL, href)
+detail_links = list_doc.css("article a[href*='?item='], article h2 a, article h3 a").map { |a|
+    href = a["href"].to_s.strip
+    next if href.empty? || href.start_with?("#")
+    abs_url(LIST_URL, href)
 }.compact.uniq
 
 puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
@@ -94,91 +38,71 @@ puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
 saved = 0
 
 detail_links.each do |url|
-  begin
-    html = Http.get(url)
-  rescue StandardError => e
-    Log.warn "scraper", "Skip #{url}: #{e.class} #{e.message}"
-    next
-  end
-
-  doc = Nokogiri::HTML(html)
-
-  # Title often contains address or reference
-  title_reference = doc.at_css("h1, .entry-title")&.text&.strip.to_s
-
-  # Try to find a details table or labeled rows
-  kv = {}
-  doc.css("table tr").each do |tr|
-    cells = tr.css("th, td")
-    next unless cells.length >= 2
-    key = cells[0].text.strip
-    val = cells[1].text.strip
-    kv[key] = val unless key.empty?
-  end
-
-  find = ->(rx) {
-    pair = kv.find { |k, _| k =~ rx }
-    pair ? pair[1] : ""
-  }
-
-  # Fields by label when present
-  council_reference = find.call(/(Application\s*(No|Number|ID)|Reference)/i)
-  address           = find.call(/(Address|Location|Property)/i)
-  description       = find.call(/(Proposal|Description)/i)
-  on_notice_raw     = find.call(/(On\s*Notice\s*(until|to)|Closing\s*Date|Closes)/i)
-
-  # Fallbacks from free text around the title
-  if council_reference.to_s.strip.empty?
-    council_reference = extract_ref(title_reference) || extract_ref(doc.text)
-  end
-  address = title_reference if address.to_s.strip.empty?
-  if description.to_s.strip.empty?
-    # Take the first non-empty paragraph after the title
-    h = doc.at_css("h1, .entry-title")
-    description = if h then first_nonempty_text_after(h) else "" end
-    description = "Development Application" if description.empty?
-  end
-  if on_notice_raw.to_s.strip.empty?
-    on_notice_raw = extract_on_notice_raw(doc.text)
-  end
-
-  on_notice = Util.parse_aus_date(on_notice_raw)
-
-  # Grab a PDF link if present
-  pdf = doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
-  document_url = pdf ? abs_url(url, pdf) : ""
-
-  # Minimal required fields
-  council_reference = council_reference.to_s.strip
-  address           = address.to_s.strip
-  next if council_reference.empty? || address.empty?
-
-  # Store on_notice in the DATE column for consistency with your other site scrapers
-  DB.upsert(TABLE, {
-    description: description,
-    date_received: on_notice,
-    date_received_raw: on_notice_raw.to_s,
-    address: address,
-    council_reference: council_reference,
-    applicant: "",
-    owner: ""
-  })
-  
-  enrich_after_upsert!(
-    table: TABLE,
-    council_reference: council_reference,
-    address: address
-  )
-
-  begin
-    upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? WHERE council_reference = ? AND address = ?")
-    upd.execute(document_url, on_notice, on_notice_raw.to_s, title_reference, council_reference, address)
-  rescue StandardError => e
-    Log.warn "scraper", "Extras update skipped for #{council_reference}: #{e.class} #{e.message}"
-  end
-
-  puts "Upserted #{council_reference} -> #{address}"
-  saved += 1
+    html = begin
+        Http.get(url)
+    rescue StandardError => e
+        Log.warn "southernmidlands", "Skip #{url}: #{e.class} #{e.message}"
+        next
+    end
+
+    doc = Nokogiri::HTML(html)
+
+    # Each DA entry is a <p> block containing "Location:" text.
+    # One page may have multiple such paragraphs.
+    doc.css("p").each do |para|
+        # Preserve line breaks from <br> tags before stripping HTML
+        inner = para.inner_html.gsub(/<br\s*\/?>/, "\n")
+        text  = Nokogiri::HTML.fragment(inner).text.gsub(/\r/, "").strip
+        next unless text.match?(/Location:/i)
+
+        lines = text.split("\n").map(&:strip).reject(&:empty?)
+
+        loc_line  = lines.find { |l| l.match?(/\ALocation:/i) }
+        prop_line = lines.find { |l| l.match?(/\AProposal:/i) }
+
+        address  = loc_line&.sub(/\ALocation:\s*/i, "")&.strip.to_s
+        proposal = prop_line&.sub(/\AProposal:\s*/i, "")&.strip.to_s
+
+        next if address.empty? || proposal.empty?
+
+        # Extract DA reference from proposal line (e.g. "DA2600035 - Dwelling")
+        ref_match = proposal.match(/\b(DA\s*[\d\/]+)\b/i)
+        council_reference = ref_match ? ref_match[1].gsub(/\s+/, "") : nil
+        description = proposal.sub(/\A(DA\s*[\d\/]+)\s*[-:]\s*/i, "").strip
+
+        if council_reference.nil? || council_reference.empty?
+            Log.warn "southernmidlands", "No DA ref on #{url} — skipping paragraph"
+            next
+        end
+
+        # PDF link — check this paragraph then its next sibling
+        pdf_href = para.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
+        unless pdf_href
+            sib = para.next_element
+            pdf_href = sib&.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
+        end
+        document_url = pdf_href ? abs_url(url, pdf_href) : nil
+
+        begin
+            DB.upsert(TABLE, {
+                description:       description,
+                address:           address[0, 255],
+                council_reference: council_reference[0, 100],
+                document_url:      document_url
+            })
+
+            enrich_after_upsert!(
+                table:             TABLE,
+                council_reference: council_reference,
+                address:           address
+            )
+
+            Log.info "southernmidlands", "Upserted #{council_reference} -> #{address}"
+            saved += 1
+        rescue StandardError => e
+            Log.warn "southernmidlands", "DB error for #{council_reference}: #{e.class} #{e.message}"
+        end
+    end
 end
 
 puts "Done #{TABLE}. Saved #{saved} item(s)."

+ 2 - 14
scrapers/waratah_wynyard.rb

@@ -3,26 +3,14 @@
 require "nokogiri"
 require "uri"
 require "cgi"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE = ENV.fetch("TABLE_NAME")  # da_waratah_wynyard
 URL   = "https://www.warwyn.tas.gov.au/planning-and-development/advertised-permits/"
 
 DB.ensure_table!(TABLE)
 
-# Optional extras
-begin
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL")
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to DATE NULL")
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to_raw VARCHAR(80) NULL")
-  DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS title_reference TEXT NULL")
-rescue StandardError => e
-  Log.warn "scraper", "Optional column add skipped: #{e.class} #{e.message}"
-end
-
 def abs_url(base, href)
   return "" if href.to_s.strip.empty?
   URI.join(base, href).to_s

+ 15 - 23
scrapers/westcoast.rb

@@ -3,12 +3,9 @@
 require "date"
 require "nokogiri"
 require "cgi"
-require_relative "../lib/http"
-require_relative "../lib/db"
-require_relative "../lib/util"
-require_relative "../lib/geocode"
-require_relative "../lib/enrich"
 
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 TABLE    = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_westcoast
 URL = "https://www.westcoast.tas.gov.au/planning-and-development/planning/advertised-development-applications/"
 
@@ -154,30 +151,25 @@ detail_links.each do |u|
   next unless item
 
   DB.upsert(TABLE, {
-    description: item[:description],
-    date_received: date_received,
-	date_received_raw: date_received,
-	on_notice: item[:date_received],       # store close date here to be consistent
-    on_notice_raw: item[:date_received_raw],
-    address: item[:address],
+    description:       item[:description],
+    date_received:     date_received,
+    date_received_raw: date_received.to_s,
+    on_notice_to:      item[:on_notice],
+    on_notice_to_raw:  item[:on_notice_raw],
+    address:           item[:address],
     council_reference: item[:council_reference],
-    applicant: "",
-    owner: ""
+    document_url:      item[:document_url],
+    title_reference:   item[:title_reference],
+    applicant:         "",
+    owner:             ""
   })
-  
+
   enrich_after_upsert!(
-    table: TABLE,
+    table:             TABLE,
     council_reference: item[:council_reference],
-    address: address
+    address:           item[:address]
   )
 
-  begin
-    upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? WHERE council_reference = ? AND address = ?")
-    upd.execute(item[:document_url], item[:date_received], item[:date_received_raw], item[:title_reference], item[:council_reference], item[:address])
-  rescue StandardError => e
-    Log.warn "scraper", "Extras update skipped for #{item[:council_reference]}: #{e.class} #{e.message}"
-  end
-
   puts "Upserted #{item[:council_reference]} -> #{item[:address]}"
   saved += 1
 end

+ 2 - 3
scrapers/westtamar.rb

@@ -1,10 +1,9 @@
 # West Tamar Council — Advertised Planning Applications
 
 require "nokogiri"
-require_relative "../lib/http"
-require_relative "../lib/util"
-require_relative "../lib/scraper_helpers"
 
+require_relative "../lib/scraper_helpers"
+require_relative "../lib/log"
 TABLE    = ENV.fetch("TABLE_NAME")   # run_all.sh -> da_westtamar
 URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"