|
|
@@ -1,4 +1,6 @@
|
|
|
# Southern Midlands Council — Advertised Development Applications
|
|
|
+# Detail pages use paragraph format: "Location: <addr>\nProposal: DA<ref> - <desc>"
|
|
|
+# One item page may contain multiple DA entries.
|
|
|
|
|
|
require "nokogiri"
|
|
|
require "uri"
|
|
|
@@ -7,86 +9,28 @@ require_relative "../lib/http"
|
|
|
require_relative "../lib/db"
|
|
|
require_relative "../lib/util"
|
|
|
require_relative "../lib/enrich"
|
|
|
+require_relative "../lib/log"
|
|
|
|
|
|
-TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_southernmidlands
|
|
|
+TABLE = ENV.fetch("TABLE_NAME") # da_southernmidlands
|
|
|
LIST_URL = "https://www.southernmidlands.tas.gov.au/advertised-development-applications/"
|
|
|
|
|
|
DB.ensure_table!(TABLE)
|
|
|
|
|
|
-# Optional extras used on this site
|
|
|
-begin
|
|
|
- DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL")
|
|
|
- DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to DATE NULL")
|
|
|
- DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to_raw VARCHAR(80) NULL")
|
|
|
- DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS title_reference TEXT NULL")
|
|
|
-rescue StandardError => e
|
|
|
- Log.warn "scraper", "Optional column add skipped: #{e.class} #{e.message}"
|
|
|
-end
|
|
|
-
|
|
|
def abs_url(base, href)
|
|
|
- return "" if href.to_s.strip.empty?
|
|
|
- URI.join(base, href).to_s rescue href.to_s
|
|
|
-end
|
|
|
-
|
|
|
-# Reference forms like "DA 2025/00123", "DA2025/00123"
|
|
|
-REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i
|
|
|
-REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i
|
|
|
-
|
|
|
-def extract_ref(text)
|
|
|
- s = text.to_s
|
|
|
- if (m = s.match(REF_RX1))
|
|
|
- return "DA #{m[1]} / #{m[2]}"
|
|
|
- end
|
|
|
- if (m = s.match(REF_RX2))
|
|
|
- return "DA #{m[1]} / #{m[2]}"
|
|
|
- end
|
|
|
- nil
|
|
|
-end
|
|
|
-
|
|
|
-def extract_date_like(str)
|
|
|
- s = str.to_s
|
|
|
- return $1 if s =~ /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/
|
|
|
- return $1 if s =~ /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/
|
|
|
- return $1 if s =~ /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
|
|
|
- ""
|
|
|
-end
|
|
|
-
|
|
|
-def extract_on_notice_raw(text)
|
|
|
- s = text.to_s.gsub(/\s+/, " ")
|
|
|
- if s =~ /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
|
|
|
- t = $2
|
|
|
- d = extract_date_like(t)
|
|
|
- return d unless d.empty?
|
|
|
- end
|
|
|
- if s =~ /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
|
|
|
- t = $2
|
|
|
- d = extract_date_like(t)
|
|
|
- return d unless d.empty?
|
|
|
- end
|
|
|
- extract_date_like(s)
|
|
|
-end
|
|
|
-
|
|
|
-def first_nonempty_text_after(node, max_hops: 12)
|
|
|
- sib = node
|
|
|
- max_hops.times do
|
|
|
- sib = sib.next_element
|
|
|
- break if sib.nil?
|
|
|
- t = sib.text.to_s.strip.gsub(/\s+/, " ")
|
|
|
- return t unless t.empty?
|
|
|
- end
|
|
|
- ""
|
|
|
+ return "" if href.to_s.strip.empty?
|
|
|
+ URI.join(base, href).to_s
|
|
|
+rescue URI::InvalidURIError
|
|
|
+ href.to_s
|
|
|
end
|
|
|
|
|
|
-# Get all application detail links from the list page
|
|
|
+# ---- fetch list page and collect item links ----
|
|
|
list_html = Http.get(LIST_URL)
|
|
|
list_doc = Nokogiri::HTML(list_html)
|
|
|
|
|
|
-# Southern Midlands lists items as articles or grouped blocks. Collect obvious links.
|
|
|
-detail_links = list_doc.css("article .content h2 a, article h2 a, .entry-content a").map { |a|
|
|
|
- href = a["href"].to_s
|
|
|
- next if href.strip.empty?
|
|
|
- next if href.start_with?("#")
|
|
|
- abs_url(LIST_URL, href)
|
|
|
+detail_links = list_doc.css("article a[href*='?item='], article h2 a, article h3 a").map { |a|
|
|
|
+ href = a["href"].to_s.strip
|
|
|
+ next if href.empty? || href.start_with?("#")
|
|
|
+ abs_url(LIST_URL, href)
|
|
|
}.compact.uniq
|
|
|
|
|
|
puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
|
|
|
@@ -94,91 +38,71 @@ puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
|
|
|
saved = 0
|
|
|
|
|
|
detail_links.each do |url|
|
|
|
- begin
|
|
|
- html = Http.get(url)
|
|
|
- rescue StandardError => e
|
|
|
- Log.warn "scraper", "Skip #{url}: #{e.class} #{e.message}"
|
|
|
- next
|
|
|
- end
|
|
|
-
|
|
|
- doc = Nokogiri::HTML(html)
|
|
|
-
|
|
|
- # Title often contains address or reference
|
|
|
- title_reference = doc.at_css("h1, .entry-title")&.text&.strip.to_s
|
|
|
-
|
|
|
- # Try to find a details table or labeled rows
|
|
|
- kv = {}
|
|
|
- doc.css("table tr").each do |tr|
|
|
|
- cells = tr.css("th, td")
|
|
|
- next unless cells.length >= 2
|
|
|
- key = cells[0].text.strip
|
|
|
- val = cells[1].text.strip
|
|
|
- kv[key] = val unless key.empty?
|
|
|
- end
|
|
|
-
|
|
|
- find = ->(rx) {
|
|
|
- pair = kv.find { |k, _| k =~ rx }
|
|
|
- pair ? pair[1] : ""
|
|
|
- }
|
|
|
-
|
|
|
- # Fields by label when present
|
|
|
- council_reference = find.call(/(Application\s*(No|Number|ID)|Reference)/i)
|
|
|
- address = find.call(/(Address|Location|Property)/i)
|
|
|
- description = find.call(/(Proposal|Description)/i)
|
|
|
- on_notice_raw = find.call(/(On\s*Notice\s*(until|to)|Closing\s*Date|Closes)/i)
|
|
|
-
|
|
|
- # Fallbacks from free text around the title
|
|
|
- if council_reference.to_s.strip.empty?
|
|
|
- council_reference = extract_ref(title_reference) || extract_ref(doc.text)
|
|
|
- end
|
|
|
- address = title_reference if address.to_s.strip.empty?
|
|
|
- if description.to_s.strip.empty?
|
|
|
- # Take the first non-empty paragraph after the title
|
|
|
- h = doc.at_css("h1, .entry-title")
|
|
|
- description = if h then first_nonempty_text_after(h) else "" end
|
|
|
- description = "Development Application" if description.empty?
|
|
|
- end
|
|
|
- if on_notice_raw.to_s.strip.empty?
|
|
|
- on_notice_raw = extract_on_notice_raw(doc.text)
|
|
|
- end
|
|
|
-
|
|
|
- on_notice = Util.parse_aus_date(on_notice_raw)
|
|
|
-
|
|
|
- # Grab a PDF link if present
|
|
|
- pdf = doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
|
|
|
- document_url = pdf ? abs_url(url, pdf) : ""
|
|
|
-
|
|
|
- # Minimal required fields
|
|
|
- council_reference = council_reference.to_s.strip
|
|
|
- address = address.to_s.strip
|
|
|
- next if council_reference.empty? || address.empty?
|
|
|
-
|
|
|
- # Store on_notice in the DATE column for consistency with your other site scrapers
|
|
|
- DB.upsert(TABLE, {
|
|
|
- description: description,
|
|
|
- date_received: on_notice,
|
|
|
- date_received_raw: on_notice_raw.to_s,
|
|
|
- address: address,
|
|
|
- council_reference: council_reference,
|
|
|
- applicant: "",
|
|
|
- owner: ""
|
|
|
- })
|
|
|
-
|
|
|
- enrich_after_upsert!(
|
|
|
- table: TABLE,
|
|
|
- council_reference: council_reference,
|
|
|
- address: address
|
|
|
- )
|
|
|
-
|
|
|
- begin
|
|
|
- upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? WHERE council_reference = ? AND address = ?")
|
|
|
- upd.execute(document_url, on_notice, on_notice_raw.to_s, title_reference, council_reference, address)
|
|
|
- rescue StandardError => e
|
|
|
- Log.warn "scraper", "Extras update skipped for #{council_reference}: #{e.class} #{e.message}"
|
|
|
- end
|
|
|
-
|
|
|
- puts "Upserted #{council_reference} -> #{address}"
|
|
|
- saved += 1
|
|
|
+ html = begin
|
|
|
+ Http.get(url)
|
|
|
+ rescue StandardError => e
|
|
|
+ Log.warn "southernmidlands", "Skip #{url}: #{e.class} #{e.message}"
|
|
|
+ next
|
|
|
+ end
|
|
|
+
|
|
|
+ doc = Nokogiri::HTML(html)
|
|
|
+
|
|
|
+ # Each DA entry is a <p> block containing "Location:" text.
|
|
|
+ # One page may have multiple such paragraphs.
|
|
|
+ doc.css("p").each do |para|
|
|
|
+ # Preserve line breaks from <br> tags before stripping HTML
|
|
|
+ inner = para.inner_html.gsub(/<br\s*\/?>/, "\n")
|
|
|
+ text = Nokogiri::HTML.fragment(inner).text.gsub(/\r/, "").strip
|
|
|
+ next unless text.match?(/Location:/i)
|
|
|
+
|
|
|
+ lines = text.split("\n").map(&:strip).reject(&:empty?)
|
|
|
+
|
|
|
+ loc_line = lines.find { |l| l.match?(/\ALocation:/i) }
|
|
|
+ prop_line = lines.find { |l| l.match?(/\AProposal:/i) }
|
|
|
+
|
|
|
+ address = loc_line&.sub(/\ALocation:\s*/i, "")&.strip.to_s
|
|
|
+ proposal = prop_line&.sub(/\AProposal:\s*/i, "")&.strip.to_s
|
|
|
+
|
|
|
+ next if address.empty? || proposal.empty?
|
|
|
+
|
|
|
+ # Extract DA reference from proposal line (e.g. "DA2600035 - Dwelling")
|
|
|
+ ref_match = proposal.match(/\b(DA\s*[\d\/]+)\b/i)
|
|
|
+ council_reference = ref_match ? ref_match[1].gsub(/\s+/, "") : nil
|
|
|
+ description = proposal.sub(/\A(DA\s*[\d\/]+)\s*[-:]\s*/i, "").strip
|
|
|
+
|
|
|
+ if council_reference.nil? || council_reference.empty?
|
|
|
+ Log.warn "southernmidlands", "No DA ref on #{url} — skipping paragraph"
|
|
|
+ next
|
|
|
+ end
|
|
|
+
|
|
|
+ # PDF link — check this paragraph then its next sibling
|
|
|
+ pdf_href = para.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
|
|
|
+ unless pdf_href
|
|
|
+ sib = para.next_element
|
|
|
+ pdf_href = sib&.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
|
|
|
+ end
|
|
|
+ document_url = pdf_href ? abs_url(url, pdf_href) : nil
|
|
|
+
|
|
|
+ begin
|
|
|
+ DB.upsert(TABLE, {
|
|
|
+ description: description,
|
|
|
+ address: address[0, 255],
|
|
|
+ council_reference: council_reference[0, 100],
|
|
|
+ document_url: document_url
|
|
|
+ })
|
|
|
+
|
|
|
+ enrich_after_upsert!(
|
|
|
+ table: TABLE,
|
|
|
+ council_reference: council_reference,
|
|
|
+ address: address
|
|
|
+ )
|
|
|
+
|
|
|
+ Log.info "southernmidlands", "Upserted #{council_reference} -> #{address}"
|
|
|
+ saved += 1
|
|
|
+ rescue StandardError => e
|
|
|
+ Log.warn "southernmidlands", "DB error for #{council_reference}: #{e.class} #{e.message}"
|
|
|
+ end
|
|
|
+ end
|
|
|
end
|
|
|
|
|
|
puts "Done #{TABLE}. Saved #{saved} item(s)."
|