# Southern Midlands Council — Advertised Development Applications
#
# Fetches the council's advertised-DA list page, follows every item link and
# upserts one row per DA entry found on the detail pages.
#
# Detail pages use paragraph format:
#   "Location: <address>\nProposal: DA<number> - <description>"
# One item page may contain multiple DA entries.

require "nokogiri"
require "uri"
require "cgi"
require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/enrich"
require_relative "../lib/log"

TABLE = ENV.fetch("TABLE_NAME") # da_southernmidlands
LIST_URL = "https://www.southernmidlands.tas.gov.au/advertised-development-applications/"

DB.ensure_table!(TABLE)

# Resolves +href+ against +base+ and returns the absolute URL as a String.
#
# Returns "" when +href+ is nil/blank. If URI cannot parse the input
# (URI::InvalidURIError) the raw +href+ is returned unchanged so the caller
# still has something to log or store.
def abs_url(base, href)
  return "" if href.to_s.strip.empty?

  URI.join(base, href).to_s
rescue URI::InvalidURIError
  href.to_s
end

# ---- fetch list page and collect item links ----
list_html = Http.get(LIST_URL)
list_doc  = Nokogiri::HTML(list_html)

detail_links = list_doc.css("article a[href*='?item='], article h2 a, article h3 a").map { |a|
  href = a["href"].to_s.strip
  # Ignore anchors without a target and in-page fragment links.
  next if href.empty? || href.start_with?("#")

  abs_url(LIST_URL, href)
}.compact.uniq

puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"

# Matches links whose href ends in ".pdf" or carries a query string after it.
pdf_selector = "a[href$='.pdf'], a[href*='.pdf?']"
saved = 0

detail_links.each do |url|
  html = begin
    Http.get(url)
  rescue StandardError => e
    # A single unreachable detail page must not abort the whole run.
    Log.warn "southernmidlands", "Skip #{url}: #{e.class} #{e.message}"
    next
  end

  doc = Nokogiri::HTML(html)

  # Each DA entry is a <p> block containing "Location:" text.
  # One page may have multiple such paragraphs.
  doc.css("p").each do |para|
    # Preserve line breaks from <br> tags before stripping HTML. The pattern
    # must match the tag itself: an empty pattern would match between every
    # character and shred the text into one-character lines.
    inner = para.inner_html.gsub(/<br\b[^>]*>/i, "\n")
    text  = Nokogiri::HTML.fragment(inner).text.delete("\r").strip
    next unless text.match?(/Location:/i)

    lines     = text.split("\n").map(&:strip).reject(&:empty?)
    loc_line  = lines.find { |l| l.match?(/\ALocation:/i) }
    prop_line = lines.find { |l| l.match?(/\AProposal:/i) }

    address  = loc_line&.sub(/\ALocation:\s*/i, "")&.strip.to_s
    proposal = prop_line&.sub(/\AProposal:\s*/i, "")&.strip.to_s
    next if address.empty? || proposal.empty?

    # Extract DA reference from proposal line (e.g. "DA2600035 - Dwelling")
    ref_match         = proposal.match(/\b(DA\s*[\d\/]+)\b/i)
    council_reference = ref_match ? ref_match[1].gsub(/\s+/, "") : nil
    # The description is whatever follows the leading "DA<number> -" prefix.
    description       = proposal.sub(/\A(DA\s*[\d\/]+)\s*[-:]\s*/i, "").strip

    if council_reference.nil? || council_reference.empty?
      Log.warn "southernmidlands", "No DA ref on #{url} — skipping paragraph"
      next
    end

    # PDF link — check this paragraph then its next sibling
    pdf_href   = para.at_css(pdf_selector)&.[]("href")
    pdf_href ||= para.next_element&.at_css(pdf_selector)&.[]("href")
    document_url = pdf_href ? abs_url(url, pdf_href) : nil

    begin
      DB.upsert(TABLE, {
        description:       description,
        address:           address[0, 255],
        council_reference: council_reference[0, 100],
        document_url:      document_url
      })

      enrich_after_upsert!(
        table:             TABLE,
        council_reference: council_reference,
        address:           address
      )

      Log.info "southernmidlands", "Upserted #{council_reference} -> #{address}"
      saved += 1
    rescue StandardError => e
      # Log and continue so one bad row does not stop the remaining entries.
      Log.warn "southernmidlands", "DB error for #{council_reference}: #{e.class} #{e.message}"
    end
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."