# Southern Midlands Council — Advertised Development Applications
# Each item page lists its applications as paragraphs of the form
#   "Location: <addr>\nProposal: DA<ref> - <desc>"
# and a single item page may contain several DA entries.
- require "nokogiri"
- require "uri"
- require "cgi"
- require_relative "../lib/http"
- require_relative "../lib/db"
- require_relative "../lib/util"
- require_relative "../lib/enrich"
- require_relative "../lib/log"
# Destination table name, taken from the environment. ENV.fetch raises
# KeyError when TABLE_NAME is unset, so a misconfigured run stops here.
TABLE = ENV.fetch("TABLE_NAME") # da_southernmidlands

# Listing page whose article links lead to the individual advertisement pages.
# Frozen so the shared constant cannot be mutated in place by accident.
LIST_URL = "https://www.southernmidlands.tas.gov.au/advertised-development-applications/".freeze

# NOTE(review): ensure_table! lives in ../lib/db — presumably it creates the
# table when it does not exist yet; confirm against that file.
DB.ensure_table!(TABLE)
# Resolves +href+ against +base+ and returns the result as a String.
#
# @param base [String] URL of the page the link was found on
# @param href [String, nil] raw value of the link's href attribute
# @return [String] "" when +href+ is nil or blank; the joined absolute URL
#   when resolution succeeds; otherwise the whitespace-stripped +href+ as is
def abs_url(base, href)
  # Surrounding whitespace is never part of a URL, but it makes URI parsing fail.
  link = href.to_s.strip
  return "" if link.empty?

  URI.join(base, link).to_s
rescue URI::Error
  # URI::Error is the parent of both InvalidURIError (unparseable href, e.g. an
  # embedded space) and BadURIError (raised by URI.join when +base+ is relative).
  # Either way fall back to the href itself instead of aborting the caller.
  link
end
# ---- fetch list page and collect item links ----
list_html = Http.get(LIST_URL)
list_doc = Nokogiri::HTML(list_html)

# Candidate anchors: links inside <article> whose href contains "?item=",
# plus any link sitting in an article's <h2> or <h3>.
link_nodes = list_doc.css("article a[href*='?item='], article h2 a, article h3 a")

# Work on trimmed href strings; blank values and in-page "#" anchors are dropped.
raw_hrefs = link_nodes.map { |node| node["href"].to_s.strip }
usable_hrefs = raw_hrefs.reject { |href| href.empty? || href.start_with?("#") }

# Resolve each href against the list URL and keep every target once.
detail_links = usable_hrefs.map { |href| abs_url(LIST_URL, href) }.uniq

puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"
# ---- parse each item page and store its DA entries ----

# Returns the text of +para+ with each <br> turned into a newline, carriage
# returns removed and non-breaking spaces replaced by ordinary spaces.
def da_paragraph_text(para)
  # Convert <br> (with or without attributes, any case) before the markup is
  # dropped, so the "Location:" and "Proposal:" rows stay on separate lines.
  inner = para.inner_html.gsub(/<br\b[^>]*>/i, "\n")
  plain = Nokogiri::HTML.fragment(inner).text
  # &nbsp; decodes to U+00A0, which neither String#strip nor \s treats as
  # whitespace; normalise it so the field values come out clean.
  plain.delete("\r").tr("\u00A0", " ").strip
end

# Returns the value following "<label>:" on the first line of +lines+ that
# starts with that label (case-insensitive), or "" when no line does.
def da_field(lines, label)
  prefix = /\A#{Regexp.escape(label)}:\s*/i
  line = lines.find { |candidate| candidate.match?(prefix) }
  line ? line.sub(prefix, "").strip : ""
end

# Splits a proposal such as "DA2600035 - Dwelling" into
# [council_reference, description]. council_reference is nil when the
# proposal contains no DA number; whitespace inside the number is removed.
def split_da_proposal(proposal)
  ref = proposal[%r{\bDA\s*[\d/]+\b}i]
  council_reference = ref && ref.gsub(/\s+/, "")
  # Accept "-", ":" and the en/em dashes (U+2013/U+2014) as the separator
  # between a leading reference and the description.
  description = proposal.sub(%r{\ADA\s*[\d/]+\s*[-\u2013\u2014:]\s*}i, "").strip
  [council_reference, description]
end

# Returns the href of the first PDF link inside +para+, or failing that inside
# the element that follows it; nil when neither contains one. The extension
# check is case-insensitive and allows a trailing query string or fragment.
def pdf_href_near(para)
  [para, para.next_element].compact.each do |node|
    node.css("a[href]").each do |anchor|
      href = anchor["href"].to_s.strip
      return href if href.match?(/\.pdf(?:\z|[?#])/i)
    end
  end
  nil
end

saved = 0

detail_links.each do |url|
  # A page that cannot be fetched is logged and skipped; the run continues.
  html = begin
    Http.get(url)
  rescue StandardError => e
    Log.warn "southernmidlands", "Skip #{url}: #{e.class} #{e.message}"
    next
  end

  # Each DA entry is a <p> block containing "Location:" text.
  # One page may have multiple such paragraphs.
  Nokogiri::HTML(html).css("p").each do |para|
    text = da_paragraph_text(para)
    next unless text.match?(/Location:/i)

    lines = text.split("\n").map(&:strip).reject(&:empty?)
    address = da_field(lines, "Location")
    proposal = da_field(lines, "Proposal")
    next if address.empty? || proposal.empty?

    council_reference, description = split_da_proposal(proposal)
    unless council_reference
      Log.warn "southernmidlands", "No DA ref on #{url} — skipping paragraph"
      next
    end

    pdf_href = pdf_href_near(para)
    document_url = pdf_href ? abs_url(url, pdf_href) : nil

    begin
      # Address and reference are cut to 255 / 100 characters for storage.
      DB.upsert(TABLE, {
        description: description,
        address: address[0, 255],
        council_reference: council_reference[0, 100],
        document_url: document_url
      })
      enrich_after_upsert!(
        table: TABLE,
        council_reference: council_reference,
        address: address
      )
      Log.info "southernmidlands", "Upserted #{council_reference} -> #{address}"
      saved += 1
    rescue StandardError => e
      # Covers failures in both the upsert and the enrichment step; the entry
      # is not counted as saved and processing moves on to the next paragraph.
      Log.warn "southernmidlands", "DB error for #{council_reference}: #{e.class} #{e.message}"
    end
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|