# benjamin.harris / tas_councils
#
# Southern Midlands Council — Advertised Development Applications
# Detail pages use paragraph format: "Location: <addr>\nProposal: DA<ref> - <desc>"
# One item page may contain multiple DA entries.

require "nokogiri"
require "uri"
require "cgi"
require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/enrich"
require_relative "../lib/log"

# Destination table name, taken from the TABLE_NAME environment variable.
# ENV.fetch raises KeyError when the variable is unset, so a misconfigured
# run stops here instead of continuing with a nil table name.
TABLE    = ENV.fetch("TABLE_NAME")  # da_southernmidlands

# Listing page whose links lead to the individual advertised-DA item pages.
# Frozen so the constant cannot be mutated in place by later code.
LIST_URL = "https://www.southernmidlands.tas.gov.au/advertised-development-applications/".freeze

# Runs before any scraping starts.
# NOTE(review): presumably creates the table when it is missing — confirm in lib/db.
DB.ensure_table!(TABLE)

# Resolves +href+ against +base+ and returns the absolute URL as a String.
#
# @param base [String] absolute URL of the page the link was found on
# @param href [String, nil] raw href attribute value (may be relative or padded)
# @return [String] "" when +href+ is nil/blank; the joined absolute URL when it
#   can be resolved; otherwise the whitespace-stripped +href+ unchanged
def abs_url(base, href)
    cleaned = href.to_s.strip
    return "" if cleaned.empty?

    # URI.join rejects literal spaces, which are common in document file
    # names ("My Doc.pdf"), so percent-encode them before resolving.
    URI.join(base, cleaned.gsub(" ", "%20")).to_s
rescue URI::Error
    # URI::Error is the parent of both InvalidURIError (unparseable input) and
    # BadURIError (e.g. a relative base); fall back to the raw link for either.
    cleaned
end

# ---- load the listing page and gather the links to item pages ----
list_html = Http.get(LIST_URL)
list_doc  = Nokogiri::HTML(list_html)

# Anchors treated as item links: "?item=" URLs plus heading links, all inside <article>.
link_selector = "article a[href*='?item='], article h2 a, article h3 a"
raw_hrefs     = list_doc.css(link_selector).map { |anchor| anchor["href"].to_s.strip }

# Discard blank and in-page ("#...") hrefs, resolve the rest against the
# listing URL, and keep each resulting URL once.
usable_hrefs = raw_hrefs.reject { |href| href.empty? || href.start_with?("#") }
detail_links = usable_hrefs.map { |href| abs_url(LIST_URL, href) }.uniq

puts "Found #{detail_links.size} candidate link(s) for #{TABLE}"

# Returns the text of a <p> node with each <br> converted to a newline, so
# that "Location:" and "Proposal:" end up on separate lines.
def da_paragraph_text(para)
    # Accept any <br> spelling (attributes, self-closing, any letter case);
    # a tag such as <br class="x"> would otherwise be dropped by the text
    # extraction below and fuse two fields onto one line.
    inner = para.inner_html.gsub(/<br\b[^>]*>/i, "\n")
    # U+00A0 (what &nbsp; decodes to) is matched by neither \s nor
    # String#strip, so fold it to a plain space before any parsing.
    Nokogiri::HTML.fragment(inner).text.delete("\r").tr("\u00A0", " ").strip
end

# Parses one paragraph's text into its DA fields.
#
# @param text [String] newline-separated paragraph text
# @return [Hash, nil] nil when the Location or Proposal value is missing;
#   otherwise a Hash with :address, :description and :council_reference
#   (nil when the proposal contains no "DA<digits>" reference)
def parse_da_entry(text)
    lines = text.split("\n").map(&:strip).reject(&:empty?)

    loc_line  = lines.find { |l| l.match?(/\ALocation:/i) }
    prop_line = lines.find { |l| l.match?(/\AProposal:/i) }

    address  = loc_line.to_s.sub(/\ALocation:\s*/i, "").strip
    proposal = prop_line.to_s.sub(/\AProposal:\s*/i, "").strip
    return nil if address.empty? || proposal.empty?

    # Reference looks like "DA2600035" or "DA 2026/12"; inner spaces are removed.
    ref_match = proposal.match(/\b(DA\s*[\d\/]+)\b/i)
    {
        address:           address,
        council_reference: ref_match && ref_match[1].gsub(/\s+/, ""),
        # Strip a leading "DA<ref> - " prefix; en/em dashes are accepted as
        # separators as well as "-" and ":".
        description:       proposal.sub(/\A(DA\s*[\d\/]+)\s*[-–—:]\s*/i, "").strip
    }
end

# Returns the href of the first PDF link found in +para+ or, failing that,
# in its next sibling element; nil when neither contains one.
def pdf_href_near(para)
    selector = "a[href$='.pdf'], a[href*='.pdf?']"
    [para, para.next_element].each do |node|
        href = node&.at_css(selector)&.[]("href")
        return href if href
    end
    nil
end

# ---- visit every item page and store each DA entry found on it ----
saved = 0

detail_links.each do |url|
    html = begin
        Http.get(url)
    rescue StandardError => e
        # Best effort: one unreachable item page must not stop the whole run.
        Log.warn "southernmidlands", "Skip #{url}: #{e.class} #{e.message}"
        next
    end

    doc = Nokogiri::HTML(html)

    # Each DA entry is a <p> block containing "Location:" text.
    # One page may have multiple such paragraphs.
    doc.css("p").each do |para|
        text = da_paragraph_text(para)
        next unless text.match?(/Location:/i)

        entry = parse_da_entry(text)
        next if entry.nil?

        council_reference = entry[:council_reference]
        if council_reference.nil?
            Log.warn "southernmidlands", "No DA ref on #{url} — skipping paragraph"
            next
        end

        address      = entry[:address]
        pdf_href     = pdf_href_near(para)
        document_url = pdf_href ? abs_url(url, pdf_href) : nil

        begin
            # Address and reference are cut to 255 / 100 characters before storing.
            DB.upsert(TABLE, {
                description:       entry[:description],
                address:           address[0, 255],
                council_reference: council_reference[0, 100],
                document_url:      document_url
            })

            # NOTE(review): enrichment receives the untruncated address;
            # confirm that is intended.
            enrich_after_upsert!(
                table:             TABLE,
                council_reference: council_reference,
                address:           address
            )

            Log.info "southernmidlands", "Upserted #{council_reference} -> #{address}"
            saved += 1
        rescue StandardError => e
            # A failure in either the upsert or the enrichment is logged and
            # the entry is not counted; processing continues with the next one.
            Log.warn "southernmidlands", "DB error for #{council_reference}: #{e.class} #{e.message}"
        end
    end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."