# Southern Midlands Council — Advertised Development Applications require "nokogiri" require "uri" require "cgi" require_relative "../lib/http" require_relative "../lib/db" require_relative "../lib/util" require_relative "../lib/enrich" TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_southernmidlands LIST_URL = "https://www.southernmidlands.tas.gov.au/advertised-development-applications/" DB.ensure_table!(TABLE) # Optional extras used on this site begin DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL") DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to DATE NULL") DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to_raw VARCHAR(80) NULL") DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS title_reference TEXT NULL") rescue => e warn "Optional column add skipped: #{e.class} #{e.message}" end def abs_url(base, href) return "" if href.to_s.strip.empty? URI.join(base, href).to_s rescue href.to_s end # Reference forms like "DA 2025/00123", "DA2025/00123" REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i def extract_ref(text) s = text.to_s if (m = s.match(REF_RX1)) return "DA #{m[1]} / #{m[2]}" end if (m = s.match(REF_RX2)) return "DA #{m[1]} / #{m[2]}" end nil end def extract_date_like(str) s = str.to_s return $1 if s =~ /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/ return $1 if s =~ /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/ return $1 if s =~ /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/ "" end def extract_on_notice_raw(text) s = text.to_s.gsub(/\s+/, " ") if s =~ /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i t = $2 d = extract_date_like(t) return d unless d.empty? end if s =~ /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i t = $2 d = extract_date_like(t) return d unless d.empty? end extract_date_like(s) end def first_nonempty_text_after(node, max_hops: 12) sib = node max_hops.times do sib = sib.next_element break if sib.nil? t = sib.text.to_s.strip.gsub(/\s+/, " ") return t unless t.empty? end "" end # Get all application detail links from the list page list_html = Http.get(LIST_URL) list_doc = Nokogiri::HTML(list_html) # Southern Midlands lists items as articles or grouped blocks. Collect obvious links. detail_links = list_doc.css("article .content h2 a, article h2 a, .entry-content a").map { |a| href = a["href"].to_s next if href.strip.empty? next if href.start_with?("#") abs_url(LIST_URL, href) }.compact.uniq puts "Found #{detail_links.size} candidate link(s) for #{TABLE}" saved = 0 detail_links.each do |url| begin html = Http.get(url) rescue => e warn "Skip #{url}: #{e.class} #{e.message}" next end doc = Nokogiri::HTML(html) # Title often contains address or reference title_reference = doc.at_css("h1, .entry-title")&.text&.strip.to_s # Try to find a details table or labeled rows kv = {} doc.css("table tr").each do |tr| cells = tr.css("th, td") next unless cells.length >= 2 key = cells[0].text.strip val = cells[1].text.strip kv[key] = val unless key.empty? end find = ->(rx) { pair = kv.find { |k, _| k =~ rx } pair ? pair[1] : "" } # Fields by label when present council_reference = find.call(/(Application\s*(No|Number|ID)|Reference)/i) address = find.call(/(Address|Location|Property)/i) description = find.call(/(Proposal|Description)/i) on_notice_raw = find.call(/(On\s*Notice\s*(until|to)|Closing\s*Date|Closes)/i) # Fallbacks from free text around the title if council_reference.to_s.strip.empty? council_reference = extract_ref(title_reference) || extract_ref(doc.text) end address = title_reference if address.to_s.strip.empty? if description.to_s.strip.empty? # Take the first non-empty paragraph after the title h = doc.at_css("h1, .entry-title") description = if h then first_nonempty_text_after(h) else "" end description = "Development Application" if description.empty? end if on_notice_raw.to_s.strip.empty? on_notice_raw = extract_on_notice_raw(doc.text) end on_notice = Util.parse_aus_date(on_notice_raw) # Grab a PDF link if present pdf = doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href") document_url = pdf ? abs_url(url, pdf) : "" # Minimal required fields council_reference = council_reference.to_s.strip address = address.to_s.strip next if council_reference.empty? || address.empty? # Store on_notice in the DATE column for consistency with your other site scrapers DB.upsert(TABLE, { description: description, date_received: on_notice, date_received_raw: on_notice_raw.to_s, address: address, council_reference: council_reference, applicant: "", owner: "" }) enrich_after_upsert!( table: TABLE, council_reference: council_reference, address: address ) begin upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? WHERE council_reference = ? AND address = ?") upd.execute(document_url, on_notice, on_notice_raw.to_s, title_reference, council_reference, address) rescue => e warn "Extras update skipped for #{council_reference}: #{e.class} #{e.message}" end puts "Upserted #{council_reference} -> #{address}" saved += 1 end puts "Done #{TABLE}. Saved #{saved} item(s)."