# Northern Midlands Council — Advertised / Planning Applications (site page) require "nokogiri" require "uri" require "cgi" require_relative "../lib/http" require_relative "../lib/db" require_relative "../lib/util" require_relative "../lib/enrich" TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_northernmidlands URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2" DB.ensure_table!(TABLE) # Optional extras begin DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL") DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to DATE NULL") DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS on_notice_to_raw VARCHAR(80) NULL") DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS title_reference TEXT NULL") rescue => e warn "Optional column add skipped: #{e.class} #{e.message}" end def abs_url(base, href) return "" if href.to_s.strip.empty? URI.join(base, href).to_s rescue href.to_s end # “DA 2025/00123”, “DA2025/00123”, “Application No. DA 2025/123” REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i def extract_ref(str) s = CGI.unescape(str.to_s) if (m = s.match(REF_RX1)) return "DA #{m[1]} / #{m[2]}" end if (m = s.match(REF_RX2)) return "DA #{m[1]} / #{m[2]}" end nil end DATE_RX = / (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b| \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b| \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b) /x def extract_on_notice_raw(text) s = text.to_s.gsub(/\s+/, " ") if (m = s.match(/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i)) if (d = m[2].match(DATE_RX)) return d[1] end end if (m = s.match(/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i)) if (d = m[2].match(DATE_RX)) return d[1] end end if (d = s.match(DATE_RX)) return d[1] end "" end def nearest_context_text(a) host = a.ancestors("li, p, div, tr, article").first || a.parent host ? host.text.to_s.strip.gsub(/\s+/, " ") : "" end def parse_items(doc, base_url) rows = [] # 1) Obvious list items or rows with PDFs or application keywords anchors = doc.css("a").select { |a| href = a["href"].to_s a.text.to_s.strip.match?(/application|permit|planning|advertis/i) || href.downcase.end_with?(".pdf") } anchors.each do |a| href = a["href"].to_s link_text = a.text.to_s.strip document_url = abs_url(base_url, href) ctx = nearest_context_text(a) # Title to keep, if present title_reference = link_text.empty? ? ctx[0,200] : link_text text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ") # Address: prefer the link text, else the surrounding text slice address = if link_text.length >= 6 link_text else ctx[0, 140] end # Reference from text or file name ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url)) # On-notice on_raw = extract_on_notice_raw(text_for_parse) on_dt = Util.parse_aus_date(on_raw) # Description description = if text_for_parse =~ /proposal\s*[:\-]\s*([^—\-]+)\b/i $1.strip else "Development Application" end next if ref.nil? || address.to_s.strip.empty? rows << { council_reference: ref, address: address.to_s.strip, description: description, date_received: on_dt, date_received_raw: on_raw, document_url: document_url, title_reference: title_reference } end # 2) If the page uses a two-column details table, pick that up too doc.css("table").each do |t| heads = t.css("th").map { |th| th.text.strip.downcase } next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) } t.css("tr").each do |tr| cells = tr.css("td") next unless cells.length >= 2 row_text = tr.text.to_s.strip.gsub(/\s+/, " ") ref = extract_ref(row_text) addr = row_text[/address[:\s]+(.+?)(?:\s{2,}|$)/i, 1] || row_text[0, 140] on_raw = extract_on_notice_raw(row_text) on_dt = Util.parse_aus_date(on_raw) next if ref.nil? || addr.to_s.strip.empty? rows << { council_reference: ref, address: addr.to_s.strip, description: "Development Application", date_received: on_dt, date_received_raw: on_raw, document_url: "", title_reference: row_text[0,200] } end end rows.uniq { |r| [r[:council_reference], r[:address]] } end if URL.empty? warn "NORTHERN_MIDLANDS_URL is not set. Example:\n ONLY=northernmidlands NORTHERN_MIDLANDS_URL='https://.../advertised-applications' docker compose run --rm scraper /app/run_all.sh" exit 0 end begin html = if URL.include?("/eservice/") # Some councils use ePathway, which needs a cookie-warmed session Http.dorset_session_get(URL) else Http.get(URL) end rescue => e warn "Failed to fetch #{URL}: #{e.class} #{e.message}" exit 1 end doc = Nokogiri::HTML(html) items = parse_items(doc, URL) puts "Found #{items.length} item(s) for #{TABLE}" items.each do |r| DB.upsert(TABLE, { description: r[:description], date_received: r[:date_received], date_received_raw: r[:date_received_raw], address: r[:address], council_reference: r[:council_reference], applicant: "", owner: "" }) enrich_after_upsert!( table: TABLE, council_reference: council_reference, address: address ) begin upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? WHERE council_reference = ? AND address = ?") upd.execute(r[:document_url], r[:date_received], r[:date_received_raw], r[:title_reference], r[:council_reference], r[:address]) rescue => e warn "Extras update skipped for #{r[:council_reference]}: #{e.class} #{e.message}" end puts "Upserted #{r[:council_reference]} -> #{r[:address]}" end puts "Done #{TABLE}."