# Kentish Council — Advertised / Planning Applications (site page, not PlanBuild)
#
# Fetches the council's planning-applications page, builds one record per
# relevant link (application reference, address text, on-notice date and
# document URL) and upserts each record into the table named by TABLE_NAME.

require "nokogiri"
require "uri"
require "cgi"
require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/enrich"

TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_kentish

# Page that lists the advertised applications for Kentish.
URL = "https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp"

DB.ensure_table!(TABLE)

# Resolves +href+ against +base+.
#
# @param base [String] URL of the page the link was found on
# @param href [String, nil] raw href attribute value
# @return [String] the absolute URL; "" when +href+ is blank; the raw +href+
#   when it cannot be joined to +base+ (e.g. it is not a valid URI)
def abs_url(base, href)
  return "" if href.to_s.strip.empty?

  URI.join(base, href).to_s
rescue StandardError
  # Best effort: keep the raw link rather than dropping the record.
  href.to_s
end

# Reference formats like:
#   DA 2025/00123
#   DA2025/00123
#   Application No. DA 2025/123
REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i # DA 2025/0123
REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i     # DA2025-0123 or DA2025/0123
REF_RX3 = %r{\bDA\s*([0-9]{1,4})\s*-\s*(20\d{2})\b}i      # DA 114-2025

# Finds a DA reference in +str+ and normalises it to "DA <year> / <number>".
#
# The input is URL-decoded first, so references embedded in hrefs are found too.
#
# @param str [#to_s] link text, context text or an href
# @return [String, nil] normalised reference, or nil when none is recognised
def extract_ref(str)
  s = CGI.unescape(str.to_s)

  if (m = s.match(REF_RX1) || s.match(REF_RX2))
    "DA #{m[1]} / #{m[2]}"
  elsif (m = s.match(REF_RX3))
    # This format is number-first ("DA 114-2025"), so swap the captures.
    "DA #{m[2]} / #{m[1]}"
  end
end

# Matches a date written as 1/2/2025, 1 February 2025 or February 1, 2025.
DATE_RX = /
  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
   \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
   \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
/x

# Phrases that introduce a closing date, tried in this order. In both patterns
# capture group 2 is the text following the phrase.
NOTICE_PHRASE_RXS = [
  /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i,
  /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
].freeze

# Extracts the raw (unparsed) on-notice / closing date from free text.
#
# Prefers a date that follows "on notice until/to" or "close/closing/closes";
# otherwise falls back to the first date found anywhere in the text.
#
# @param text [#to_s]
# @return [String] the date exactly as written, or "" when no date is present
def extract_on_notice_raw(text)
  s = text.to_s.gsub(/\s+/, " ")

  NOTICE_PHRASE_RXS.each do |rx|
    phrase = s.match(rx)
    next unless phrase

    date = phrase[2].match(DATE_RX)
    return date[1] if date
  end

  date = s.match(DATE_RX)
  date ? date[1] : ""
end

# Returns the node's text, trimmed and with whitespace runs collapsed to single
# spaces.
#
# @param node [#text, nil]
# @return [String] "" when +node+ is nil
def first_meaningful_text(node)
  return "" unless node

  node.text.to_s.strip.gsub(/\s+/, " ")
end

# Text of the closest li/p/div/tr ancestor of anchor +a+, falling back to its
# direct parent when there is no such ancestor.
#
# @param a [Nokogiri::XML::Node] an anchor element
# @return [String]
def nearest_context_text(a)
  host = a.ancestors("li, p, div, tr").first || a.parent
  first_meaningful_text(host)
end

# True when the anchor looks like an application item: its text mentions
# application/permit/advertis*, or it links to a PDF.
def application_anchor?(a)
  a.text.to_s.strip.match?(/application|permit|advertis/i) ||
    a["href"].to_s.downcase.end_with?(".pdf")
end

# Builds one record from an anchor, or nil when no reference or address text
# can be derived from it.
def row_from_anchor(a, base_url)
  link_text = a.text.to_s.strip
  ctx = nearest_context_text(a)
  text_for_parse = [link_text, ctx].uniq.join(" — ")

  ref = extract_ref(text_for_parse)
  # Very short link text ("PDF", "View") is not an address; use the context.
  addr = (link_text.length > 6 ? link_text : ctx[0, 140]).to_s.strip
  return nil if ref.nil? || addr.empty?

  on_raw = extract_on_notice_raw(text_for_parse)
  proposal = text_for_parse.match(/proposal\s*[:\-]\s*([^—\-]+)\b/i)

  {
    council_reference: ref,
    address: addr,
    description: proposal ? proposal[1].strip : "Development Application",
    # The date found near the link is the only date available on this page; it
    # is stored as date_received and reused for on_notice_to by the caller.
    date_received: Util.parse_aus_date(on_raw),
    date_received_raw: on_raw,
    document_url: abs_url(base_url, a["href"].to_s)
  }
end

# Collects application records from the page.
#
# @param doc [Nokogiri::HTML::Document] parsed page
# @param base_url [String] URL the page was fetched from, used to absolutise links
# @return [Array<Hash>] hashes with keys :council_reference, :address,
#   :description, :date_received, :date_received_raw and :document_url
def parse_document_list(doc, base_url)
  # Look for clear "items": pdf links, or list/table rows containing one.
  doc.css("a").each_with_object([]) do |a, rows|
    next unless application_anchor?(a)

    row = row_from_anchor(a, base_url)
    rows << row if row
  end
end

begin
  html = Http.get(URL)
rescue => e
  warn "Failed to fetch #{URL}: #{e.class} #{e.message}"
  exit 1
end

doc = Nokogiri::HTML(html)
items = parse_document_list(doc, URL)
puts "Found #{items.length} item(s) for #{TABLE}"

items.each do |r|
  DB.upsert(TABLE, {
    description: r[:description],
    date_received: r[:date_received],
    date_received_raw: r[:date_received_raw],
    address: r[:address],
    council_reference: r[:council_reference],
    applicant: "",
    owner: ""
  })

  # Pass the row's own values: no bare `council_reference` / `address` is
  # defined in this file, so using those names here raises NameError.
  enrich_after_upsert!(
    table: TABLE,
    council_reference: r[:council_reference],
    address: r[:address]
  )

  # Extra columns are best effort: a failure here is logged, not fatal.
  begin
    upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ? WHERE council_reference = ? AND address = ?")
    upd.execute(r[:document_url], r[:date_received], r[:date_received_raw], r[:council_reference], r[:address])
  rescue => e
    warn "Extras update skipped for #{r[:council_reference]}: #{e.class} #{e.message}"
  end

  puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
end

puts "Done #{TABLE}."