# Huon Valley Council — Advertised Applications (site page, not PlanBuild) # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/ require "nokogiri" require "cgi" require_relative "../lib/http" require_relative "../lib/db" require_relative "../lib/util" require_relative "../lib/enrich" TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/" DB.ensure_table!(TABLE) # Optional: keep the SharePoint link begin DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL") rescue => e warn "document_url add skipped: #{e.class} #{e.message}" end REF_RX = %r{\bDA[-\s]?\d{1,4}/20\d{2}\b}i def abs_url(base, href) return "" if href.to_s.strip.empty? URI.join(base, href).to_s rescue href.to_s end def nearest_heading_text(node) h = node.xpath("preceding::h2[1] | preceding::h3[1]").first h ? h.text.strip : "" end def proposal_between_heading_and(node) # Walk back to the nearest heading, then take the first non-empty text sibling after it h = node.xpath("preceding::h2[1] | preceding::h3[1]").first return "" unless h sib = h 12.times do sib = sib.next_element break if sib.nil? t = sib.text.strip.gsub(/\s+/, " ") next if t.empty? || t.match?(/More Information/i) || t.match?(/Available Documents/i) return t end "" end def parse_page(html, base_url) doc = Nokogiri::HTML(html) # Each application has a SharePoint doc link labeled “Copy of application for viewing” anchors = doc.css("a").select { |a| href = a["href"].to_s a.text.to_s.strip.match?(/copy of application for viewing/i) || href.match?(/huonvalleycouncil\.sharepoint\.com/i) } rows = [] anchors.each do |a| document_url = abs_url(base_url, a["href"]) heading = nearest_heading_text(a) ref = heading[/#{REF_RX}/]&.strip || "" # Get a one-line proposal that appears just after the heading description = proposal_between_heading_and(a) description = "Development Application" if description.empty? # Address sometimes appears in the proposal. If not, keep a readable fallback. address = if description.match?(/\d+ .*?\b(TAS|Huon|Franklin|Cygnet|Dover|Ranelagh)\b/i) description else heading end next if ref.empty? || address.empty? rows << { council_reference: ref, address: address, description: description, date_received_raw: "", date_received: nil, document_url: document_url } end # Find a Next link for pagination next_href = nil if (next_a = doc.css("a").find { |x| x.text.to_s.strip.downcase == "next" }) next_href = abs_url(base_url, next_a["href"]) end [rows, next_href] end saved = 0 url = START_URL seen_refs = {} loop do begin html = Http.get(url) rescue => e warn "Failed to fetch #{url}: #{e.class} #{e.message}" break end rows, next_url = parse_page(html, url) rows.each do |r| # de-dup within a run next if seen_refs[[r[:council_reference], r[:address]]] seen_refs[[r[:council_reference], r[:address]]] = true DB.upsert(TABLE, { description: r[:description], date_received: r[:date_received], date_received_raw: r[:date_received_raw], address: r[:address], council_reference: r[:council_reference], applicant: "", owner: "" }) enrich_after_upsert!( table: TABLE, council_reference: council_reference, address: address ) begin upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?") upd.execute(r[:document_url], r[:council_reference], r[:address]) rescue Mysql2::Error => e warn "[huonvalley] db update skipped for #{r[:council_reference]}: #{e.message}" end puts "Upserted #{r[:council_reference]} -> #{r[:address]}" saved += 1 end break if next_url.nil? || next_url == url url = next_url end puts "Done #{TABLE}. Saved #{saved} item(s)."