| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
- # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
- # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
- require "nokogiri"
- require "cgi"
- require_relative "../lib/http"
- require_relative "../lib/db"
- require_relative "../lib/util"
- require_relative "../lib/enrich"
- TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley
- START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"
- DB.ensure_table!(TABLE)
- # Optional: keep the SharePoint link
- begin
- DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url TEXT NULL")
- rescue StandardError => e
- warn "document_url add skipped: #{e.class} #{e.message}"
- end
- REF_RX = %r{\bDA[-\s]?\d{1,4}/20\d{2}\b}i
- def abs_url(base, href)
- return "" if href.to_s.strip.empty?
- URI.join(base, href).to_s rescue href.to_s
- end
- def nearest_heading_text(node)
- h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
- h ? h.text.strip : ""
- end
- def proposal_between_heading_and(node)
- # Walk back to the nearest heading, then take the first non-empty text sibling after it
- h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
- return "" unless h
- sib = h
- 12.times do
- sib = sib.next_element
- break if sib.nil?
- t = sib.text.strip.gsub(/\s+/, " ")
- next if t.empty? || t.match?(/More Information/i) || t.match?(/Available Documents/i)
- return t
- end
- ""
- end
- def parse_page(html, base_url)
- doc = Nokogiri::HTML(html)
- # Each application has a SharePoint doc link labeled “Copy of application for viewing”
- anchors = doc.css("a").select { |a|
- href = a["href"].to_s
- a.text.to_s.strip.match?(/copy of application for viewing/i) || href.match?(/huonvalleycouncil\.sharepoint\.com/i)
- }
- rows = []
- anchors.each do |a|
- document_url = abs_url(base_url, a["href"])
- heading = nearest_heading_text(a)
- ref = heading[/#{REF_RX}/]&.strip || ""
- # Get a one-line proposal that appears just after the heading
- description = proposal_between_heading_and(a)
- description = "Development Application" if description.empty?
- # Address sometimes appears in the proposal. If not, keep a readable fallback.
- address = if description.match?(/\d+ .*?\b(TAS|Huon|Franklin|Cygnet|Dover|Ranelagh)\b/i)
- description
- else
- heading
- end
- next if ref.empty? || address.empty?
- rows << {
- council_reference: ref,
- address: address,
- description: description,
- date_received_raw: "",
- date_received: nil,
- document_url: document_url
- }
- end
- # Find a Next link for pagination
- next_href = nil
- if (next_a = doc.css("a").find { |x| x.text.to_s.strip.downcase == "next" })
- next_href = abs_url(base_url, next_a["href"])
- end
- [rows, next_href]
- end
- saved = 0
- url = START_URL
- seen_refs = {}
- loop do
- begin
- html = Http.get(url)
- rescue StandardError => e
- warn "Failed to fetch #{url}: #{e.class} #{e.message}"
- break
- end
- rows, next_url = parse_page(html, url)
- rows.each do |r|
- # de-dup within a run
- next if seen_refs[[r[:council_reference], r[:address]]]
- seen_refs[[r[:council_reference], r[:address]]] = true
- DB.upsert(TABLE, {
- description: r[:description],
- date_received: r[:date_received],
- date_received_raw: r[:date_received_raw],
- address: r[:address],
- council_reference: r[:council_reference],
- applicant: "",
- owner: ""
- })
-
- enrich_after_upsert!(
- table: TABLE,
- council_reference: council_reference,
- address: address
- )
- begin
- upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?")
- upd.execute(r[:document_url], r[:council_reference], r[:address])
- rescue Mysql2::Error => e
- warn "[huonvalley] db update skipped for #{r[:council_reference]}: #{e.message}"
- end
- puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
- saved += 1
- end
- break if next_url.nil? || next_url == url
- url = next_url
- end
- puts "Done #{TABLE}. Saved #{saved} item(s)."
|