| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
# Huon Valley Council — Advertised Applications (council website page, not PlanBuild)
# Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
#
# Page structure per application (flat sibling elements, no wrapper div):
#   <h2>DA-37/2026</h2>
#   <p>Description, Address (CT-land-title-ref)</p>
#   <h3>More Information</h3>
#   <a href="mapbox...">...</a>
#   <h3>Available Documents:</h3>
#   <a href="sharepoint...">Copy of application for viewing</a>
- require "nokogiri"
- require "uri"
- require "cgi"
- require_relative "../lib/http"
- require_relative "../lib/db"
- require_relative "../lib/util"
- require_relative "../lib/enrich"
- require_relative "../lib/log"
# Destination table name, read from the TABLE_NAME environment variable.
# ENV.fetch raises KeyError when the variable is unset, so the script stops
# before doing any work. (Per the original note, run_all.sh supplies
# "da_huonvalley" — confirm against that script.)
TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley

# First (and possibly only) listing page to fetch.
START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"

# Council reference in number/year order: "DA-37/2026", "DA 37/2026" or
# "DA37/2026" (the separator is optional), matched case-insensitively.
REF_RX = /\bDA[-\s]?\d{1,4}\/20\d{2}\b/i

# Make sure the destination table exists before any rows are written.
DB.ensure_table!(TABLE)
# Resolves +href+ against +base+ and returns the absolute URL as a String.
#
# Surrounding whitespace in +href+ is removed before resolving, because
# HTML attribute values often carry stray spaces or newlines that URI.join
# would otherwise reject.
#
# @param base [String] URL of the page the link was found on
# @param href [String, nil] raw href attribute value
# @return [String, nil] absolute URL, or nil when +href+ is blank or the
#   pair cannot be resolved (malformed URI, or both parts relative)
def abs_url(base, href)
  target = href.to_s.strip
  return nil if target.empty?

  URI.join(base, target).to_s
rescue URI::Error
  # URI::Error covers both InvalidURIError (unparseable input) and
  # BadURIError (e.g. a relative base), so one bad link never aborts a scrape.
  nil
end
# Splits "<description>, <street number> <street ...>" at the FIRST comma that
# is followed by a street number. The street number may carry a letter suffix
# ("100A") or be a range / unit form ("12-14", "1/23").
# Known limit: a description that itself contains ", <number> <word>" is
# split at that earlier point.
DESC_ADDR_SPLIT_RX = %r{\A(.+?),\s*(\d+[A-Za-z]?(?:[-/]\d+[A-Za-z]?)*\s+\S.+)\z}m

# Trailing land-title reference such as "(CT-237651/1)", "(CT 237651/1)" or
# "(CT-1/1 & CT-2/2)".
TITLE_REF_SUFFIX_RX = /\s*\(CT[-\s]?\d[^()]*\)\s*\z/i

# Link text of the downloadable application document.
DOCUMENT_LINK_RX = /copy of application for viewing/i

# Upper bound on sibling elements inspected after one application heading.
MAX_APPLICATION_SIBLINGS = 15

# Separates the combined paragraph text into description and address and
# removes a trailing land-title reference from the address.
#
# @param text [String] whitespace-normalised paragraph text
# @return [Array(String, String)] description and address; when no street
#   number boundary is found the description is "Development Application"
#   and the whole text is treated as the address
def split_description_and_address(text)
  description, address =
    if (m = text.match(DESC_ADDR_SPLIT_RX))
      [m[1].strip, m[2].strip]
    else
      ["Development Application", text]
    end

  [description, address.sub(TITLE_REF_SUFFIX_RX, "").strip]
end

# Walks the elements following an application heading and collects the text
# of the first <p> plus the resolved href of the document link.
#
# The walk ends at the document link, at the next application heading, at the
# end of the sibling list, or after MAX_APPLICATION_SIBLINGS elements.
#
# @param heading [Nokogiri::XML::Node] the application's <h2>
# @param base_url [String] page URL used to resolve relative links
# @return [Array(String, String), Array(nil, nil)] paragraph text and
#   document URL; either element may be nil when not found
def collect_application_details(heading, base_url)
  desc_addr = nil
  document_url = nil
  node = heading.next_element

  MAX_APPLICATION_SIBLINGS.times do
    break if node.nil?
    # The next application's heading marks the end of this one.
    break if node.name == "h2" && node.text.strip.match?(REF_RX)

    case node.name
    when "p"
      # Only the first paragraph holds description + address.
      desc_addr ||= node.text.strip.gsub(/\s+/, " ")
    when "a"
      if node.text.strip.match?(DOCUMENT_LINK_RX)
        document_url = abs_url(base_url, node["href"])
        break
      end
    end

    node = node.next_element
  end

  [desc_addr, document_url]
end

# Parses one listing page into application rows and the next-page link.
#
# Each application is driven from an <h2> whose text matches REF_RX; the
# heading text (stripped) is used as the council reference. Applications
# without a paragraph, or whose address ends up empty, are skipped.
#
# @param html [String] page markup
# @param base_url [String] URL the markup was fetched from
# @return [Array(Array<Hash>, String)] rows and the absolute URL of the
#   "Next" link (nil when the page has none). Each row has the keys
#   :council_reference, :address (max 255 chars), :description,
#   :date_received_raw (always ""), :date_received (always nil) and
#   :document_url (may be nil)
def parse_page(html, base_url)
  doc = Nokogiri::HTML(html)

  rows = doc.css("h2").filter_map do |h2|
    ref = h2.text.strip
    next unless ref.match?(REF_RX)

    desc_addr, document_url = collect_application_details(h2, base_url)
    next if desc_addr.nil? || desc_addr.empty?

    description, address = split_description_and_address(desc_addr)
    next if address.empty?

    {
      council_reference: ref,
      address: address[0, 255],
      description: description,
      # The page does not publish a lodgement date.
      date_received_raw: "",
      date_received: nil,
      document_url: document_url
    }
  end

  # Pagination: an anchor whose text is exactly "next" (case-insensitive).
  next_link = doc.css("a").find { |a| a.text.strip.downcase == "next" }
  next_href = next_link && abs_url(base_url, next_link["href"])

  [rows, next_href]
end
# Main run: fetch each listing page, upsert every application found, then
# follow the "Next" link until there is none or it points at a page that was
# already fetched.
saved = 0
url = START_URL
seen = {}    # [council_reference, address] pairs already written this run
visited = {} # page URLs already fetched; guards against pagination cycles

loop do
  visited[url] = true

  html = begin
    Http.get(url)
  rescue StandardError => e
    # A failed fetch ends pagination but keeps whatever was saved so far.
    Log.warn "huonvalley", "Failed to fetch #{url}: #{e.class} #{e.message}"
    break
  end

  rows, next_url = parse_page(html, url)
  puts "Found #{rows.length} item(s) on #{url}"

  rows.each do |r|
    key = [r[:council_reference], r[:address]]
    next if seen[key]

    seen[key] = true

    begin
      DB.upsert(TABLE, {
        description: r[:description],
        date_received: r[:date_received],
        date_received_raw: r[:date_received_raw],
        address: r[:address],
        council_reference: r[:council_reference],
        document_url: r[:document_url],
        applicant: "",
        owner: ""
      })
      enrich_after_upsert!(
        table: TABLE,
        council_reference: r[:council_reference],
        address: r[:address]
      )
      Log.info "huonvalley", "Upserted #{r[:council_reference]} -> #{r[:address]}"
      saved += 1
    rescue StandardError => e
      # One bad row must not stop the rest of the page from being saved.
      Log.warn "huonvalley", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
    end
  end

  # Stop on the last page, on a self-link, or on any longer loop
  # (e.g. page 2 linking back to page 1).
  break if next_url.nil? || visited[next_url]

  url = next_url
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|