# Huon Valley Council — Advertised Applications (site page, not PlanBuild)
# Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
#
# Page structure per application:
#   <h3 class="application-heading">DA-37/2026</h3>
#   <h4>Description, Address (land title ref)</h4>
#   <div class="more-information">
#     <h3>Available Documents:</h3>
#     <a href="sharepoint...">Copy of application for viewing</a>
#   </div>

require "nokogiri"
require "uri"
require "cgi"

require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/enrich"
require_relative "../lib/log"

# Destination table name; ENV.fetch raises KeyError when TABLE_NAME is unset.
TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_huonvalley

# First page of the council's advertised-applications listing.
START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/".freeze

# Runs before any scraping so the upserts below have a table to write to.
DB.ensure_table!(TABLE)

# Council reference in number/year order: "DA-37/2026", "DA 37/2026" or
# "DA37/2026" (case-insensitive, 1-4 digit number, 20xx year).
# %r{} is used so the "/" separator needs no escaping.
REF_RX = %r{\bDA[-\s]?\d{1,4}/20\d{2}\b}i

# Resolves +href+ against +base+ and returns the absolute URL.
#
# @param base [String] URL of the page the link was found on
# @param href [String, nil] raw href attribute value (may be relative)
# @return [String, nil] the absolute URL, or nil when +href+ is blank or the
#   pair cannot be resolved into a URL
def abs_url(base, href)
  # Attribute values scraped from HTML can carry stray whitespace/newlines,
  # which URI.join would otherwise reject as an invalid URI.
  target = href.to_s.strip
  return nil if target.empty?

  URI.join(base, target).to_s
rescue URI::Error
  # URI::Error is the parent of both InvalidURIError (malformed input) and
  # BadURIError (e.g. base and href are both relative).
  nil
end

# Walks the elements following an application heading and collects the text
# of the first <h4> and the first link inside the "more-information" element.
# Looks at no more than 10 following siblings and stops early at the next
# application heading.
#
# @return [Array(String, String), Array(nil, nil)] [desc_addr, document_url];
#   either entry is nil when the corresponding element was not found
def application_details(h3, base_url)
  desc_addr = nil
  document_url = nil

  sib = h3.next_element
  10.times do
    break if sib.nil?

    css_class = sib["class"].to_s
    # Reached the next application: anything further belongs to it.
    break if sib.name == "h3" && css_class.include?("application-heading")

    # Only the first <h4> is used; whitespace runs are collapsed to one space.
    desc_addr ||= sib.text.strip.gsub(/\s+/, " ") if sib.name == "h4"

    if css_class.include?("more-information")
      link = sib.at_css("a[href]")
      document_url = abs_url(base_url, link["href"]) if link
      break
    end

    sib = sib.next_element
  end

  [desc_addr, document_url]
end

# Splits "Dwelling description, 100 Street Name, Suburb (CT-237651/1)" into
# [description, address]. The address starts at the first ", <digits> "
# pattern; when there is none the whole string is used as the address and the
# description falls back to "Development Application". A trailing land-title
# reference such as "(CT-237651/1)" is removed from the address.
def split_description_and_address(desc_addr)
  description, address =
    if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
      [m[1].strip, m[2].strip]
    else
      ["Development Application", desc_addr]
    end

  [description, address.sub(%r{\s*\(CT-[\d/]+\)\s*\z}, "").strip]
end

# Parses one listing page.
#
# @param html [String] page markup
# @param base_url [String] URL the markup was fetched from, used to resolve
#   relative links
# @return [Array(Array<Hash>, String), Array(Array<Hash>, nil)] the rows found
#   on the page and the absolute URL of the "Next" page (nil when there is no
#   such link). Each row has the keys :council_reference, :address,
#   :description, :date_received_raw, :date_received and :document_url.
def parse_page(html, base_url)
  doc = Nokogiri::HTML(html)

  rows = doc.css("h3.application-heading").map do |h3|
    # Keep only the reference itself, not any extra text in the heading.
    ref = h3.text[REF_RX]
    next unless ref

    desc_addr, document_url = application_details(h3, base_url)
    next if desc_addr.nil? || desc_addr.empty?

    description, address = split_description_and_address(desc_addr)
    next if address.empty?

    {
      council_reference: ref,
      address: address[0, 255], # capped at 255 characters
      description: description,
      date_received_raw: "", # the page carries no lodgement date
      date_received: nil,
      document_url: document_url
    }
  end.compact

  # Pagination: find a "Next" link
  next_a = doc.css("a").find { |a| a.text.strip.downcase == "next" }
  next_href = next_a && abs_url(base_url, next_a["href"])

  [rows, next_href]
end

# Main run: fetch each listing page, upsert every application found, then
# follow the "Next" link until there is none or it leads to a page that has
# already been fetched.
saved = 0
url = START_URL
seen = {}    # [council_reference, address] pairs already handled in this run
visited = {} # page URLs already fetched; guards against pagination cycles

loop do
  visited[url] = true

  html = begin
    Http.get(url)
  rescue StandardError => e
    Log.warn "huonvalley", "Failed to fetch #{url}: #{e.class} #{e.message}"
    break
  end

  rows, next_url = parse_page(html, url)
  puts "Found #{rows.length} item(s) on #{url}"

  rows.each do |r|
    # De-duplicate within a run.
    key = [r[:council_reference], r[:address]]
    next if seen[key]
    seen[key] = true

    begin
      DB.upsert(TABLE, {
        description: r[:description],
        date_received: r[:date_received],
        date_received_raw: r[:date_received_raw],
        address: r[:address],
        council_reference: r[:council_reference],
        document_url: r[:document_url],
        applicant: "",
        owner: ""
      })

      enrich_after_upsert!(
        table: TABLE,
        council_reference: r[:council_reference],
        address: r[:address]
      )

      Log.info "huonvalley", "Upserted #{r[:council_reference]} -> #{r[:address]}"
      saved += 1
    rescue StandardError => e
      # One failing row is logged and skipped so the rest of the page is still saved.
      Log.warn "huonvalley", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
    end
  end

  # Stop on the last page, on a self-link, or when "Next" loops back to a
  # page fetched earlier in this run.
  break if next_url.nil? || visited[next_url]
  url = next_url
end

puts "Done #{TABLE}. Saved #{saved} item(s)."