|
@@ -1,13 +1,13 @@
|
|
|
# Huon Valley Council — Advertised Applications (site page, not PlanBuild)
|
|
# Huon Valley Council — Advertised Applications (site page, not PlanBuild)
|
|
|
# Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
|
|
# Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
|
|
|
#
|
|
#
|
|
|
-# Page structure per application:
|
|
|
|
|
-# <h3 class="application-heading">DA-37/2026</h3>
|
|
|
|
|
-# <h4>Description, Address (land title ref)</h4>
|
|
|
|
|
-# <div class="more-information">
|
|
|
|
|
-# <h3>Available Documents:</h3>
|
|
|
|
|
-# <a href="sharepoint...">Copy of application for viewing</a>
|
|
|
|
|
-# </div>
|
|
|
|
|
|
|
+# Page structure per application (flat siblings, no wrapper div):
|
|
|
|
|
+# <h2>DA-37/2026</h2>
|
|
|
|
|
+# <p>Description, Address (CT-land-title-ref)</p>
|
|
|
|
|
+# <h3>More Information</h3>
|
|
|
|
|
+# <a href="mapbox...">...</a>
|
|
|
|
|
+# <h3>Available Documents:</h3>
|
|
|
|
|
+# <a href="sharepoint...">Copy of application for viewing</a>
|
|
|
|
|
|
|
|
require "nokogiri"
|
|
require "nokogiri"
|
|
|
require "uri"
|
|
require "uri"
|
|
@@ -37,42 +37,42 @@ def parse_page(html, base_url)
|
|
|
doc = Nokogiri::HTML(html)
|
|
doc = Nokogiri::HTML(html)
|
|
|
rows = []
|
|
rows = []
|
|
|
|
|
|
|
|
- doc.css("h3.application-heading").each do |h3|
|
|
|
|
|
- ref = h3.text.strip
|
|
|
|
|
|
|
+ # Iterate over every <h2>; headings whose text does not match REF_RX (the DA ref pattern) are skipped
|
|
|
|
|
+ doc.css("h2").each do |h2|
|
|
|
|
|
+ ref = h2.text.strip
|
|
|
next unless ref.match?(REF_RX)
|
|
next unless ref.match?(REF_RX)
|
|
|
|
|
|
|
|
- # Walk forward siblings to find h4 (description+address) and
|
|
|
|
|
- # the .more-information div (document link)
|
|
|
|
|
desc_addr = nil
|
|
desc_addr = nil
|
|
|
document_url = nil
|
|
document_url = nil
|
|
|
|
|
|
|
|
- sib = h3.next_element
|
|
|
|
|
- 10.times do
|
|
|
|
|
|
|
+ sib = h2.next_element
|
|
|
|
|
+ 15.times do
|
|
|
break if sib.nil?
|
|
break if sib.nil?
|
|
|
- if sib.name == "h4" && desc_addr.nil?
|
|
|
|
|
|
|
+ # First <p> after the heading holds description + address
|
|
|
|
|
+ if sib.name == "p" && desc_addr.nil?
|
|
|
desc_addr = sib.text.strip.gsub(/\s+/, " ")
|
|
desc_addr = sib.text.strip.gsub(/\s+/, " ")
|
|
|
end
|
|
end
|
|
|
- if sib["class"].to_s.include?("more-information")
|
|
|
|
|
- link = sib.at_css("a[href]")
|
|
|
|
|
- document_url = abs_url(base_url, link["href"]) if link
|
|
|
|
|
|
|
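+ # Document link: the <a> sibling whose text is "Copy of application for viewing" (case-insensitive); on the page it follows <h3>Available Documents:</h3>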
|
|
|
|
|
+ if sib.name == "a" && sib.text.strip.match?(/copy of application for viewing/i)
|
|
|
|
|
+ document_url = abs_url(base_url, sib["href"])
|
|
|
break
|
|
break
|
|
|
end
|
|
end
|
|
|
- # Stop if we hit the next application heading
|
|
|
|
|
- break if sib.name == "h3" && sib["class"].to_s.include?("application-heading")
|
|
|
|
|
|
|
+ # Stop at the next application's <h2>
|
|
|
|
|
+ break if sib.name == "h2" && sib.text.strip.match?(REF_RX)
|
|
|
sib = sib.next_element
|
|
sib = sib.next_element
|
|
|
end
|
|
end
|
|
|
|
|
|
|
|
next if desc_addr.nil? || desc_addr.empty?
|
|
next if desc_addr.nil? || desc_addr.empty?
|
|
|
|
|
|
|
|
- # Split "Dwelling description, 100 Street Name, Suburb (CT-ref)" into
|
|
|
|
|
- # description + address. Address starts at the first ", <digits> " pattern.
|
|
|
|
|
|
|
+ # Split "Dwelling, outbuilding..., 100 Turners Road, Cradoc (CT-237651/1)"
|
|
|
|
|
+ # into description and address at the first ", <number> " pattern
|
|
|
description, address = if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
|
|
description, address = if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
|
|
|
[m[1].strip, m[2].strip]
|
|
[m[1].strip, m[2].strip]
|
|
|
else
|
|
else
|
|
|
["Development Application", desc_addr]
|
|
["Development Application", desc_addr]
|
|
|
end
|
|
end
|
|
|
|
|
|
|
|
- # Strip land-title reference from end of address: "(CT-237651/1)"
|
|
|
|
|
|
|
+ # Strip cadastral reference from end of address: "(CT-237651/1)"
|
|
|
address = address.sub(/\s*\(CT-[\d\/]+\)\s*\z/, "").strip
|
|
address = address.sub(/\s*\(CT-[\d\/]+\)\s*\z/, "").strip
|
|
|
|
|
|
|
|
next if address.empty?
|
|
next if address.empty?
|