Sfoglia il codice sorgente

huonvalley changes v1

Benjamin Harris 2 mesi fa
parent
commit
598ac7370e
2 file modificati con 24 aggiunte e 23 eliminazioni
  1. 2 1
      .claude/settings.local.json
  2. 22 22
      scrapers/huonvalley.rb

+ 2 - 1
.claude/settings.local.json

@@ -28,7 +28,8 @@
       "Bash(grep -l \"upsert_and_enrich!\" *.rb)",
       "Bash(grep -l \"def abs_url\" *.rb)",
       "WebFetch(domain:www.southernmidlands.tas.gov.au)",
-      "Bash(python3 -)"
+      "Bash(python3 -)",
+      "WebFetch(domain:www.huonvalley.tas.gov.au)"
     ]
   }
 }

+ 22 - 22
scrapers/huonvalley.rb

@@ -1,13 +1,13 @@
 # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
 # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
 #
-# Page structure per application:
-#   <h3 class="application-heading">DA-37/2026</h3>
-#   <h4>Description, Address (land title ref)</h4>
-#   <div class="more-information">
-#     <h3>Available Documents:</h3>
-#     <a href="sharepoint...">Copy of application for viewing</a>
-#   </div>
+# Page structure per application (flat siblings, no wrapper div):
+#   <h2>DA-37/2026</h2>
+#   <p>Description, Address (CT-land-title-ref)</p>
+#   <h3>More Information</h3>
+#   <a href="mapbox...">...</a>
+#   <h3>Available Documents:</h3>
+#   <a href="sharepoint...">Copy of application for viewing</a>
 
 require "nokogiri"
 require "uri"
@@ -37,42 +37,42 @@ def parse_page(html, base_url)
     doc  = Nokogiri::HTML(html)
     rows = []
 
-    doc.css("h3.application-heading").each do |h3|
-        ref = h3.text.strip
+    # Drive from each plain <h2> whose text matches the DA ref pattern
+    doc.css("h2").each do |h2|
+        ref = h2.text.strip
         next unless ref.match?(REF_RX)
 
-        # Walk forward siblings to find h4 (description+address) and
-        # the .more-information div (document link)
         desc_addr    = nil
         document_url = nil
 
-        sib = h3.next_element
-        10.times do
+        sib = h2.next_element
+        15.times do
             break if sib.nil?
-            if sib.name == "h4" && desc_addr.nil?
+            # First <p> after the heading holds description + address
+            if sib.name == "p" && desc_addr.nil?
                 desc_addr = sib.text.strip.gsub(/\s+/, " ")
             end
-            if sib["class"].to_s.include?("more-information")
-                link = sib.at_css("a[href]")
-                document_url = abs_url(base_url, link["href"]) if link
+            # Document link is the <a> whose text matches "Copy of application for viewing" (listed under <h3>Available Documents:</h3>)
+            if sib.name == "a" && sib.text.strip.match?(/copy of application for viewing/i)
+                document_url = abs_url(base_url, sib["href"])
                 break
             end
-            # Stop if we hit the next application heading
-            break if sib.name == "h3" && sib["class"].to_s.include?("application-heading")
+            # Stop at the next application's <h2>
+            break if sib.name == "h2" && sib.text.strip.match?(REF_RX)
             sib = sib.next_element
         end
 
         next if desc_addr.nil? || desc_addr.empty?
 
-        # Split "Dwelling description, 100 Street Name, Suburb (CT-ref)" into
-        # description + address. Address starts at the first ", <digits> " pattern.
+        # Split "Dwelling, outbuilding..., 100 Turners Road, Cradoc (CT-237651/1)"
+        # into description and address at the first ", <number> " pattern
         description, address = if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
             [m[1].strip, m[2].strip]
         else
             ["Development Application", desc_addr]
         end
 
-        # Strip land-title reference from end of address: "(CT-237651/1)"
+        # Strip cadastral reference from end of address: "(CT-237651/1)"
         address = address.sub(/\s*\(CT-[\d\/]+\)\s*\z/, "").strip
 
         next if address.empty?