Kaynağa Gözat

update to webFetch

Benjamin Harris 2 ay önce
ebeveyn
işleme
1ad06e1c48
2 değiştirilmiş dosya ile 17 ekleme ve 33 silme
  1. + 3 - 1
      .claude/settings.local.json
  2. + 14 - 32
      scrapers/huonvalley.rb

+ 3 - 1
.claude/settings.local.json

@@ -29,7 +29,9 @@
       "Bash(grep -l \"def abs_url\" *.rb)",
       "WebFetch(domain:www.southernmidlands.tas.gov.au)",
       "Bash(python3 -)",
-      "WebFetch(domain:www.huonvalley.tas.gov.au)"
+      "WebFetch(domain:www.huonvalley.tas.gov.au)",
+      "Bash(curl -s -A \"Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36\" \"https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/\")",
+      "Bash(python3 -c \" import sys from html.parser import HTMLParser class P\\(HTMLParser\\): def __init__\\(self\\): super\\(\\).__init__\\(\\) self.depth = 0 self.capture = False self.tag = None def handle_starttag\\(self, tag, attrs\\): d = dict\\(attrs\\) cls = d.get\\('class',''\\) if 'accordion' in cls or 'plan-file' in cls: print\\(f'<{tag} class=\\\\\"{cls}\\\\\">'\\) self.capture = True def handle_data\\(self, data\\): if self.capture and data.strip\\(\\): print\\(f' TEXT: {data.strip\\(\\)[:120]}'\\) def handle_endtag\\(self, tag\\): if tag in \\('h2','h3','a','p','div'\\) and self.capture: self.capture = False P\\(\\).feed\\(sys.stdin.read\\(\\)\\) \")"
     ]
   }
 }

+ 14 - 32
scrapers/huonvalley.rb

@@ -1,13 +1,12 @@
 # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
 # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
 #
-# Page structure per application (flat siblings, no wrapper div):
-#   <h2>DA-37/2026</h2>
-#   <p>Description, Address (CT-land-title-ref)</p>
-#   <h3>More Information</h3>
-#   <a href="mapbox...">...</a>
-#   <h3>Available Documents:</h3>
-#   <a href="sharepoint...">Copy of application for viewing</a>
+# Page structure per application:
+#   <div class="accordion-grid-item">
+#     <h2 class="accordion-grid-item__title">DA-37/2026</h2>
+#     <div class="accordion-grid-item__description">Description, Address (CT-ref)</div>
+#     <a class="plan-file-list__item" href="sharepoint...">Copy of application for viewing</a>
+#   </div>
 
 require "nokogiri"
 require "uri"
@@ -37,35 +36,18 @@ def parse_page(html, base_url)
     doc  = Nokogiri::HTML(html)
     rows = []
 
-    # Drive from each plain <h2> whose text matches the DA ref pattern
-    doc.css("h2").each do |h2|
-        ref = h2.text.strip
-        next unless ref.match?(REF_RX)
-
-        desc_addr    = nil
-        document_url = nil
-
-        sib = h2.next_element
-        15.times do
-            break if sib.nil?
-            # First <p> after the heading holds description + address
-            if sib.name == "p" && desc_addr.nil?
-                desc_addr = sib.text.strip.gsub(/\s+/, " ")
-            end
-            # Document link follows <h3>Available Documents:</h3>
-            if sib.name == "a" && sib.text.strip.match?(/copy of application for viewing/i)
-                document_url = abs_url(base_url, sib["href"])
-                break
-            end
-            # Stop at the next application's <h2>
-            break if sib.name == "h2" && sib.text.strip.match?(REF_RX)
-            sib = sib.next_element
-        end
+    doc.css("div.accordion-grid-item").each do |item|
+        ref       = item.at_css("h2.accordion-grid-item__title")&.text&.strip
+        desc_addr = item.at_css("div.accordion-grid-item__description")&.text&.strip&.gsub(/\s+/, " ")
+        doc_link  = item.at_css("a.plan-file-list__item")&.[]("href")
 
+        next if ref.nil? || !ref.match?(REF_RX)
         next if desc_addr.nil? || desc_addr.empty?
 
+        document_url = abs_url(base_url, doc_link)
+
         # Split "Dwelling, outbuilding..., 100 Turners Road, Cradoc (CT-237651/1)"
-        # into description and address at the first ", <number> " pattern
+        # into description + address at the first ", <digits> " pattern
         description, address = if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
             [m[1].strip, m[2].strip]
         else