소스 검색

huonvalley updates

Benjamin Harris 2 달 전
부모
커밋
2129e6b505
1개의 변경된 파일, 123개의 추가 그리고 108개의 삭제
  1. 123 108
      scrapers/huonvalley.rb

+ 123 - 108
scrapers/huonvalley.rb

@@ -1,133 +1,148 @@
 # Huon Valley Council — Advertised Applications (site page, not PlanBuild)
 # Source: https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/
+#
+# Page structure per application:
+#   <h3 class="application-heading">DA-37/2026</h3>
+#   <h4>Description, Address (land title ref)</h4>
+#   <div class="more-information">
+#     <h3>Available Documents:</h3>
+#     <a href="sharepoint...">Copy of application for viewing</a>
+#   </div>
 
 require "nokogiri"
+require "uri"
 require "cgi"
-
+require_relative "../lib/http"
+require_relative "../lib/db"
+require_relative "../lib/util"
 require_relative "../lib/enrich"
 require_relative "../lib/log"
-TABLE = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_huonvalley
+
+TABLE     = ENV.fetch("TABLE_NAME")  # run_all.sh -> da_huonvalley
 START_URL = "https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/"
 
 DB.ensure_table!(TABLE)
 
-REF_RX = %r{\bDA[-\s]?\d{1,4}/20\d{2}\b}i
+# DA-37/2026 or DA 37/2026 (number/year order)
+REF_RX = /\bDA[-\s]?\d{1,4}\/20\d{2}\b/i
 
 def abs_url(base, href)
-  return "" if href.to_s.strip.empty?
-  URI.join(base, href).to_s rescue href.to_s
-end
-
-def nearest_heading_text(node)
-  h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
-  h ? h.text.strip : ""
-end
-
-def proposal_between_heading_and(node)
-  # Walk back to the nearest heading, then take the first non-empty text sibling after it
-  h = node.xpath("preceding::h2[1] | preceding::h3[1]").first
-  return "" unless h
-  sib = h
-  12.times do
-    sib = sib.next_element
-    break if sib.nil?
-    t = sib.text.strip.gsub(/\s+/, " ")
-    next if t.empty? || t.match?(/More Information/i) || t.match?(/Available Documents/i)
-    return t
-  end
-  ""
+    # Resolve href against base and return the absolute URL as a String.
+    # Returns nil when href is blank or the pair cannot be joined.
+    return nil if href.to_s.strip.empty?
+    # Strip first: surrounding whitespace is legal in an HTML href attribute
+    # but makes URI.join reject an otherwise valid link.
+    URI.join(base, href.to_s.strip).to_s
+rescue URI::Error
+    # URI::Error covers InvalidURIError (malformed href) and BadURIError
+    # (e.g. both base and href relative); URI.join can raise either.
+    nil
 end
 
 def parse_page(html, base_url)
-  doc = Nokogiri::HTML(html)
-
-  # Each application has a SharePoint doc link labeled “Copy of application for viewing”
-  anchors = doc.css("a").select { |a|
-    href = a["href"].to_s
-    a.text.to_s.strip.match?(/copy of application for viewing/i) || href.match?(/huonvalleycouncil\.sharepoint\.com/i)
-  }
-
-  rows = []
-
-  anchors.each do |a|
-    document_url = abs_url(base_url, a["href"])
-    heading      = nearest_heading_text(a)
-    ref          = heading[/#{REF_RX}/]&.strip || ""
-
-    # Get a one-line proposal that appears just after the heading
-    description  = proposal_between_heading_and(a)
-    description  = "Development Application" if description.empty?
-
-    # Address sometimes appears in the proposal. If not, keep a readable fallback.
-    address = if description.match?(/\d+ .*?\b(TAS|Huon|Franklin|Cygnet|Dover|Ranelagh)\b/i)
-      description
-    else
-      heading
+    # Parse one listing page. Returns [rows, next_href]: rows is an Array of
+    # Hashes, one per application heading that yields both a reference and a
+    # non-empty address; next_href is the resolved URL of a link whose text
+    # is "Next", or nil when the page has none.
+    doc  = Nokogiri::HTML(html)
+    rows = []
+
+    doc.css("h3.application-heading").each do |h3|
+        # Keep only the matched reference so any extra heading text
+        # (e.g. "DA-37/2026 - amended") never ends up in council_reference.
+        ref = h3.text[REF_RX]
+        next unless ref
+
+        # Walk forward siblings to find h4 (description+address) and
+        # the .more-information div (document link)
+        desc_addr    = nil
+        document_url = nil
+
+        sib = h3.next_element
+        10.times do
+            break if sib.nil?
+            if sib.name == "h4" && desc_addr.nil?
+                desc_addr = sib.text.strip.gsub(/\s+/, " ")
+            end
+            if sib["class"].to_s.include?("more-information")
+                link = sib.at_css("a[href]")
+                document_url = abs_url(base_url, link["href"]) if link
+                break
+            end
+            # Stop if we hit the next application heading
+            break if sib.name == "h3" && sib["class"].to_s.include?("application-heading")
+            sib = sib.next_element
+        end
+
+        next if desc_addr.nil? || desc_addr.empty?
+
+        # Split "Dwelling description, 100 Street Name, Suburb (CT-ref)" into
+        # description + address. Address starts at the first ", <digits> " pattern.
+        description, address = if (m = desc_addr.match(/\A(.+?),\s*(\d+\s+\S.+)\z/m))
+            [m[1].strip, m[2].strip]
+        else
+            ["Development Application", desc_addr]
+        end
+
+        # Strip land-title reference from end of address: "(CT-237651/1)"
+        address = address.sub(/\s*\(CT-[\d\/]+\)\s*\z/, "").strip
+
+        next if address.empty?
+
+        rows << {
+            council_reference: ref,
+            address:           address[0, 255],
+            description:       description,
+            date_received_raw: "",
+            date_received:     nil,
+            document_url:      document_url
+        }
     end
 
-    next if ref.empty? || address.empty?
-
-    rows << {
-      council_reference: ref,
-      address: address,
-      description: description,
-      date_received_raw: "",
-      date_received: nil,
-      document_url: document_url
-    }
-  end
-
-  # Find a Next link for pagination
-  next_href = nil
-  if (next_a = doc.css("a").find { |x| x.text.to_s.strip.downcase == "next" })
-    next_href = abs_url(base_url, next_a["href"])
-  end
-
-  [rows, next_href]
+    # Pagination: find a "Next" link
+    next_href = nil
+    if (next_a = doc.css("a").find { |a| a.text.strip.downcase == "next" })
+        next_href = abs_url(base_url, next_a["href"])
+    end
+
+    [rows, next_href]
 end
 
-saved = 0
-url = START_URL
-seen_refs = {}
+# Crawl from START_URL, following "Next" links page by page. Each unique
+# (council_reference, address) pair is upserted once per run; `saved` counts
+# the rows that were upserted and enriched without raising.
+saved    = 0
+url      = START_URL
+seen     = {}
+visited  = {}  # page URLs already fetched, used to stop on pagination cycles
 
 loop do
-  begin
-    html = Http.get(url)
-  rescue StandardError => e
-    Log.warn "scraper", "Failed to fetch #{url}: #{e.class} #{e.message}"
-    break
-  end
-
-  rows, next_url = parse_page(html, url)
-
-  rows.each do |r|
-    # de-dup within a run
-    next if seen_refs[[r[:council_reference], r[:address]]]
-    seen_refs[[r[:council_reference], r[:address]]] = true
-
-    DB.upsert(TABLE, {
-      description:       r[:description],
-      date_received:     r[:date_received],
-      date_received_raw: r[:date_received_raw],
-      address:           r[:address],
-      council_reference: r[:council_reference],
-      document_url:      r[:document_url],
-      applicant:         "",
-      owner:             ""
-    })
-
-    enrich_after_upsert!(
-      table:             TABLE,
-      council_reference: r[:council_reference],
-      address:           r[:address]
-    )
-
-    puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
-    saved += 1
-  end
-
-  break if next_url.nil? || next_url == url
-  url = next_url
+    visited[url] = true
+
+    # A failed fetch ends the crawl; rows saved so far are kept.
+    html = begin
+        Http.get(url)
+    rescue StandardError => e
+        Log.warn "huonvalley", "Failed to fetch #{url}: #{e.class} #{e.message}"
+        break
+    end
+
+    rows, next_url = parse_page(html, url)
+    puts "Found #{rows.length} item(s) on #{url}"
+
+    rows.each do |r|
+        # De-duplicate within a run on (reference, address).
+        key = [r[:council_reference], r[:address]]
+        next if seen[key]
+        seen[key] = true
+
+        # A failure on one row is logged and skipped so the rest still save.
+        begin
+            DB.upsert(TABLE, {
+                description:       r[:description],
+                date_received:     r[:date_received],
+                date_received_raw: r[:date_received_raw],
+                address:           r[:address],
+                council_reference: r[:council_reference],
+                document_url:      r[:document_url],
+                applicant:         "",
+                owner:             ""
+            })
+
+            enrich_after_upsert!(
+                table:             TABLE,
+                council_reference: r[:council_reference],
+                address:           r[:address]
+            )
+
+            Log.info "huonvalley", "Upserted #{r[:council_reference]} -> #{r[:address]}"
+            saved += 1
+        rescue StandardError => e
+            Log.warn "huonvalley", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
+        end
+    end
+
+    # Stop when there is no next page or it points at a page already crawled
+    # (covers self-links as well as longer A -> B -> A cycles).
+    break if next_url.nil? || visited[next_url]
+    url = next_url
 end
 
 puts "Done #{TABLE}. Saved #{saved} item(s)."