Selaa lähdekoodia

King Island Initial Commit

Benjamin Harris 2 kuukautta sitten
vanhempi
sitoutus
7eb3f2709b
2 muutettua tiedostoa jossa 243 lisäystä ja 7 poistoa
  1. 9 1
      .claude/settings.local.json
  2. 234 6
      scrapers/kingisland.rb

+ 9 - 1
.claude/settings.local.json

@@ -39,7 +39,15 @@
       "Bash(curl -sv -L --max-time 15 -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36' --compressed https://www.burnie.tas.gov.au/Development/Planning/Permit-applications-on-exhibition)",
       "Bash(curl -s --max-time 10 -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36' 'https://www.derwentvalley.tas.gov.au/home/latest-news?f.News+category%7CnewsCategory=Public+Notice')",
       "Bash(sed -n '80,130p' f:/GIT_REPO/tas_councils/scrapers/derwentvalley.rb)",
-      "Bash(sed -n '50,75p' f:/GIT_REPO/tas_councils/scrapers/centralhighlands.rb)"
+      "Bash(sed -n '50,75p' f:/GIT_REPO/tas_councils/scrapers/centralhighlands.rb)",
+      "Bash(curl -s -L --max-time 30 -A \"Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36\" \"https://kingisland.tas.gov.au/develop/planning/\")",
+      "Bash(curl -s -L --max-time 30 -A \"Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/120.0.0.0 Safari/537.36\" -H \"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\" -H \"Accept-Language: en-AU,en;q=0.9\" \"https://www.kingisland.tas.gov.au/develop/planning/\")",
+      "Bash(python3 -c \"import sys; d=sys.stdin.read\\(\\); print\\('CF:', 'Just a moment' in d or 'Enable JavaScript' in d\\); print\\('403:', '403' in d[:500]\\); print\\(d[:2000]\\)\")",
+      "Bash(curl -v -L --max-time 30 -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/124.0.0.0 Safari/537.36' -H 'Accept-Encoding: gzip,deflate' -H 'Referer: https://kingisland.tas.gov.au/' -c /tmp/ki3.txt -b /tmp/ki3.txt https://kingisland.tas.gov.au/)",
+      "Bash(curl -s -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/124.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://kingisland.tas.gov.au/develop/planning/ -o /tmp/ki_plan.html)",
+      "Read(//tmp/**)",
+      "Bash(curl -s -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/124.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://kingisland.tas.gov.au/develop/planning/ -o C:/Users/lumion/AppData/Local/Temp/ki_plan.html)",
+      "Read(//c/Users/lumion/AppData/Local/Temp/**)"
     ]
   }
 }

+ 234 - 6
scrapers/kingisland.rb

@@ -1,11 +1,239 @@
-# George Town Council — Development Applications (site page, not PlanBuild)
+# King Island Council — Advertised Development Applications
+#
+# Source: https://kingisland.tas.gov.au/develop/planning/
+#
+# The site returns HTTP 403 on direct requests but succeeds after a homepage
+# warmup using browser-like headers (same technique as burnie.rb).
+# Accept-Encoding: identity is used to avoid gzip decompression complexity.
+#
+# Page structure (WordPress accordion, id="accordion-1-c4"):
+#   <h2>Advertised development applications</h2>
+#   <p class="entry-title">...(preamble)...</p>
+#   <p>Notice of Planning Application – DA 2025/28 15 Kurrajong Street,
+#      Grassy, TAS 7256 – Visitor/workers' Accommodation.</p>
+#   <p>...representations no later than 2 April 2026...</p>
+#   <p><a href="https://kingisland.tas.gov.au/wp-content/uploads/DA-2025-28-...pdf">here</a></p>
 
+require "date"
 require "nokogiri"
-require_relative "../lib/http"
+require "net/http"
+require "uri"
+
+require_relative "../lib/db"
+require_relative "../lib/enrich"
+require_relative "../lib/log"
 require_relative "../lib/util"
-require_relative "../lib/scraper_helpers"
 
-TABLE = ENV.fetch("TABLE_NAME")  # run_all.sh sets from filename: da_georgetown
-URL   = "https://kingisland.tas.gov.au/develop/planning/"
+TABLE    = ENV.fetch("TABLE_NAME")  # run_all.sh sets from filename: da_kingisland
+BASE_URL = "https://kingisland.tas.gov.au"
+URL      = "#{BASE_URL}/develop/planning/"
+
+DB.ensure_table!(TABLE)
+
+# ----- Browser-like headers (WAF warmup technique from burnie.rb) -----
+# User-Agent string sent on every request; the Chrome major version (124)
+# matches the sec-ch-ua client hint below.
+UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " \
+     "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+
+# Default request headers. http_get merges a per-request "Sec-Fetch-Site" over
+# the value given here and adds "Referer"/"Cookie" when it has them. The hash
+# is frozen, so callers must build on a copy (Hash#merge) rather than mutate it.
+BASE_HEADERS = {
+    "User-Agent"                => UA,
+    "Accept"                    => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language"           => "en-AU,en;q=0.8",
+    # Ask for an uncompressed body so no gzip decoding is needed.
+    "Accept-Encoding"           => "identity",
+    "Upgrade-Insecure-Requests" => "1",
+    "Sec-Fetch-Dest"            => "document",
+    "Sec-Fetch-Mode"            => "navigate",
+    "Sec-Fetch-Site"            => "none",
+    "Sec-Fetch-User"            => "?1",
+    "sec-ch-ua"                 => '"Chromium";v="124", "Not.A/Brand";v="24", "Google Chrome";v="124"',
+    "sec-ch-ua-platform"        => '"Windows"',
+    "sec-ch-ua-mobile"          => "?0",
+    "Connection"                => "close",
+}.freeze
+
+# Minimal per-host cookie store for the warmup -> fetch sequence.
+#
+# Only the NAME=VALUE pair of each Set-Cookie header is kept; attributes such
+# as Path, Domain, Expires and Secure are ignored. Cookies are keyed by the
+# host string supplied by the caller, compared case-insensitively.
+class CookieJar
+    def initialize
+        # host (lower-cased) => { cookie name => cookie value }
+        @h = Hash.new { |store, host| store[host] = {} }
+    end
+
+    # Returns the Cookie request-header value for +host+, or "" when nothing
+    # has been stored for it.
+    def for(host)
+        key = host.to_s.downcase
+        return "" unless @h.key?(key)
+
+        @h[key].map { |name, value| "#{name}=#{value}" }.join("; ")
+    end
+
+    # Records every Set-Cookie header on +resp+ against +host+; a later cookie
+    # with the same name replaces the earlier value.
+    #
+    # resp - any object responding to get_fields("Set-Cookie")
+    # host - host name the response came from
+    #
+    # Returns the updated Cookie header string, or nil if resp set no cookies.
+    def merge_from(resp, host)
+        cookies = resp.get_fields("Set-Cookie") || []
+        return if cookies.empty?
+
+        key = host.to_s.downcase
+        cookies.each do |sc|
+            # to_s guards the empty-header case, where split(...).first is nil.
+            name, value = sc.to_s.split(";", 2).first.to_s.split("=", 2)
+            name = name.to_s.strip
+            next if name.empty?
+
+            # Trim the value as well as the name so "sid=abc ; Path=/" does not
+            # store "abc " and echo the stray whitespace back to the server.
+            @h[key][name] = value.to_s.strip
+        end
+
+        self.for(host)
+    end
+end
+
+# Fetches +url+ with browser-like headers, following up to five HTTP redirects.
+#
+# url        - absolute URL to request
+# jar        - CookieJar; cookies stored for the current host are sent, and any
+#              Set-Cookie headers on each response are merged back into it
+# referer    - optional Referer header value
+# fetch_site - value for the Sec-Fetch-Site header (default "none")
+#
+# Returns [status_code, body]. The body is "" when the final response is still
+# a redirect (redirect limit reached). Any StandardError is logged and reported
+# as [0, ""] so callers can treat network failures like a bad status.
+def http_get(url, jar:, referer: nil, fetch_site: "none")
+    uri  = URI(url)
+    code = 0
+    body = ""
+
+    5.times do
+        # Headers are rebuilt for every hop so the Cookie header reflects
+        # cookies set by the previous response and the host now being asked.
+        hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
+        hdrs["Referer"] = referer if referer
+        cookie = jar.for(uri.host)
+        hdrs["Cookie"] = cookie unless cookie.empty?
+
+        # The response is returned out of the Net::HTTP.start block so the
+        # redirect decision is made in the loop itself. A `next` inside the
+        # block would only leave the block, not start another iteration.
+        resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
+            http.request(Net::HTTP::Get.new(uri, hdrs))
+        end
+
+        jar.merge_from(resp, uri.host)
+        code = resp.code.to_i
+
+        location = resp["location"]
+        unless [301, 302, 303, 307, 308].include?(code) && location
+            body = resp.body.to_s
+            break
+        end
+
+        # Location may be relative; resolve it against the URL just requested.
+        uri = URI.join(uri, location)
+    end
+
+    [code, body]
+rescue StandardError => e
+    Log.warn "kingisland", "HTTP error for #{url}: #{e.class} #{e.message}"
+    [0, ""]
+end
+
+# ----- Warmup: hit homepage first to get cookies, then fetch planning page -----
+jar = CookieJar.new
+
+Log.info "kingisland", "Warming up via homepage..."
+code0, _body0 = http_get("#{BASE_URL}/", jar: jar)
+Log.info "kingisland", "Homepage: #{code0}"
+
+# Short pause between the two requests.
+sleep(0.5)
+
+Log.info "kingisland", "Fetching planning page..."
+code1, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
+Log.info "kingisland", "Planning page: #{code1} (#{html.bytesize} bytes)"
+
+# Test for a Cloudflare interstitial before the generic status/size guard.
+# Challenge pages are typically served with a non-200 status, so checking the
+# status first would always log the generic message and hide the real cause.
+# Both failure paths exit 0 so a blocked fetch does not fail the whole run.
+if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
+    Log.warn "kingisland", "Cloudflare challenge returned. " \
+        "King Island DAs are also available via planbuild.rb (council code KIS -> da_kingisland)."
+    exit 0
+end
+
+# Anything other than a 200 with a plausibly complete page (>= 5,000 bytes)
+# is treated as a failed fetch.
+if code1 != 200 || html.bytesize < 5_000
+    Log.warn "kingisland", "Could not fetch planning page (status #{code1}). " \
+        "King Island DAs are also available via planbuild.rb (council code KIS -> da_kingisland)."
+    exit 0
+end
+
+# ----- Parse -----
+# Ref format: DA 2025/28  (year/sequential)
+REF_RX = /\bDA\s*\d{4}\/\d{1,4}\b/i
+
+doc = Nokogiri::HTML(html)
+
+# The advertised applications are inside div#accordion-1-c4.
+# If the div id ever changes, fall back to finding the h2 by text and using
+# its parent element. at_xpath (first match or nil) is required for the
+# fallback: xpath returns a NodeSet, which is never nil and has no #parent,
+# so `xpath(...)&.parent` would raise NoMethodError instead of falling back.
+section = doc.at_css("div#accordion-1-c4") ||
+          doc.at_xpath('//h2[contains(translate(., "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "advertised development")]')&.parent
+
+# Nothing to parse: report zero items and exit cleanly.
+unless section
+    Log.warn "kingisland", "Could not find 'Advertised development applications' section on page."
+    puts "Done #{TABLE}. Saved 0 item(s)."
+    exit 0
+end
+
+# Walk every <p> in the section. A paragraph containing a DA reference starts
+# an application; its closing date and PDF link are looked up in the
+# paragraphs that follow it.
+paragraphs = section.css("p").to_a
+saved = 0
+
+paragraphs.each_with_index do |para, idx|
+    text = para.text.gsub(/[[:space:]]+/, " ").strip
+    next unless (m = text.match(REF_RX))
+
+    ref = m[0].strip.gsub(/\s+/, " ")
+
+    # Strip any "Notice of Planning Application" prefix and the ref itself,
+    # leaving "ADDRESS – DESCRIPTION."
+    rest = text
+        .sub(/Notice\s+of\s+Planning\s+Application\s*[-\u2013\u2014]?\s*/i, "")
+        .sub(ref, "")
+        .gsub(/\A[\s\-\u2013\u2014]+/, "")
+        .gsub(/[.\s]+\z/, "")
+
+    # Split at the last spaced hyphen, en-dash or em-dash to separate the
+    # address from the description.
+    if (split_idx = rest.rindex(/\s[\-\u2013\u2014]\s/))
+        address     = rest[0, split_idx].strip
+        description = rest[(split_idx + 1)..]&.gsub(/\A[\s\-\u2013\u2014]+/, "")&.strip
+    else
+        address     = rest.strip
+        description = "Development Application"
+    end
+
+    next if address.empty?
+
+    # Scan forward up to 5 paragraphs for closing date and PDF link
+    on_notice_to_raw = ""
+    on_notice_to     = nil
+    doc_url          = nil
+
+    (1..5).each do |offset|
+        break if idx + offset >= paragraphs.length
+        fwd      = paragraphs[idx + offset]
+        fwd_text = fwd.text.gsub(/[[:space:]]+/, " ").strip
+
+        # A paragraph naming a different DA reference belongs to the next
+        # application; stop so its closing date / PDF are not attributed to
+        # this one when this entry is missing either of them.
+        other_ref = fwd_text[REF_RX]
+        break if other_ref && !other_ref.delete(" ").casecmp?(ref.delete(" "))
+
+        if on_notice_to_raw.empty? && fwd_text =~ /no\s+later\s+than|representations|closing/i
+            # First "D Month YYYY" date in the paragraph.
+            if (dm = fwd_text.match(/\b(\d{1,2})\s+([A-Za-z]{3,})\s+(\d{4})\b/))
+                on_notice_to_raw = "#{dm[1]} #{dm[2]} #{dm[3]}"
+                on_notice_to     = Util.parse_aus_date(on_notice_to_raw)
+            end
+        end
+
+        if doc_url.nil?
+            # Consider every link in the paragraph, not just the first one, so a
+            # leading non-PDF link does not hide the document link.
+            pdf_link = fwd.css("a[href]").find { |a| a["href"].to_s =~ /\.pdf/i }
+            doc_url  = pdf_link["href"].strip if pdf_link
+        end
+    end
+
+    # A failure for one application is logged and does not stop the others.
+    begin
+        DB.upsert(TABLE, {
+            council_reference: ref,
+            address:           address[0, 255],
+            description:       description.to_s,
+            date_received:     nil,
+            date_received_raw: "",
+            on_notice_to:      on_notice_to,
+            on_notice_to_raw:  on_notice_to_raw,
+            document_url:      doc_url,
+            applicant:         "",
+            owner:             ""
+        })
+
+        enrich_after_upsert!(
+            table:             TABLE,
+            council_reference: ref,
+            address:           address
+        )
+
+        Log.info "kingisland", "Upserted #{ref} -> #{address}"
+        saved += 1
+    rescue StandardError => e
+        Log.warn "kingisland", "DB error for #{ref}: #{e.class} #{e.message}"
+    end
+end
 
-DB.ensure_table!(TABLE)
+puts "Done #{TABLE}. Saved #{saved} item(s)."