|
|
@@ -1,11 +1,239 @@
|
|
|
-# George Town Council — Development Applications (site page, not PlanBuild)
|
|
|
+# King Island Council — Advertised Development Applications
|
|
|
+#
|
|
|
+# Source: https://kingisland.tas.gov.au/develop/planning/
|
|
|
+#
|
|
|
+# The site returns HTTP 403 on direct requests but succeeds after a homepage
|
|
|
+# warmup using browser-like headers (same technique as burnie.rb).
|
|
|
+# Accept-Encoding: identity is used to avoid gzip decompression complexity.
|
|
|
+#
|
|
|
+# Page structure (WordPress accordion, id="accordion-1-c4"):
|
|
|
+# <h2>Advertised development applications</h2>
|
|
|
+# <p class="entry-title">...(preamble)...</p>
|
|
|
+# <p>Notice of Planning Application – DA 2025/28 15 Kurrajong Street,
|
|
|
+# Grassy, TAS 7256 – Visitor/workers' Accommodation.</p>
|
|
|
+# <p>...representations no later than 2 April 2026...</p>
|
|
|
+# <p><a href="https://kingisland.tas.gov.au/wp-content/uploads/DA-2025-28-...pdf">here</a></p>
|
|
|
|
|
|
+require "date"
|
|
|
require "nokogiri"
|
|
|
-require_relative "../lib/http"
|
|
|
+require "net/http"
|
|
|
+require "uri"
|
|
|
+
|
|
|
+require_relative "../lib/db"
|
|
|
+require_relative "../lib/enrich"
|
|
|
+require_relative "../lib/log"
|
|
|
require_relative "../lib/util"
|
|
|
-require_relative "../lib/scraper_helpers"
|
|
|
|
|
|
-TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_georgetown
|
|
|
-URL = "https://kingisland.tas.gov.au/develop/planning/"
|
|
|
# Destination table name. ENV.fetch raises KeyError when TABLE_NAME is unset,
# so a misconfigured run fails before any network or DB work happens.
TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_kingisland

# Site origin (no trailing slash) and the planning page that lists the DAs.
BASE_URL = "https://kingisland.tas.gov.au"
URL = "#{BASE_URL}/develop/planning/"

DB.ensure_table!(TABLE)

# ----- Browser-like headers (WAF warmup technique from burnie.rb) -----

# Desktop Chrome 124 user-agent string; the two fragments are joined by a
# single space.
UA = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
  "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
].join(" ")

# Request headers sent on every fetch. http_get overrides "Sec-Fetch-Site"
# per call and adds "Referer"/"Cookie" when it has values for them.
BASE_HEADERS = {
  # Identity and content negotiation.
  "User-Agent" => UA,
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.8",
  # "identity" asks for an uncompressed body so no gzip handling is needed.
  "Accept-Encoding" => "identity",
  # Navigation hints a browser sends for a top-level page load.
  "Upgrade-Insecure-Requests" => "1",
  "Sec-Fetch-Dest" => "document",
  "Sec-Fetch-Mode" => "navigate",
  "Sec-Fetch-Site" => "none",
  "Sec-Fetch-User" => "?1",
  # Client hints consistent with the Chrome 124 / Windows user agent above.
  "sec-ch-ua" => '"Chromium";v="124", "Not.A/Brand";v="24", "Google Chrome";v="124"',
  "sec-ch-ua-platform" => '"Windows"',
  "sec-ch-ua-mobile" => "?0",
  "Connection" => "close",
}.freeze
|
|
|
+
|
|
|
# Minimal per-host cookie store used for the homepage warmup.
#
# Only the leading name=value pair of each Set-Cookie header is kept; cookie
# attributes (Path, Expires, Secure, ...) are ignored. Host names are matched
# case-insensitively, so a redirect to a differently-cased host still finds
# the cookies collected earlier.
class CookieJar
  def initialize
    # { "host" => { "cookie-name" => "cookie-value" } }
    @cookies = {}
  end

  # Builds the Cookie request-header value for +host+.
  #
  # @param host [String, nil] host name of the request being built
  # @return [String] "name=value; name2=value2", or "" when nothing is stored
  def for(host)
    stored = @cookies[host_key(host)]
    return "" unless stored

    stored.map { |name, value| "#{name}=#{value}" }.join("; ")
  end

  # Records every Set-Cookie header carried by +resp+ against +host+.
  # A cookie whose name is already stored for the host replaces the old value;
  # headers without a usable name are skipped.
  #
  # @param resp [#get_fields] response object (Net::HTTPResponse-like)
  # @param host [String, nil] host the response came from
  # @return [nil]
  def merge_from(resp, host)
    Array(resp.get_fields("Set-Cookie")).each do |header|
      # Drop attributes after the first ";", then split on the first "=" only
      # so values that themselves contain "=" stay intact.
      name, value = header.to_s.split(";", 2).first.to_s.split("=", 2)
      name = name.to_s.strip
      next if name.empty?

      (@cookies[host_key(host)] ||= {})[name] = value.to_s.strip
    end
    nil
  end

  private

  # Normalises a host name for use as a store key.
  def host_key(host)
    host.to_s.downcase
  end
end
|
|
|
+
|
|
|
# Performs a GET with browser-like headers, following up to five hops.
#
# Each hop rebuilds the headers so cookies collected from a redirect response
# (stored in +jar+) are sent on the next request. The redirect is followed
# from outside the Net::HTTP.start block: a `next` inside that block would
# only leave the block, not continue the hop loop.
#
# @param url [String] absolute URL to fetch
# @param jar [#for, #merge_from] cookie store consulted and updated per hop
# @param referer [String, nil] value for the Referer header, if any
# @param fetch_site [String] value for the Sec-Fetch-Site header
# @return [Array(Integer, String)] status code and body. When the hop limit is
#   exhausted the last redirect status is returned with an empty body; on any
#   StandardError the failure is logged and [0, ""] is returned.
def http_get(url, jar:, referer: nil, fetch_site: "none")
  uri = URI(url)
  code = 0
  body = ""

  5.times do
    hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
    hdrs["Referer"] = referer if referer
    cookie = jar.for(uri.host)
    hdrs["Cookie"] = cookie unless cookie.empty?

    # Net::HTTP.start returns the block's value, i.e. the response (its body
    # has already been read by #request when no block is given to it).
    resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
      http.request(Net::HTTP::Get.new(uri, hdrs))
    end

    jar.merge_from(resp, uri.host)
    code = resp.code.to_i
    location = resp["location"]

    unless [301, 302, 303, 307, 308].include?(code) && location
      body = resp.body.to_s
      break
    end

    # Location may be relative; resolve it against the current hop.
    uri = URI.join(uri, location)
  end

  [code, body]
rescue StandardError => e
  Log.warn "kingisland", "HTTP error for #{url}: #{e.class} #{e.message}"
  [0, ""]
end
|
|
|
+
|
|
|
# ----- Warmup: hit homepage first to get cookies, then fetch planning page -----
jar = CookieJar.new
homepage = "#{BASE_URL}/"

Log.info "kingisland", "Warming up via homepage..."
code0, = http_get(homepage, jar: jar) # body is not needed, only the cookies
Log.info "kingisland", "Homepage: #{code0}"

# Short pause between the warmup request and the real one.
sleep(0.5)

Log.info "kingisland", "Fetching planning page..."
code1, html = http_get(URL, jar: jar, referer: homepage, fetch_site: "same-origin")
Log.info "kingisland", "Planning page: #{code1} (#{html.bytesize} bytes)"

# Appended to every fetch-failure warning so the operator knows the fallback.
fallback_hint = "King Island DAs are also available via planbuild.rb (council code KIS -> da_kingisland)."

# A non-200 status or a body under 5,000 bytes is treated as a failed fetch;
# otherwise the body is checked for the two challenge-page marker strings.
failure =
  if code1 != 200 || html.bytesize < 5_000
    "Could not fetch planning page (status #{code1}). "
  elsif html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
    "Cloudflare challenge returned. "
  end

if failure
  Log.warn "kingisland", failure + fallback_hint
  # Exit status 0: a blocked fetch is logged but not reported as a failure.
  exit 0
end
|
|
|
+
|
|
|
# ----- Parse -----
# Ref format: DA 2025/28 (year/sequential)
REF_RX = /\bDA\s*\d{4}\/\d{1,4}\b/i

doc = Nokogiri::HTML(html)

# The advertised applications are inside div#accordion-1-c4.
# If the div id ever changes, fall back to the parent of the first <h2> whose
# text contains "advertised development" (case-insensitive via translate()).
#
# at_xpath is required here: xpath returns a NodeSet, which is never nil and
# has no #parent, so `xpath(...)&.parent` raises NoMethodError instead of
# yielding the heading's parent (or nil when there is no such heading).
heading_xpath = '//h2[contains(translate(., "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "advertised development")]'
section = doc.at_css("div#accordion-1-c4") || doc.at_xpath(heading_xpath)&.parent

unless section
  Log.warn "kingisland", "Could not find 'Advertised development applications' section on page."
  puts "Done #{TABLE}. Saved 0 item(s)."
  exit 0
end
|
|
|
+
|
|
|
# Walk every <p> in the section. A paragraph containing a DA reference starts
# a notice; the paragraphs that follow it supply the closing date and the PDF.
paragraphs = section.css("p").to_a
saved = 0

paragraphs.each_with_index do |para, idx|
  text = para.text.gsub(/[[:space:]]+/, " ").strip
  next unless (m = text.match(REF_RX))

  ref = m[0].strip.gsub(/\s+/, " ")

  # Strip any "Notice of Planning Application" prefix and the ref itself,
  # leaving "ADDRESS – DESCRIPTION."
  rest = text
    .sub(/Notice\s+of\s+Planning\s+Application\s*[-\u2013\u2014]?\s*/i, "")
    .sub(ref, "")
    .gsub(/\A[\s\-\u2013\u2014]+/, "")
    .gsub(/[.\s]+\z/, "")

  # Split at last " – " (en-dash) or " - " to separate address from description
  if (split_idx = rest.rindex(/\s[\-\u2013\u2014]\s/))
    address = rest[0, split_idx].strip
    description = rest[(split_idx + 1)..]&.gsub(/\A[\s\-\u2013\u2014]+/, "")&.strip
  else
    address = rest.strip
    description = "Development Application"
  end

  next if address.empty?

  # Scan forward up to 5 paragraphs for closing date and PDF link
  on_notice_to_raw = ""
  on_notice_to = nil
  doc_url = nil

  paragraphs[(idx + 1)..].to_a.first(5).each do |fwd|
    fwd_text = fwd.text.gsub(/[[:space:]]+/, " ").strip

    # Stop at the next application's notice so its date/PDF are not attached
    # to this one. A follow-up paragraph that merely repeats this notice's
    # own reference does not end the scan.
    other = fwd_text.match(REF_RX)
    break if other && other[0].gsub(/\s+/, " ").casecmp(ref) != 0

    if on_notice_to_raw.empty? && fwd_text =~ /no\s+later\s+than|representations|closing/i
      if (dm = fwd_text.match(/\b(\d{1,2})\s+([A-Za-z]{3,})\s+(\d{4})\b/))
        on_notice_to_raw = "#{dm[1]} #{dm[2]} #{dm[3]}"
        on_notice_to = Util.parse_aus_date(on_notice_to_raw)
      end
    end

    next unless doc_url.nil?

    # Consider every link in the paragraph, not just the first one.
    href = fwd.css("a[href]").map { |a| a["href"].to_s.strip }.find { |h| h =~ /\.pdf/i }
    next unless href

    # Resolve relative links against the planning page; keep the raw href if
    # it is not a parseable URI.
    doc_url =
      begin
        URI.join(URL, href).to_s
      rescue URI::Error
        href
      end
  end

  begin
    DB.upsert(TABLE, {
      council_reference: ref,
      address: address[0, 255],
      description: description.to_s,
      date_received: nil,
      date_received_raw: "",
      on_notice_to: on_notice_to,
      on_notice_to_raw: on_notice_to_raw,
      document_url: doc_url,
      applicant: "",
      owner: ""
    })

    enrich_after_upsert!(
      table: TABLE,
      council_reference: ref,
      address: address
    )

    Log.info "kingisland", "Upserted #{ref} -> #{address}"
    saved += 1
  rescue StandardError => e
    # One bad row is logged and skipped so the remaining notices still save.
    Log.warn "kingisland", "DB error for #{ref}: #{e.class} #{e.message}"
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|