| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- # King Island Council — Advertised Development Applications
- #
- # Source: https://kingisland.tas.gov.au/develop/planning/
- #
- # The site returns HTTP 403 on direct requests but succeeds after a homepage
- # warmup using browser-like headers (same technique as burnie.rb).
- # Accept-Encoding: identity is used to avoid gzip decompression complexity.
- #
- # Page structure (WordPress accordion, id="accordion-1-c4"):
- # <h2>Advertised development applications</h2>
- # <p class="entry-title">...(preamble)...</p>
- # <p>Notice of Planning Application – DA 2025/28 15 Kurrajong Street,
- # Grassy, TAS 7256 – Visitor/workers' Accommodation.</p>
- # <p>...representations no later than 2 April 2026...</p>
- # <p><a href="https://kingisland.tas.gov.au/wp-content/uploads/DA-2025-28-...pdf">here</a></p>
- require "date"
- require "nokogiri"
- require "net/http"
- require "uri"
- require_relative "../lib/db"
- require_relative "../lib/enrich"
- require_relative "../lib/log"
- require_relative "../lib/util"
# Destination table name. run_all.sh exports TABLE_NAME derived from this
# script's filename (da_kingisland); ENV.fetch raises KeyError if it is unset.
TABLE = ENV.fetch("TABLE_NAME")

# Council site root, and the planning page that lists advertised applications.
BASE_URL = "https://kingisland.tas.gov.au"
URL = BASE_URL + "/develop/planning/"

# Make sure the destination table is in place before scraping
# (DB.ensure_table! comes from ../lib/db).
DB.ensure_table!(TABLE)
# ----- Browser-like headers (WAF warmup technique from burnie.rb) -----

# Desktop Chrome 124 user-agent string sent with every request.
UA = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
  "AppleWebKit/537.36",
  "(KHTML, like Gecko)",
  "Chrome/124.0.0.0",
  "Safari/537.36",
].join(" ")

# Request headers shared by every fetch. http_get overrides "Sec-Fetch-Site"
# per call and adds "Referer" / "Cookie" when it has them. The hash is frozen,
# so callers must merge into a copy rather than mutate it.
BASE_HEADERS = {
  # Content negotiation. "identity" asks for an uncompressed body.
  "User-Agent" => UA,
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.8",
  "Accept-Encoding" => "identity",
  "Upgrade-Insecure-Requests" => "1",
}.merge(
  # Fetch metadata describing a top-level, user-initiated navigation.
  "Sec-Fetch-Dest" => "document",
  "Sec-Fetch-Mode" => "navigate",
  "Sec-Fetch-Site" => "none",
  "Sec-Fetch-User" => "?1",
).merge(
  # Client hints matching the Chrome 124 user-agent above.
  "sec-ch-ua" => '"Chromium";v="124", "Not.A/Brand";v="24", "Google Chrome";v="124"',
  "sec-ch-ua-platform" => '"Windows"',
  "sec-ch-ua-mobile" => "?0",
).merge(
  "Connection" => "close",
).freeze
# Minimal in-memory cookie store keyed by host.
#
# Only the leading name=value pair of each Set-Cookie field is kept; attributes
# such as Path, Expires or Max-Age are ignored, so cookies are never expired
# or scoped by path.
class CookieJar
  def initialize
    # host => { cookie name => cookie value }, in first-seen order.
    @cookies = {}
  end

  # Builds the Cookie request header value for a host.
  #
  # @param host [String] host name the request is going to
  # @return [String] "name=value; name2=value2", or "" when nothing is stored
  def for(host)
    stored = @cookies[host]
    return "" unless stored

    stored.map { |name, value| "#{name}=#{value}" }.join("; ")
  end

  # Records every Set-Cookie field of a response under the given host.
  # A cookie that is already stored keeps its position and gets the new value.
  #
  # @param resp [#get_fields] response object (Net::HTTPResponse in practice)
  # @param host [String] host name the response came from
  # @return [void]
  def merge_from(resp, host)
    Array(resp.get_fields("Set-Cookie")).each do |field|
      # An empty field splits to [], so `first` is nil; skip it instead of
      # raising NoMethodError on nil.
      pair = field.to_s.split(";", 2).first
      next if pair.nil?

      name, value = pair.split("=", 2)
      name = name.to_s.strip
      next if name.empty?

      (@cookies[host] ||= {})[name] = value.to_s.strip
    end
    nil
  end
end
# Performs a GET with browser-like headers, following up to five hops.
#
# Cookies from every response (including redirects) are stored in +jar+, and
# the Cookie header is rebuilt for each hop from the jar entry of the host
# actually being contacted.
#
# @param url [String] absolute URL to fetch
# @param jar [#for, #merge_from] cookie store shared across requests
# @param referer [String, nil] value for the Referer header, omitted when nil
# @param fetch_site [String] value for the Sec-Fetch-Site header
# @return [Array(Integer, String)] status code and body; the last redirect
#   status with an empty body if the hop limit is exhausted; [0, ""] when any
#   StandardError is raised (the error is logged, not re-raised)
def http_get(url, jar:, referer: nil, fetch_site: "none")
  uri = URI(url)
  redirect_codes = [301, 302, 303, 307, 308]
  code = 0
  body = ""

  5.times do
    hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
    hdrs["Referer"] = referer if referer
    cookie = jar.for(uri.host)
    hdrs["Cookie"] = cookie unless cookie.empty?

    # The block's value is the response. The redirect decision is made out
    # here because `next`/`break` inside the block would only leave the block,
    # not this loop.
    resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
      http.request(Net::HTTP::Get.new(uri, hdrs))
    end

    jar.merge_from(resp, uri.host)
    code = resp.code.to_i
    location = resp["location"]

    unless redirect_codes.include?(code) && location
      body = resp.body.to_s
      break
    end

    # Location may be relative; resolve it against the URL just requested.
    uri = URI.join(uri, location)
  end

  [code, body]
rescue StandardError => e
  Log.warn "kingisland", "HTTP error for #{url}: #{e.class} #{e.message}"
  [0, ""]
end
# ----- Warmup: hit homepage first to get cookies, then fetch planning page -----
jar = CookieJar.new

# Appended to every give-up warning so the log points at the alternative source.
fallback_note = "King Island DAs are also available via planbuild.rb (council code KIS -> da_kingisland)."

# Logs why the scrape is being abandoned, then ends the run with status 0 so
# a blocked fetch is not reported as a script failure.
give_up = lambda do |reason|
  Log.warn "kingisland", "#{reason} #{fallback_note}"
  exit 0
end

Log.info "kingisland", "Warming up via homepage..."
home_status, _home_body = http_get("#{BASE_URL}/", jar: jar)
Log.info "kingisland", "Homepage: #{home_status}"

# Short pause between the warmup request and the real one.
sleep(0.5)

Log.info "kingisland", "Fetching planning page..."
page_status, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
Log.info "kingisland", "Planning page: #{page_status} (#{html.bytesize} bytes)"

# Anything other than a 200 with at least 5,000 bytes is treated as a failed fetch.
fetched_ok = page_status == 200 && html.bytesize >= 5_000
give_up.call("Could not fetch planning page (status #{page_status}).") unless fetched_ok

# A 200 can still be an interstitial challenge page rather than real content.
challenge_markers = ["Just a moment", "Enable JavaScript and cookies"]
give_up.call("Cloudflare challenge returned.") if challenge_markers.any? { |marker| html.include?(marker) }
# ----- Parse -----
# Ref format: DA 2025/28 (year/sequential)
REF_RX = /\bDA\s*\d{4}\/\d{1,4}\b/i
doc = Nokogiri::HTML(html)

# The advertised applications are inside div#accordion-1-c4.
# If the div id ever changes, fall back to the parent of the first <h2> whose
# text contains "advertised development" (case-insensitive via translate()).
# at_xpath returns a single node or nil, so `&.parent` is safe; xpath would
# return a NodeSet, which is never nil and does not respond to `parent`.
section = doc.at_css("div#accordion-1-c4") ||
          doc.at_xpath('//h2[contains(translate(., "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "advertised development")]')&.parent

unless section
  Log.warn "kingisland", "Could not find 'Advertised development applications' section on page."
  puts "Done #{TABLE}. Saved 0 item(s)."
  exit 0
end
# Walk every <p> in the section. A paragraph containing a DA reference is the
# notice for one application; its closing date and PDF link are expected in
# the paragraphs that immediately follow it.
paragraphs = section.css("p").to_a
saved = 0

paragraphs.each_with_index do |para, idx|
  text = para.text.gsub(/[[:space:]]+/, " ").strip
  next unless (m = text.match(REF_RX))

  ref = m[0].strip.gsub(/\s+/, " ")
  # Whitespace- and case-insensitive form, used to recognise this same
  # reference when it is repeated in a follow-up paragraph.
  ref_key = ref.gsub(/\s+/, "").upcase

  # Strip any "Notice of Planning Application" prefix and the ref itself,
  # leaving "ADDRESS – DESCRIPTION."
  rest = text
         .sub(/Notice\s+of\s+Planning\s+Application\s*[-\u2013\u2014]?\s*/i, "")
         .sub(ref, "")
         .gsub(/\A[\s\-\u2013\u2014]+/, "")
         .gsub(/[.\s]+\z/, "")

  # Split at last " – " (en-dash) or " - " to separate address from description
  if (split_idx = rest.rindex(/\s[\-\u2013\u2014]\s/))
    address = rest[0, split_idx].strip
    description = rest[(split_idx + 1)..]&.gsub(/\A[\s\-\u2013\u2014]+/, "")&.strip
  else
    address = rest.strip
    description = "Development Application"
  end
  next if address.empty?

  # Scan forward up to 5 paragraphs for closing date and PDF link
  on_notice_to_raw = ""
  on_notice_to = nil
  doc_url = nil

  (paragraphs[idx + 1, 5] || []).each do |fwd|
    fwd_text = fwd.text.gsub(/[[:space:]]+/, " ").strip

    # A paragraph naming a different reference starts the next application.
    # Stop there so its closing date / PDF are not attributed to this one.
    other_ref = fwd_text[REF_RX]
    break if other_ref && other_ref.gsub(/\s+/, "").upcase != ref_key

    if on_notice_to_raw.empty? && fwd_text.match?(/no\s+later\s+than|representations|closing/i)
      if (dm = fwd_text.match(/\b(\d{1,2})\s+([A-Za-z]{3,})\s+(\d{4})\b/))
        on_notice_to_raw = "#{dm[1]} #{dm[2]} #{dm[3]}"
        on_notice_to = Util.parse_aus_date(on_notice_to_raw)
      end
    end

    # First PDF link in the paragraph, even when a non-PDF link precedes it.
    doc_url ||= fwd.css("a[href]")
                   .map { |a| a["href"].to_s.strip }
                   .find { |href| href.match?(/\.pdf/i) }
  end

  begin
    DB.upsert(TABLE, {
      council_reference: ref,
      address: address[0, 255],
      description: description.to_s,
      date_received: nil,
      date_received_raw: "",
      on_notice_to: on_notice_to,
      on_notice_to_raw: on_notice_to_raw,
      document_url: doc_url,
      applicant: "",
      owner: ""
    })
    enrich_after_upsert!(
      table: TABLE,
      council_reference: ref,
      address: address
    )
    Log.info "kingisland", "Upserted #{ref} -> #{address}"
    saved += 1
  rescue StandardError => e
    # One bad row must not abort the whole run; log it and keep going.
    Log.warn "kingisland", "DB error for #{ref}: #{e.class} #{e.message}"
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|