# Burnie City Council — permit applications on exhibition (robust / WAF-aware + PDF download)
#
# Flow:
#   1. Fetch the exhibition list page, retrying with a language variant and a
#      warm-up sequence when the first response does not look like a full page.
#   2. Parse the list anchors (visible DOM first, then the __SEAMLESSVIEWSTATE payload).
#   3. For each application: find the first PDF on the detail page, optionally
#      download it, upsert the record, then update the extra columns.
require "date"
require "nokogiri"
require "cgi"
require "fileutils"
require "net/http"
require "uri"
require "zlib"
require "stringio"
require "base64"
require "securerandom"
require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"

TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_burnie
BASE_URL = "https://www.burnie.tas.gov.au"
URL = "#{BASE_URL}/Development/Planning/Permit-applications-on-exhibition"
URL_EN = "#{URL}?oc_lang=en-AU"
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"

DB.ensure_table!(TABLE)

# ----- HTTP helpers (browser-y headers + cookie jar + gzip/deflate) -----
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "\
     "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
SEC_CH_UA = %q{"Chromium";v="124", "Not.A/Brand";v="24", "Google Chrome";v="124"}
SEC_CH_UA_PLATFORM = %q{"Windows"}
SEC_CH_UA_MOBILE = "?0"

BASE_HEADERS = {
  "User-Agent" => UA,
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.8",
  # Avoid Brotli (Ruby stdlib won't auto-decode it)
  "Accept-Encoding" => "gzip,deflate",
  "Upgrade-Insecure-Requests" => "1",
  "Sec-Fetch-Dest" => "document",
  "Sec-Fetch-Mode" => "navigate",
  "Sec-Fetch-Site" => "none",
  "Sec-Fetch-User" => "?1",
  "sec-ch-ua" => SEC_CH_UA,
  "sec-ch-ua-platform" => SEC_CH_UA_PLATFORM,
  "sec-ch-ua-mobile" => SEC_CH_UA_MOBILE,
  "Pragma" => "no-cache",
  "Cache-Control" => "no-cache",
  "Connection" => "close",
}.freeze

# Very small cookie jar (host -> "k=v; k2=v2" cookie string).
#
# Only the name/value pair of each Set-Cookie header is kept; attributes such
# as Path, Domain, Expires and Max-Age are ignored, so cookies are never
# expired or scoped more narrowly than the host they came from.
class Jar
  def initialize
    @h = {}
  end

  # Returns the Cookie header value stored for +host+, or "" when none is stored.
  def for(host)
    @h[host] || ""
  end

  # Merges every Set-Cookie header of +resp+ into the cookies stored for +host+.
  # A cookie with the same name as a stored one replaces it.
  def merge_from(resp, host)
    cookies = resp.get_fields("Set-Cookie") || []
    return if cookies.empty?

    existing = parse_cookie_header(@h[host])
    cookies.each do |sc|
      # Keep only "name=value"; drop the attribute list after the first ";".
      kv = sc.split(";", 2).first
      k, v = kv.split("=", 2)
      next if k.to_s.empty?

      existing[k] = v.to_s
    end
    @h[host] = existing.map { |k, v| "#{k}=#{v}" }.join("; ")
  end

  # Parses a "k=v; k2=v2" cookie string into a Hash, skipping empty names.
  def parse_cookie_header(s)
    s.to_s.split(";").map(&:strip).map { |kv|
      k, v = kv.split("=", 2)
      [k, v]
    }.select { |k, _| !k.to_s.empty? }.to_h
  end
end

# Decodes a response body according to its Content-Encoding value.
#
# body - raw response body (String, may be nil or empty).
# enc  - Content-Encoding header value (anything responding to to_s).
#
# Returns the decoded body, or +body+ unchanged when the encoding is not
# gzip/deflate or when decoding fails.
def decompress(body, enc)
  return body if body.nil? || body.empty?

  case enc.to_s
  when /gzip/i
    # wrap closes the reader once the block has run.
    Zlib::GzipReader.wrap(StringIO.new(body), &:read)
  when /deflate/i
    inflate_deflate(body)
  else
    body
  end
rescue Zlib::Error
  body
end

# Inflates a "deflate" body. Tries the zlib-wrapped format first, then a raw
# deflate stream (negative window bits = no zlib header), since some servers
# send the latter under the same Content-Encoding. Returns +body+ unchanged
# when neither succeeds.
def inflate_deflate(body)
  Zlib::Inflate.inflate(body)
rescue Zlib::Error
  raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
  begin
    raw.inflate(body)
  rescue Zlib::Error
    body
  ensure
    raw.close
  end
end

# Performs a GET with browser-like headers, following up to 5 hops of
# redirects (301/302/303/307/308) and recording Set-Cookie headers in +jar+.
#
# url        - absolute URL to fetch.
# jar        - Jar used to read and store cookies per host.
# headers    - extra request headers merged over BASE_HEADERS.
# referer    - optional Referer header value.
# site_fetch - value for the Sec-Fetch-Site header.
#
# Returns [code, body, content_encoding, message]. +body+ is decoded with
# decompress. If the hop limit is exhausted while still being redirected, the
# last redirect status is returned with an empty body.
def http_get_with_cookies(url, jar:, headers: {}, referer: nil, site_fetch: "none")
  uri = URI(url)
  hdrs = BASE_HEADERS.merge(headers)
  hdrs["Referer"] = referer if referer
  hdrs["Sec-Fetch-Site"] = site_fetch

  limit = 5
  enc = ""
  msg = ""
  code = 0
  body = ""

  while limit > 0
    limit -= 1
    redirect_to = nil

    # Rebuild the Cookie header on every hop: cookies set by a redirect
    # response are merged into the jar below and must be sent on the follow-up
    # request, and cookies stored for one host must not be sent to another.
    req_hdrs = hdrs.dup
    cookie = jar.for(uri.host)
    req_hdrs["Cookie"] = cookie unless cookie.empty?
    req = Net::HTTP::Get.new(uri, req_hdrs)

    Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
      resp = http.request(req)
      jar.merge_from(resp, uri.host)
      enc = resp["content-encoding"].to_s
      msg = resp.message
      code = resp.code.to_i

      if [301, 302, 303, 307, 308].include?(code) && resp["location"]
        # Flag the redirect so the while loop can retry; `next` here would only
        # exit the Net::HTTP.start block, not the while loop.
        redirect_to = URI.join(uri, resp["location"])
      else
        # Accept-Encoding is always set explicitly by this script, so the body
        # is decoded here based on the Content-Encoding the server reported.
        body = decompress(resp.body.to_s, enc)
      end
    end

    if redirect_to
      uri = redirect_to
      next
    end
    break
  end

  [code, body, enc, msg]
end

# Sleeps for a random 0.4–1.0 seconds between requests.
def short_sleep
  sleep(0.4 + rand * 0.6)
end

# ----- Burnie-specific parsing helpers -----

# Matches references such as "DA 2024/12" or "DA2024 / 12-A" (years 20xx only).
REF_RX = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i

# Returns the reference found in +text+ formatted as "DA <year> / <number>",
# or nil when REF_RX does not match.
def extract_ref(text)
  if (m = text.to_s.match(REF_RX))
    "DA #{m[1]} / #{m[2]}"
  end
end

# Returns a normalized council reference, or "" when none is found.
# Falls back to a looser pattern (any 1xxx/2xxx year) when extract_ref fails,
# normalizing only the spacing around the slash.
def normalize_ref(text)
  extract_ref(text) ||
    text.to_s[/\bDA\s*[12]\d{3}\s*\/\s*[A-Za-z0-9\-_.]+\b/i].to_s.gsub(/\s*\/\s*/, " / ").strip
end

# Returns the first date-looking substring of +text+: "12 March 2024" style
# first, then "12/03/2024" style. Returns "" when neither is present.
def extract_on_notice_date(text)
  s = text.to_s.gsub(/\s+/, " ")
  if (m = s.match(/\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b/))
    m[0]
  elsif (m = s.match(/\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/))
    m[0]
  else
    ""
  end
end

# Fetches a detail page and returns the absolute URL of its first PDF link.
#
# detail_url - absolute URL of the application detail page.
# jar        - Jar shared with the list-page requests.
#
# Returns "" when the page is not a 200, no PDF link is found, or any error
# occurs (errors are logged as warnings, never raised).
def first_pdf_on_detail(detail_url, jar)
  code, html, _enc, _msg = http_get_with_cookies(
    detail_url, jar: jar, site_fetch: "same-origin", referer: URL_EN
  )
  return "" unless code == 200

  doc = Nokogiri::HTML(html)
  # Prefer explicit doc buttons if present
  a = doc.at_css(".hyperlink-button-container a.ext-pdf") ||
      doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")
  return "" unless a

  href = a["href"].to_s.strip
  # Percent-encode everything outside printable ASCII (spaces, control
  # characters, non-ASCII such as an en-dash in a filename) so URI.join doesn't
  # raise URI::InvalidURIError. Printable ASCII, including existing "%XX"
  # escapes, is left as-is.
  href = href.gsub(/[^\x21-\x7E]/) { |c| URI::DEFAULT_PARSER.escape(c) }
  URI.join(detail_url, href).to_s
rescue StandardError => e
  Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
  ""
end

# Decodes the hidden #__SEAMLESSVIEWSTATE field of +doc+ into a Nokogiri
# document. The value is Base64-decoded and gunzipped; if it is not gzip data
# the Base64-decoded bytes are parsed as-is.
#
# Returns nil when the field is missing/empty or decoding raises.
def decode_seamless_viewstate(doc)
  b64 = doc.at_css("#__SEAMLESSVIEWSTATE")&.[]("value").to_s
  return nil if b64.empty?

  raw = Base64.decode64(b64)
  html = begin
    Zlib::GzipReader.wrap(StringIO.new(raw), &:read)
  rescue Zlib::Error
    raw
  end
  Nokogiri::HTML(html)
rescue StandardError => e
  Log.warn "scraper", "Failed to decode __SEAMLESSVIEWSTATE: #{e.class} #{e.message}"
  nil
end

# Replaces every run of characters other than word characters, "." and "-"
# with "_" and truncates the result to 180 characters.
def sanitize_filename(s)
  s.to_s.gsub(/[^\w.\-]+/, "_")[0, 180]
end

# Downloads +document_url+ into DOWNLOAD_DIR/TABLE, naming the file
# "<reference>__<basename>". Does nothing when the URL is blank or
# DOWNLOAD_ATTACHMENTS is off. Failures are logged as warnings, never raised.
#
# document_url      - absolute URL of the PDF.
# council_reference - reference used as the filename prefix.
# jar               - Jar shared with the other requests.
# referer           - Referer header to send (the detail page URL).
def save_pdf(document_url, council_reference, jar, referer:)
  return if document_url.to_s.strip.empty?
  return unless DOWNLOAD_ATTACHMENTS

  # Decide filename; fall back to a generic name only for unparseable URLs.
  url_path = begin
    URI.parse(document_url).path
  rescue URI::InvalidURIError
    "/document.pdf"
  end
  base_name = File.basename(url_path)
  safe_base = sanitize_filename(base_name)

  # Prefix with reference for uniqueness & traceability
  prefix = sanitize_filename(council_reference.to_s.gsub(" / ", "-"))
  file_name = "#{prefix}__#{safe_base}"

  out_dir = File.join(DOWNLOAD_DIR, TABLE)
  out_path = File.join(out_dir, file_name)
  FileUtils.mkdir_p(out_dir)

  code, data, _enc, msg = http_get_with_cookies(
    document_url,
    jar: jar,
    headers: {
      # Ask for PDF explicitly
      "Accept" => "application/pdf,*/*;q=0.8",
      "Accept-Encoding" => "identity" # avoid gzip'd binary when possible
    },
    referer: referer,
    site_fetch: "same-origin"
  )

  if code == 200 && data && data.bytesize > 0
    File.open(out_path, "wb") { |f| f.write(data) }
    puts "Saved PDF to #{out_path} (#{data.bytesize} bytes)"
  else
    Log.warn "scraper", "PDF fetch failed (#{code} #{msg}) for #{document_url}"
  end
rescue StandardError => e
  Log.warn "scraper", "PDF save error for #{document_url}: #{e.class} #{e.message}"
end

# ----- Warm-up sequence to appease WAF -----
jar = Jar.new

# 1) Direct try
code1, body1, enc1, msg1 = http_get_with_cookies(URL, jar: jar)
puts "List fetch #1: status=#{code1} #{msg1}, enc=#{enc1}, bytes=#{body1.to_s.bytesize}"

# A response is accepted only when it is a 200 with more than 5,000 bytes.
html = nil
if code1 == 200 && body1.bytesize > 5_000
  html = body1
else
  short_sleep
  # 2) Language variant (often works)
  code2, body2, enc2, msg2 = http_get_with_cookies(
    URL_EN, jar: jar, site_fetch: "same-origin", referer: "#{BASE_URL}/"
  )
  puts "List fetch #2: status=#{code2} #{msg2}, enc=#{enc2}, bytes=#{body2.to_s.bytesize} (#{URL_EN})"

  if code2 == 200 && body2.bytesize > 5_000
    html = body2
  else
    # 3) Warm up by hitting Home (sets benign cookies), then websitesettings.js, then retry
    short_sleep
    h_code, _h_body, _h_enc, h_msg = http_get_with_cookies("#{BASE_URL}/Home", jar: jar, site_fetch: "none")
    puts "Warmup Home: status=#{h_code} #{h_msg}"

    short_sleep
    oc_api = "#{BASE_URL}/ocapi/0ff2db3d-0235-40e2-b373-42294eee3a55/en-AU/websitesettings.js"
    w_code, _w_body, _w_enc, w_msg = http_get_with_cookies(oc_api, jar: jar, site_fetch: "same-origin", referer: "#{BASE_URL}/Home")
    puts "Warmup websitesettings.js: status=#{w_code} #{w_msg}"

    short_sleep
    code3, body3, enc3, msg3 = http_get_with_cookies(URL_EN, jar: jar, site_fetch: "same-origin", referer: "#{BASE_URL}/Home")
    puts "List fetch #3: status=#{code3} #{msg3}, enc=#{enc3}, bytes=#{body3.to_s.bytesize} (retry with referer)"
    html = body3 if code3 == 200 && body3.bytesize > 5_000
  end
end

# Fall back to whatever we got first if nothing passed threshold
html ||= body1
puts "Fetched list page (#{html.to_s.bytesize} bytes)"

list_doc = Nokogiri::HTML(html)

# Try visible DOM first
nodes = list_doc.css(".list-container.da-list-container .list-item-container a[href]")
puts "Primary selector found #{nodes.length} anchors"

# If nothing, decode Seamless payload and try again
if nodes.empty?
  sv_doc = decode_seamless_viewstate(list_doc)
  if sv_doc
    nodes = sv_doc.css(".list-container.da-list-container .list-item-container a[href]")
    puts "Seamless ViewState selector found #{nodes.length} anchors"
    if nodes.empty?
      # Last resort: first link inside every list item container.
      nodes = sv_doc.css(".list-item-container").map { |c| c.at_css("a[href]") }.compact
      puts "Seamless final fallback found #{nodes.length} anchors"
    end
  else
    puts "__SEAMLESSVIEWSTATE not found or could not be decoded"
  end
end

puts "Found #{nodes.length} application(s) for #{TABLE}"

saved = 0
nodes.each do |a|
  detail_url = URI.join(URL, a["href"].to_s).to_s

  ref_text = a.at_css(".da-application-number")&.text.to_s
  council_reference = normalize_ref(ref_text)
  address = a.at_css(".list-item-address")&.text.to_s.strip

  # Closing date: prefer the dedicated element, otherwise scan the whole anchor text.
  closing_text = a.at_css(".display-until-date")&.text.to_s
  on_notice_to_raw =
    if closing_text.empty?
      extract_on_notice_date(a.text)
    else
      extract_on_notice_date(closing_text.sub(/^On display until\s*/i, ""))
    end
  on_notice_to = Util.parse_aus_date(on_notice_to_raw)
  # date_received is derived as 14 days before the on-notice closing date.
  date_received = on_notice_to ? (on_notice_to - 14) : nil

  # First <p> that isn't a helper class = description
  desc_p = a.css("p").find { |p|
    cls = p["class"].to_s
    cls.empty? || !(cls =~ /(da-application-number|list-item-address|display-until)/)
  }
  description = desc_p&.text.to_s.strip
  description = "Development Application" if description.empty?

  next if address.empty? || council_reference.empty?

  document_url = first_pdf_on_detail(detail_url, jar)

  # Download the PDF if requested
  save_pdf(document_url, council_reference, jar, referer: detail_url) if DOWNLOAD_ATTACHMENTS

  DB.upsert(TABLE, {
    description: description,
    date_received: date_received,
    date_received_raw: on_notice_to_raw, # keep the raw on-notice text
    on_notice_to: on_notice_to,          # store close/on-notice date here
    on_notice_to_raw: on_notice_to_raw,
    address: address,
    council_reference: council_reference,
    applicant: "",
    owner: ""
  })

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )

  # Best-effort update of the extra columns; a failure here is logged and the
  # record still counts as saved.
  begin
    upd = DB.client.prepare(
      "UPDATE `#{DB.client.escape(TABLE)}` " \
      "SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? " \
      "WHERE council_reference = ? AND address = ?"
    )
    title_reference = a.at_css(".list-item-title")&.text&.strip.to_s
    upd.execute(document_url, on_notice_to, on_notice_to_raw, title_reference, council_reference, address)
  rescue StandardError => e
    Log.warn "scraper", "Extra fields update skipped for #{council_reference}: #{e.class} #{e.message}"
  end

  puts "Upserted #{council_reference} -> #{address}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."