# benjamin.harris / tas_councils
#
# Burnie City Council — permit applications on exhibition (robust / WAF-aware + PDF download)

require "date"
require "nokogiri"
require "cgi"
require "fileutils"
require "net/http"
require "uri"
require "zlib"
require "stringio"
require "base64"
require "securerandom"


require_relative "../lib/enrich"
require_relative "../lib/log"
# Target table name. ENV.fetch raises KeyError when TABLE_NAME is unset, so the
# script refuses to run without it.
TABLE    = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_burnie
# Site root; every other URL in this script is built from it.
BASE_URL = "https://www.burnie.tas.gov.au"
# Listing page of permit applications currently on exhibition.
URL      = "#{BASE_URL}/Development/Planning/Permit-applications-on-exhibition"
# Same listing page with the language query parameter; used as the second
# fetch attempt and as the Referer for detail-page requests.
URL_EN   = "#{URL}?oc_lang=en-AU"

# PDF download is opt-in: only the exact string "1" enables it.
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
# Root directory for downloaded PDFs; save_pdf appends TABLE as a subdirectory.
DOWNLOAD_DIR         = ENV["DOWNLOAD_DIR"] || "/app/downloads"

# NOTE(review): DB is not defined in this file — presumably loaded via
# ../lib/enrich; confirm. Called before any network traffic.
DB.ensure_table!(TABLE)

# ----- HTTP helpers (browser-y headers + cookie jar + gzip/deflate) -----
# Desktop Chrome 124 on Windows user-agent string sent with every request.
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "\
     "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"

# Client-hint header values; the versions are kept in step with the Chrome
# version named in UA.
SEC_CH_UA          = %q{"Chromium";v="124", "Not.A/Brand";v="24", "Google Chrome";v="124"}
SEC_CH_UA_PLATFORM = %q{"Windows"}
SEC_CH_UA_MOBILE   = "?0"

# Default request headers. http_get_with_cookies merges per-call overrides on
# top of this frozen hash and always overwrites "Sec-Fetch-Site".
BASE_HEADERS = {
  "User-Agent"                => UA,
  "Accept"                    => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language"           => "en-AU,en;q=0.8",
  # Avoid Brotli (Ruby stdlib won't auto-decode it)
  "Accept-Encoding"           => "gzip,deflate",
  "Upgrade-Insecure-Requests" => "1",
  "Sec-Fetch-Dest"            => "document",
  "Sec-Fetch-Mode"            => "navigate",
  "Sec-Fetch-Site"            => "none",
  "Sec-Fetch-User"            => "?1",
  "sec-ch-ua"                 => SEC_CH_UA,
  "sec-ch-ua-platform"        => SEC_CH_UA_PLATFORM,
  "sec-ch-ua-mobile"          => SEC_CH_UA_MOBILE,
  "Pragma"                    => "no-cache",
  "Cache-Control"             => "no-cache",
  # One connection per request; http_get_with_cookies opens a fresh
  # Net::HTTP session for every call.
  "Connection"                => "close",
}.freeze

# Very small cookie jar (domain -> cookie string)
# Minimal per-host cookie store.
#
# Keeps one ready-to-send "name=value; name2=value2" Cookie header string per
# host. Cookie attributes (Path, Expires, Max-Age, Secure, ...) are ignored, so
# cookies are never expired or scoped by path.
class Jar
  def initialize
    @h = {}
  end

  # Returns the Cookie header value stored for +host+, or "" when none is known.
  def for(host)
    @h[host] || ""
  end

  # Folds every Set-Cookie header of +resp+ into the cookies stored for +host+.
  #
  # +resp+ only needs to respond to get_fields("Set-Cookie") (as
  # Net::HTTPResponse does). A cookie whose name is already stored is
  # overwritten in place; blank or nameless Set-Cookie values are skipped
  # instead of raising.
  def merge_from(resp, host)
    cookies = resp.get_fields("Set-Cookie") || []
    return if cookies.empty?

    stored = parse_cookie_header(@h[host])
    cookies.each do |set_cookie|
      # Only the leading "name=value" pair matters; attributes after the first
      # ";" are dropped. to_s guards against a blank header value, for which
      # split returns an empty array.
      pair = set_cookie.to_s.split(";", 2).first.to_s.strip
      name, value = pair.split("=", 2)
      next if name.to_s.empty?

      stored[name] = value.to_s
    end
    @h[host] = stored.map { |name, value| "#{name}=#{value}" }.join("; ")
  end

  # Parses a Cookie header string into a name => value Hash.
  # Empty segments are dropped; a segment without "=" maps to nil.
  def parse_cookie_header(s)
    pairs = s.to_s.split(";").map do |segment|
      name, value = segment.strip.split("=", 2)
      [name, value]
    end
    pairs.reject { |name, _| name.to_s.empty? }.to_h
  end
end

# Decodes an HTTP response body according to its Content-Encoding value.
#
# body - String (or nil) as received from the server.
# enc  - Content-Encoding header value; matched case-insensitively.
#
# Returns the decoded body. nil/empty bodies, unknown encodings and bodies that
# fail to decode are returned unchanged (best effort, never raises Zlib errors).
def decompress(body, enc)
  return body if body.nil? || body.empty?

  case enc.to_s
  when /gzip/i
    Zlib::GzipReader.new(StringIO.new(body)).read
  when /deflate/i
    begin
      # Standard form: DEFLATE data wrapped in a zlib header and checksum.
      Zlib::Inflate.inflate(body)
    rescue Zlib::Error
      # Some servers send a bare DEFLATE stream under "deflate"; negative
      # window bits tell zlib not to expect the zlib wrapper.
      raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
      begin
        raw.inflate(body)
      ensure
        raw.close
      end
    end
  else
    body
  end
rescue Zlib::Error
  # Mislabelled or corrupt payload: hand back what we received.
  body
end

# Performs a browser-like GET, following redirects and tracking cookies.
#
# url        - String URL to fetch.
# jar        - cookie store responding to for(host) and merge_from(resp, host);
#              it is read before and updated after every hop.
# headers    - Hash of per-call header overrides merged over BASE_HEADERS.
# referer    - optional Referer header value.
# site_fetch - value for the Sec-Fetch-Site header.
#
# Up to 5 redirects (301/302/303/307/308 carrying a Location header) are
# followed. The Cookie header is rebuilt for each hop from the jar entry of
# the host being contacted, so cookies set by a redirect response are sent on
# the next request and cookies of one host are not replayed to another.
#
# Returns [status_code, body, content_encoding, reason_phrase] of the final
# response, with the body passed through decompress. If the redirect budget is
# exhausted, the last 3xx response is returned as-is.
def http_get_with_cookies(url, jar:, headers: {}, referer: nil, site_fetch: "none")
  uri  = URI(url)
  hdrs = BASE_HEADERS.merge(headers)
  hdrs["Referer"]        = referer if referer
  hdrs["Sec-Fetch-Site"] = site_fetch

  redirects_left = 5
  enc  = ""
  msg  = ""
  code = 0
  body = ""

  loop do
    # Per-hop copy so a cookie for one host never leaks into the next hop.
    hop_hdrs = hdrs.dup
    cookie   = jar.for(uri.host)
    hop_hdrs["Cookie"] = cookie unless cookie.empty?

    req  = Net::HTTP::Get.new(uri, hop_hdrs)
    resp = Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
      http.request(req)
    end

    jar.merge_from(resp, uri.host)
    enc  = resp["content-encoding"].to_s
    msg  = resp.message
    code = resp.code.to_i

    location = resp["location"]
    if [301, 302, 303, 307, 308].include?(code) && location && redirects_left > 0
      redirects_left -= 1
      # Location may be relative; resolve it against the URL just requested.
      uri = URI.join(uri, location)
      next
    end

    # Accept-Encoding is set explicitly, so Net::HTTP does not decode the
    # body for us; do it here based on the response's Content-Encoding.
    body = decompress(resp.body.to_s, enc)
    break
  end

  [code, body, enc, msg]
end

# Pauses for a randomised 0.4–1.0 second interval between requests.
def short_sleep
  jitter = rand * 0.6
  sleep(0.4 + jitter)
end

# ----- Burnie-specific parsing helpers -----
# Matches references such as "DA 2024/123": a 20xx year, a slash, then a suffix
# of letters, digits, "-", "_" or ".". Case-insensitive.
REF_RX = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i

# Returns the first DA reference found in +text+ formatted as
# "DA <year> / <suffix>", or nil when +text+ contains none.
def extract_ref(text)
  found = REF_RX.match(text.to_s)
  return nil unless found

  year, suffix = found.captures
  "DA #{year} / #{suffix}"
end

# Returns a council reference taken from +text+.
#
# The canonical form from extract_ref wins when available. Otherwise a looser
# pattern (years starting with 1 or 2) is tried and only the spacing around the
# slash is normalised. Returns "" when no reference is found.
def normalize_ref(text)
  canonical = extract_ref(text)
  return canonical if canonical

  loose = text.to_s[/\bDA\s*[12]\d{3}\s*\/\s*[A-Za-z0-9\-_.]+\b/i]
  loose.to_s.gsub(/\s*\/\s*/, " / ").strip
end

# Pulls the first date-looking substring out of +text+.
#
# Whitespace runs are collapsed to single spaces first. A worded date
# ("12 March 2024") anywhere in the text takes priority over a numeric one
# ("12/03/2024"). Returns "" when neither form is present.
def extract_on_notice_date(text)
  flattened = text.to_s.gsub(/\s+/, " ")

  worded = flattened[/\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b/]
  return worded if worded

  flattened[/\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/] || ""
end

# Fetches an application's detail page and returns the absolute URL of its
# first PDF link.
#
# An anchor with class "ext-pdf" inside ".hyperlink-button-container" is
# preferred; otherwise the first anchor whose href ends in ".pdf" or contains
# ".pdf?" is used. Returns "" when the page does not answer 200, has no such
# link, or any error occurs (the error is logged as a warning).
def first_pdf_on_detail(detail_url, jar)
  status, page, _encoding, _reason = http_get_with_cookies(
    detail_url,
    jar: jar,
    referer: URL_EN,
    site_fetch: "same-origin"
  )
  return "" if status != 200

  detail_doc = Nokogiri::HTML(page)
  link = detail_doc.at_css(".hyperlink-button-container a.ext-pdf")
  link ||= detail_doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")
  return "" if link.nil?

  # Resolve relative hrefs against the detail page URL.
  URI.join(detail_url, link["href"].to_s).to_s
rescue StandardError => err
  Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{err.class} #{err.message}"
  ""
end

# Decodes the hidden #__SEAMLESSVIEWSTATE field of +doc+ into a parsed HTML
# document.
#
# The field value is Base64-decoded and then gunzipped; if it is not gzip data
# the Base64-decoded bytes are parsed as-is. Returns nil when the field is
# absent or empty, or when decoding fails (logged as a warning).
def decode_seamless_viewstate(doc)
  field   = doc.at_css("#__SEAMLESSVIEWSTATE")
  encoded = field ? field["value"].to_s : ""
  return nil if encoded.empty?

  payload = Base64.decode64(encoded)
  markup =
    begin
      Zlib::GzipReader.new(StringIO.new(payload)).read
    rescue Zlib::Error
      payload
    end

  Nokogiri::HTML(markup)
rescue StandardError => err
  Log.warn "scraper", "Failed to decode __SEAMLESSVIEWSTATE: #{err.class} #{err.message}"
  nil
end

# Makes +s+ safe to use as a file name: every run of characters other than
# word characters, "." and "-" becomes a single "_", and the result is capped
# at 180 characters.
def sanitize_filename(s)
  cleaned = s.to_s.gsub(/[^\w.\-]+/) { "_" }
  cleaned.slice(0, 180)
end

# Downloads +document_url+ into DOWNLOAD_DIR/TABLE when attachment download is
# enabled.
#
# The file is named "<sanitised reference>__<sanitised URL basename>", so files
# from different applications do not collide. Nothing is written unless the
# server answers 200 with a non-empty body; failures are logged as warnings
# and never raised.
#
# document_url      - absolute URL of the PDF; blank values are ignored.
# council_reference - reference used as the file name prefix.
# jar               - cookie jar shared with the rest of the session.
# referer           - Referer header to send (keyword).
def save_pdf(document_url, council_reference, jar, referer:)
  return unless DOWNLOAD_ATTACHMENTS
  return if document_url.to_s.strip.empty?

  # Work out the file name; an unparseable URL falls back to "document.pdf".
  remote_path =
    begin
      URI.parse(document_url).path
    rescue StandardError
      "/document.pdf"
    end
  name_part  = sanitize_filename(File.basename(remote_path))
  # Reference prefix keeps files unique and traceable to their application.
  ref_part   = sanitize_filename(council_reference.to_s.gsub(" / ", "-"))
  target_dir = File.join(DOWNLOAD_DIR, TABLE)
  target     = File.join(target_dir, "#{ref_part}__#{name_part}")

  FileUtils.mkdir_p(target_dir)

  pdf_headers = {
    # Ask for PDF explicitly
    "Accept"          => "application/pdf,*/*;q=0.8",
    # Prefer an uncompressed binary when the server honours it
    "Accept-Encoding" => "identity"
  }
  status, payload, _encoding, reason = http_get_with_cookies(
    document_url,
    jar: jar,
    headers: pdf_headers,
    referer: referer,
    site_fetch: "same-origin"
  )

  if status != 200 || !payload || payload.bytesize <= 0
    Log.warn "scraper", "PDF fetch failed (#{status} #{reason}) for #{document_url}"
  else
    File.binwrite(target, payload)
    puts "Saved PDF to #{target} (#{payload.bytesize} bytes)"
  end
rescue StandardError => err
  Log.warn "scraper", "PDF save error for #{document_url}: #{err.class} #{err.message}"
end

# ----- Warm-up sequence to appease WAF -----
# One cookie jar is shared by every request in this run, so cookies picked up
# during the warm-up steps are replayed on later fetches.
jar = Jar.new

# 1) Direct try
code1, body1, enc1, msg1 = http_get_with_cookies(URL, jar: jar)
puts "List fetch #1: status=#{code1} #{msg1}, enc=#{enc1}, bytes=#{body1.to_s.bytesize}"

html = nil

# A response only counts as the real listing when it is a 200 with more than
# 5,000 bytes of body; anything smaller is treated as a block/challenge page.
if code1 == 200 && body1.bytesize > 5_000
  html = body1
else
  short_sleep
  # 2) Language variant (often works)
  code2, body2, enc2, msg2 = http_get_with_cookies(
    URL_EN, jar: jar, site_fetch: "same-origin", referer: "#{BASE_URL}/"
  )
  puts "List fetch #2: status=#{code2} #{msg2}, enc=#{enc2}, bytes=#{body2.to_s.bytesize} (#{URL_EN})"
  if code2 == 200 && body2.bytesize > 5_000
    html = body2
  else
    # 3) Warm up by hitting Home (sets benign cookies), then websitesettings.js, then retry
    short_sleep
    h_code, _h_body, _h_enc, h_msg = http_get_with_cookies("#{BASE_URL}/Home", jar: jar, site_fetch: "none")
    puts "Warmup Home: status=#{h_code} #{h_msg}"
    short_sleep
    # NOTE(review): the GUID in this path is hard-coded; presumably it is
    # site-specific and may change — confirm.
    oc_api = "#{BASE_URL}/ocapi/0ff2db3d-0235-40e2-b373-42294eee3a55/en-AU/websitesettings.js"
    w_code, _w_body, _w_enc, w_msg = http_get_with_cookies(oc_api, jar: jar, site_fetch: "same-origin", referer: "#{BASE_URL}/Home")
    puts "Warmup websitesettings.js: status=#{w_code} #{w_msg}"
    short_sleep
    code3, body3, enc3, msg3 = http_get_with_cookies(URL_EN, jar: jar, site_fetch: "same-origin", referer: "#{BASE_URL}/Home")
    puts "List fetch #3: status=#{code3} #{msg3}, enc=#{enc3}, bytes=#{body3.to_s.bytesize} (retry with referer)"
    html = body3 if code3 == 200 && body3.bytesize > 5_000
  end
end

# Fall back to whatever we got first if nothing passed threshold
# (the bodies of attempts #2 and #3 are not considered here).
html ||= body1

puts "Fetched list page (#{html.to_s.bytesize} bytes)"
list_doc = Nokogiri::HTML(html)

# Try visible DOM first
# Each match is the <a> element wrapping one application in the listing.
nodes = list_doc.css(".list-container.da-list-container .list-item-container a[href]")
puts "Primary selector found #{nodes.length} anchors"

# If nothing, decode Seamless payload and try again
if nodes.empty?
  sv_doc = decode_seamless_viewstate(list_doc)
  if sv_doc
    nodes = sv_doc.css(".list-container.da-list-container .list-item-container a[href]")
    puts "Seamless ViewState selector found #{nodes.length} anchors"
    if nodes.empty?
      # Last resort: drop the outer container classes and take the first
      # linked anchor of every .list-item-container (this yields a plain Array).
      nodes = sv_doc.css(".list-item-container").map { |c| c.at_css("a[href]") }.compact
      puts "Seamless final fallback found #{nodes.length} anchors"
    end
  else
    puts "__SEAMLESSVIEWSTATE not found or could not be decoded"
  end
end

puts "Found #{nodes.length} application(s) for #{TABLE}"
# Counts rows that reach the end of the loop below (i.e. were upserted).
saved = 0

# Turn every listing anchor into one row: parse the fields, look up the PDF,
# optionally download it, then upsert + enrich + patch the extra columns.
nodes.each do |a|
  # hrefs may be relative; resolve against the listing URL.
  # NOTE(review): URI.join raises on a malformed href and nothing rescues it
  # here, so a single bad link would abort the whole run — confirm intended.
  detail_url = URI.join(URL, a["href"].to_s).to_s

  ref_text          = a.at_css(".da-application-number")&.text.to_s
  council_reference = normalize_ref(ref_text)

  address           = a.at_css(".list-item-address")&.text.to_s.strip

  # Closing date: prefer the dedicated element (minus its "On display until"
  # prefix); otherwise scan the whole anchor text for a date.
  closing_text      = a.at_css(".display-until-date")&.text.to_s
  on_notice_to_raw  = if closing_text.empty?
                        extract_on_notice_date(a.text)
                      else
                        extract_on_notice_date(closing_text.sub(/^On display until\s*/i, ""))
                      end
  on_notice_to      = Util.parse_aus_date(on_notice_to_raw)
  # The received date is not read from the page; it is back-computed from the
  # closing date. Presumably parse_aus_date returns a Date (or nil), making
  # this "14 days earlier" — TODO confirm.
  date_received     = on_notice_to ? (on_notice_to - 14) : nil

  # First <p> that isn't a helper class = description
  desc_p = a.css("p").find { |p|
    cls = p["class"].to_s
    cls.empty? || !(cls =~ /(da-application-number|list-item-address|display-until)/)
  }
  description = desc_p&.text.to_s.strip
  description = "Development Application" if description.empty?

  # Rows without both an address and a reference are skipped before any
  # detail-page request is made.
  next if address.empty? || council_reference.empty?

  document_url = first_pdf_on_detail(detail_url, jar)

  # Download the PDF if requested
  # (save_pdf checks DOWNLOAD_ATTACHMENTS again itself).
  save_pdf(document_url, council_reference, jar, referer: detail_url) if DOWNLOAD_ATTACHMENTS

  # NOTE(review): date_received_raw is filled with the on-notice text rather
  # than a received-date text, because the page exposes only the closing date.
  DB.upsert(TABLE, {
    description:       description,
    date_received:     date_received,
    date_received_raw: on_notice_to_raw,  # keep the raw on-notice text
    on_notice_to:      on_notice_to,      # store close/on-notice date here
    on_notice_to_raw:  on_notice_to_raw,
    address:           address,
    council_reference: council_reference,
    applicant:         "",
    owner:             ""
  })

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )

  # Best-effort second write for document_url, the on-notice fields and
  # title_reference; any failure here is logged and the row still counts
  # as saved.
  # NOTE(review): a new prepared statement is created for every row and is
  # never explicitly closed here — consider preparing once outside the loop.
  begin
    upd = DB.client.prepare(
      "UPDATE `#{DB.client.escape(TABLE)}` " \
      "SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? " \
      "WHERE council_reference = ? AND address = ?"
    )
    title_reference = a.at_css(".list-item-title")&.text&.strip.to_s
    upd.execute(document_url, on_notice_to, on_notice_to_raw, title_reference, council_reference, address)
  rescue StandardError => e
    Log.warn "scraper", "Extra fields update skipped for #{council_reference}: #{e.class} #{e.message}"
  end

  puts "Upserted #{council_reference} -> #{address}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."