# Latrobe Council — Planning Applications on Public Exhibition
#
# Source: https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp
#
# Cloudflare is present — requires homepage warmup with browser-like headers
# before the planning page responds (same technique as burnie.rb / kingisland.rb).
#
# Page structure (as assumed by the parser below):
#   li.generic-list__item
#     h3.generic-list__title
#       a[href=<document>]
#         "L-DA007/2026 ADDRESS - DESCRIPTION (submissions by 21/04/2026) (PDF File ...)"
#
require "date"
require "nokogiri"
require "net/http"
require "uri"
require_relative "../lib/db"
require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"

TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_latrobe
BASE_URL = "https://www.latrobe.tas.gov.au"
URL = "#{BASE_URL}/services/building-and-planning-services/planningapp"

# Ref format: L-DA007/2026
REF_RX = /\bL-DA\d+\/\d{4}\b/i

# "(submissions by 21/04/2026)" — capture 1 is the raw date text.
SUBMISSIONS_RX = /\s*\(submissions?\s+by\s+([^)]+)\)/i

# HTTP statuses that http_get follows when a Location header is present.
REDIRECT_STATUSES = [301, 302, 303, 307, 308].freeze

# Maximum number of requests http_get issues for one call (first request + redirects).
MAX_HOPS = 5

DB.ensure_table!(TABLE)

# ----- Browser-like headers (WAF/Cloudflare warmup) -----
BASE_HEADERS = {
  "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.9",
  "Accept-Encoding" => "identity",
  "Upgrade-Insecure-Requests" => "1",
  "Sec-Fetch-Dest" => "document",
  "Sec-Fetch-Mode" => "navigate",
  "Sec-Fetch-Site" => "none",
  "Sec-Fetch-User" => "?1",
  "sec-ch-ua" => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
  "sec-ch-ua-mobile" => "?0",
  "sec-ch-ua-platform" => '"Windows"',
  "Connection" => "close",
}.freeze

# Minimal per-host cookie store: keeps only name=value pairs (attributes such
# as Path/Expires are discarded) and renders them as a Cookie request header.
class CookieJar
  def initialize
    @h = {}
  end

  # Returns the Cookie header value stored for +host+, or "" when none.
  def for(host)
    @h[host] || ""
  end

  # Merges every Set-Cookie header of +resp+ into the cookies stored for
  # +host+; a cookie with an existing name overwrites the stored value.
  def merge_from(resp, host)
    cookies = resp.get_fields("Set-Cookie") || []
    return if cookies.empty?

    existing = parse_header(@h[host])
    cookies.each do |sc|
      # Only the leading "name=value" matters; to_s guards an empty header
      # value, for which split(...).first would be nil.
      kv = sc.to_s.split(";", 2).first.to_s
      k, v = kv.split("=", 2)
      name = k.to_s.strip
      existing[name] = v.to_s unless name.empty?
    end
    @h[host] = existing.map { |k, v| "#{k}=#{v}" }.join("; ")
  end

  private

  # Parses a "a=1; b=2" Cookie header string into a Hash of name => value.
  def parse_header(s)
    s.to_s.split(";").map(&:strip).filter_map { |kv|
      k, v = kv.split("=", 2)
      [k, v] unless k.to_s.empty?
    }.to_h
  end
end

# GETs +url+ with browser-like headers, following up to MAX_HOPS - 1 redirects.
#
# url        - String URL to fetch.
# jar        - CookieJar; read before and updated after every hop.
# referer    - optional Referer header value.
# fetch_site - value for the Sec-Fetch-Site header.
#
# Returns [status_code, body]. When the hop limit is exhausted the last
# redirect status is returned with an empty body. Any StandardError is logged
# and reported as [0, ""].
def http_get(url, jar:, referer: nil, fetch_site: "none")
  uri = URI(url)
  base_hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
  base_hdrs["Referer"] = referer if referer

  code = 0
  body = ""

  MAX_HOPS.times do
    # Build the Cookie header per hop: cookies set by a redirect response must
    # be sent on the follow-up request, and cookies belonging to one host must
    # not be forwarded to a different host a redirect points at.
    hdrs = base_hdrs.dup
    cookie = jar.for(uri.host)
    hdrs["Cookie"] = cookie unless cookie.empty?

    req = Net::HTTP::Get.new(uri, hdrs)
    resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
      http.request(req)
    end

    jar.merge_from(resp, uri.host)
    code = resp.code.to_i
    location = resp["location"]

    unless REDIRECT_STATUSES.include?(code) && location
      body = resp.body.to_s
      break
    end

    uri = URI.join(uri, location)
  end

  [code, body]
rescue StandardError => e
  Log.warn "latrobe", "HTTP error for #{url}: #{e.class} #{e.message}"
  [0, ""]
end

# Resolves +href+ against +base+ so stored document links are absolute.
# Returns nil for a blank href, and the raw href when it cannot be parsed
# as a URI (e.g. it contains unescaped spaces).
def absolute_url(href, base)
  href = href.to_s.strip
  return nil if href.empty?

  URI.join(base, href).to_s
rescue URI::Error
  href
end

# Splits a listing link's text into its parts.
#
# Returns nil when the text has no council reference or no address, otherwise
# a Hash with :ref, :address, :description and :on_notice_to_raw ("" when the
# "(submissions by ...)" clause is absent).
def parse_listing(link_text)
  text = link_text.gsub(/\(PDF\s+File[^)]*\)/i, "").gsub(/\s+/, " ").strip
  match = text.match(REF_RX)
  return nil unless match

  ref = match[0].strip

  # Strip ref from front; remainder: "ADDRESS - DESCRIPTION (submissions by DATE)"
  rest = text.sub(ref, "").strip

  # Extract on-notice date, then remove the "(submissions by ...)" clause.
  on_notice_to_raw = rest[SUBMISSIONS_RX, 1]&.strip || ""
  rest = rest.sub(SUBMISSIONS_RX, "").strip

  # Drop any separator left between the ref and the address ("REF - ADDRESS ...")
  # so the address never starts with a stray dash or colon.
  rest = rest.sub(/\A[\s:\-\u2013\u2014]+/, "")

  # Split "ADDRESS - DESCRIPTION" at first " - "
  address, separator, description = rest.partition(" - ")
  if separator.empty?
    address = rest
    description = "Development Application"
  else
    address = address.strip
    description = description.strip
  end
  return nil if address.empty?

  { ref: ref, address: address, description: description, on_notice_to_raw: on_notice_to_raw }
end

# ----- Warmup then fetch -----
jar = CookieJar.new

Log.info "latrobe", "Warming up via homepage..."
code0, _body0 = http_get("#{BASE_URL}/", jar: jar)
Log.info "latrobe", "Homepage: #{code0}"
sleep(0.5)

Log.info "latrobe", "Fetching planning page..."
code1, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
Log.info "latrobe", "Planning page: #{code1} (#{html.bytesize} bytes)"

# Fetch failures exit 0 after reporting zero saved items rather than raising.
if code1 != 200 || html.bytesize < 1_000
  Log.warn "latrobe", "Could not fetch planning page (status #{code1})."
  puts "Done #{TABLE}. Saved 0 item(s)."
  exit 0
end

if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
  Log.warn "latrobe", "Cloudflare challenge page returned — cannot scrape without a real browser."
  puts "Done #{TABLE}. Saved 0 item(s)."
  exit 0
end

# ----- Parse -----
doc = Nokogiri::HTML(html)
saved = 0

doc.css("li.generic-list__item h3.generic-list__title a").each do |a|
  item = parse_listing(a.text)
  next unless item

  ref = item[:ref]
  address = item[:address]
  on_notice_to_raw = item[:on_notice_to_raw]
  on_notice_to = Util.parse_aus_date(on_notice_to_raw)

  # Relative hrefs are resolved against the listing page URL.
  doc_url = absolute_url(a["href"], URL)

  begin
    DB.upsert(TABLE, {
      council_reference: ref,
      address: address[0, 255],
      description: item[:description],
      date_received: nil,
      date_received_raw: "",
      on_notice_to: on_notice_to,
      on_notice_to_raw: on_notice_to_raw,
      document_url: doc_url,
      applicant: "",
      owner: ""
    })
    enrich_after_upsert!(
      table: TABLE,
      council_reference: ref,
      address: address
    )
    Log.info "latrobe", "Upserted #{ref} -> #{address}"
    saved += 1
  rescue StandardError => e
    Log.warn "latrobe", "DB error for #{ref}: #{e.class} #{e.message}"
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."