| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210 |
# Latrobe Council — Planning Applications on Public Exhibition
#
# Source: https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp
#
# Cloudflare is present — requires homepage warmup with browser-like headers
# before the planning page responds (same technique as burnie.rb / kingisland.rb).
#
# Page structure:
#   <ul class="generic-list__list">
#     <li class="generic-list__item generic-list__file">
#       <h3 class="generic-list__title">
#         <a href="...pdf">L-DA007/2026 208 Gilbert Street, Latrobe - proposed
#           Additional Dwelling (submissions by 21/04/2026) <span>(PDF File, 2.0 MB)</span></a>
#       </h3>
#     </li>
#   </ul>
- require "date"
- require "nokogiri"
- require "net/http"
- require "uri"
- require_relative "../lib/db"
- require_relative "../lib/enrich"
- require_relative "../lib/log"
- require_relative "../lib/util"
# Council site root and the public-exhibition listing page beneath it.
BASE_URL = "https://www.latrobe.tas.gov.au"
URL      = "#{BASE_URL}/services/building-and-planning-services/planningapp"

# Destination table name. ENV.fetch raises KeyError when TABLE_NAME is unset;
# run_all.sh sets it from the script filename (da_latrobe).
TABLE = ENV.fetch("TABLE_NAME")

# Runs at load time, before any network request is made.
# NOTE(review): presumably creates the table when it is missing — confirm in lib/db.
DB.ensure_table!(TABLE)
# ----- Browser-like headers (WAF/Cloudflare warmup) -----
# Default request headers sent with every GET. They describe a desktop
# Chrome 127 on Windows performing a top-level page navigation.
# http_get overrides "Sec-Fetch-Site" per request and adds "Referer" and
# "Cookie" when it has them.
BASE_HEADERS = {
  # Identity and content negotiation
  "User-Agent"                => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
  "Accept"                    => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language"           => "en-AU,en;q=0.9",
  # "identity" asks the server not to compress the response body.
  "Accept-Encoding"           => "identity",
  "Upgrade-Insecure-Requests" => "1",

  # Fetch metadata for a user-initiated document navigation
  "Sec-Fetch-Dest"            => "document",
  "Sec-Fetch-Mode"            => "navigate",
  "Sec-Fetch-Site"            => "none",
  "Sec-Fetch-User"            => "?1",

  # Client hints consistent with the User-Agent above
  "sec-ch-ua"                 => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
  "sec-ch-ua-mobile"          => "?0",
  "sec-ch-ua-platform"        => '"Windows"',

  # One connection per request; http_get opens a new session for every hop.
  "Connection"                => "close"
}.freeze
# Minimal per-host cookie store used to carry cookies from the homepage
# warmup request over to the planning-page request.
#
# Only the leading name=value pair of each Set-Cookie header is kept.
# Attributes such as Path, Domain, Expires and Max-Age are ignored, so stored
# cookies never expire and are looked up by the exact host string only.
class CookieJar
  def initialize
    # host => { cookie name => value }; hash order is first-seen order.
    @h = {}
  end

  # Builds the value for a "Cookie" request header.
  #
  # @param host [String] host the request is going to
  # @return [String] "name=value; name2=value2", or "" when nothing is stored
  def for(host)
    serialize(@h[host])
  end

  # Records every cookie set by +resp+ under +host+. A cookie whose name is
  # already stored has its value replaced and keeps its original position.
  #
  # @param resp [#get_fields] response object; get_fields("Set-Cookie") must
  #   return an Array of header strings or nil
  # @param host [String] host the response came from
  # @return [String, nil] the updated Cookie header value, or nil when the
  #   response carried no usable Set-Cookie header
  def merge_from(resp, host)
    pairs = (resp.get_fields("Set-Cookie") || []).filter_map { |header| parse_pair(header) }
    return if pairs.empty?

    stored = (@h[host] ||= {})
    pairs.each { |name, value| stored[name] = value }
    serialize(stored)
  end

  private

  # Extracts [name, value] from one Set-Cookie header. Returns nil for a blank
  # header or one without a cookie name, so a malformed header is skipped
  # instead of raising NoMethodError on nil.
  def parse_pair(header)
    # Everything after the first ";" is attributes; split("=", 2) keeps any
    # further "=" characters inside the value.
    name, value = header.to_s.split(";", 2).first.to_s.split("=", 2)
    name = name.to_s.strip
    return if name.empty?

    [name, value.to_s.strip]
  end

  # Joins stored cookies into Cookie-header form; nil serializes to "".
  def serialize(cookies)
    (cookies || {}).map { |name, value| "#{name}=#{value}" }.join("; ")
  end
end
# Performs a GET with browser-like headers and follows redirects.
#
# Headers are rebuilt for every hop from the jar entry of the host being
# contacted. A cookie set by a redirect response is therefore sent on the
# follow-up request, and cookies of one host are not sent to another host.
#
# @param url [String] absolute URL to fetch
# @param jar [#for, #merge_from] cookie store; read before and updated after
#   every request
# @param referer [String, nil] value for the Referer header, omitted when nil
# @param fetch_site [String] value for the Sec-Fetch-Site header
# @return [Array(Integer, String)] status code and body of the final response.
#   If the fifth request is still a redirect, the redirect's status and ""
#   are returned. Any StandardError is logged through Log.warn and reported
#   as [0, ""].
def http_get(url, jar:, referer: nil, fetch_site: "none")
  uri = URI(url)
  code = 0
  body = ""

  # At most five requests in total: the initial one plus four redirects.
  5.times do
    hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
    hdrs["Referer"] = referer if referer
    cookie = jar.for(uri.host)
    hdrs["Cookie"] = cookie unless cookie.empty?

    # A new session per hop: a redirect may change host, port or scheme.
    resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
      http.request(Net::HTTP::Get.new(uri, hdrs))
    end
    jar.merge_from(resp, uri.host)
    code = resp.code.to_i

    location = resp["location"]
    unless [301, 302, 303, 307, 308].include?(code) && location
      body = resp.body.to_s
      break
    end

    # Location may be relative; resolve it against the URL just requested.
    uri = URI.join(uri, location)
  end

  [code, body]
rescue StandardError => e
  Log.warn "latrobe", "HTTP error for #{url}: #{e.class} #{e.message}"
  [0, ""]
end
# ----- Warmup then fetch -----
# Request the homepage first so any cookies it sets are in the jar, then
# request the planning page as a same-origin navigation from the homepage.
jar = CookieJar.new

Log.info "latrobe", "Warming up via homepage..."
code0, _body0 = http_get("#{BASE_URL}/", jar: jar)
# The warmup status is logged only; the planning page is attempted regardless.
Log.info "latrobe", "Homepage: #{code0}"
sleep(0.5)

Log.info "latrobe", "Fetching planning page..."
code1, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
Log.info "latrobe", "Planning page: #{code1} (#{html.bytesize} bytes)"

# Look for the challenge markers before judging the status code: a challenge
# page served with a non-200 status would otherwise be reported as a generic
# fetch failure and the specific diagnosis would never be logged.
failure =
  if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
    "Cloudflare challenge page returned — cannot scrape without a real browser."
  elsif code1 != 200 || html.bytesize < 1_000
    # Non-200, or a body too short to be the real listing page.
    "Could not fetch planning page (status #{code1})."
  end

if failure
  Log.warn "latrobe", failure
  puts "Done #{TABLE}. Saved 0 item(s)."
  # Status 0 even though nothing was saved.
  # NOTE(review): presumably so run_all.sh keeps going — confirm.
  exit 0
end
# ----- Parse -----
# Ref format: L-DA007/2026
REF_RX = /\bL-DA\d+\/\d{4}\b/i

# Turns a listing href into an absolute URL so the stored link is usable
# outside the page it was scraped from.
#
# @param href [String, nil] raw href attribute of the listing anchor
# @param base [String] absolute URL of the page the anchor was found on
# @return [String, nil] nil for a blank href; an http(s) href unchanged; a
#   relative href resolved against +base+; the stripped raw href when it
#   cannot be parsed as a URI (for example, it contains unescaped spaces)
def absolute_document_url(href, base)
  link = href.to_s.strip
  return nil if link.empty?
  return link if link.match?(%r{\Ahttps?://}i)

  URI.join(base, link).to_s
rescue URI::Error
  link
end

doc = Nokogiri::HTML(html)
saved = 0

# One anchor per exhibited application. Link text looks like:
#   "L-DA007/2026 208 Gilbert Street, Latrobe - proposed Additional Dwelling
#    (submissions by 21/04/2026) (PDF File, 2.0 MB)"
doc.css("li.generic-list__item h3.generic-list__title a").each do |a|
  # Drop the "(PDF File, ...)" size suffix and collapse runs of whitespace.
  raw_text = a.text.gsub(/\(PDF\s+File[^)]*\)/i, "").gsub(/\s+/, " ").strip
  next unless (m = raw_text.match(REF_RX))

  ref = m[0].strip

  # Remove the ref; remainder: "ADDRESS - DESCRIPTION (submissions by DATE)"
  rest = raw_text.sub(ref, "").strip

  # Extract on-notice date: "(submissions by 21/04/2026)"
  on_notice_to_raw = rest[/\(submissions?\s+by\s+([^)]+)\)/i, 1]&.strip || ""
  # NOTE(review): Util.parse_aus_date is given "" when no date is present —
  # confirm it tolerates blank input.
  on_notice_to = Util.parse_aus_date(on_notice_to_raw)

  # Remove the "(submissions by ...)" clause
  rest = rest.sub(/\s*\(submissions?\s+by\s+[^)]+\)/i, "").strip

  # Split "ADDRESS - DESCRIPTION" at the first " - "; without a separator the
  # whole remainder is the address and a generic description is used.
  if (split = rest.index(" - "))
    address = rest[0, split].strip
    description = rest[(split + 3)..].strip
  else
    address = rest
    description = "Development Application"
  end
  next if address.empty?

  doc_url = absolute_document_url(a["href"], URL)

  begin
    DB.upsert(TABLE, {
      council_reference: ref,
      address: address[0, 255], # stored address is capped at 255 characters
      description: description,
      date_received: nil,
      date_received_raw: "",
      on_notice_to: on_notice_to,
      on_notice_to_raw: on_notice_to_raw,
      document_url: doc_url,
      applicant: "",
      owner: ""
    })
    # Receives the untruncated address.
    enrich_after_upsert!(
      table: TABLE,
      council_reference: ref,
      address: address
    )
    Log.info "latrobe", "Upserted #{ref} -> #{address}"
    saved += 1
  rescue StandardError => e
    # A failure on one listing is logged and the loop moves on to the next.
    Log.warn "latrobe", "DB error for #{ref}: #{e.class} #{e.message}"
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|