# scrapers/planbuild.rb
#
# Scrapes advertisements from the PlanBuild portal at BASE: fetches the
# advertisement list, pulls each advertisement's detail record, optionally
# downloads its attachments, and upserts one row per advertisement into the
# table derived from its reference number.
require "date"
require "json"
require "nokogiri"
require "uri"
require "net/http"
require "open-uri"
require "fileutils"
require "stringio" # used by fetch_detail for manual gzip decoding
require "zlib"     # used by fetch_detail for manual gzip decoding
require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/geocode"
require_relative "../lib/enrich"

TABLE = ENV.fetch("TABLE_NAME")
BASE = "https://portal.planbuild.tas.gov.au"
PAGE = "#{BASE}/external/advertisement/search"
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"

DB.ensure_table!(TABLE)

# --- cookie + csrf helpers ---

# Copies every `name=value` pair found in the response's Set-Cookie headers
# into +jar+, overwriting existing entries with the same name.
#
# @param jar [Hash{String=>String}] cookie store, mutated in place
# @param res [#get_fields] HTTP response exposing header fields
def merge_set_cookie!(jar, res)
  (res.get_fields("set-cookie") || []).each do |raw|
    # A single header value may carry several cookies joined by commas; only
    # split on commas that are followed by another `name=` before the next
    # `;`, so commas inside attribute values (e.g. Expires dates) are kept.
    raw.split(/,(?=[^;]+?=)/).each do |cookie|
      pair = cookie.match(/\A\s*([^=;,\s]+)\s*=\s*([^;,\s]+)/)
      jar[pair[1]] = pair[2] if pair
    end
  end
end

# Builds the Cookie request header: three fixed cookies followed by
# everything currently stored in +jar+.
#
# @param jar [Hash{String=>String}]
# @return [String]
def cookie_header(jar)
  base = "accepted=1; disclaimerAccepted=true; insecureSiteWideBanner=1"
  more = jar.map { |k, v| "#{k}=#{v}" }.join("; ")
  [base, more].reject(&:empty?).join("; ")
end

# Sends +req+ to the host/port of +uri+ over TLS and returns the response.
#
# @param uri [URI::Generic]
# @param req [Net::HTTPRequest]
# @return [Net::HTTPResponse]
def https_request(uri, req)
  Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |http| http.request(req) }
end

# --- fetch list of advertisements ---

# Loads the search page to obtain the CSRF token and SESSION cookie, then
# POSTs to the list endpoint.
#
# @return [Array(Array, Hash, String, String)] the advertisement items
#   (always an Array, possibly empty), the cookie jar, the CSRF token and the
#   name of the CSRF header
# @raise [RuntimeError] when the token or SESSION cookie is missing, or the
#   list request does not return a 2xx status
# @raise [JSON::ParserError] when the list response is not valid JSON
def fetch_list
  jar = {}

  # 1) GET page to grab CSRF + SESSION
  res = Http.request(URI(PAGE), headers: { "Referer" => BASE }, jar: jar)
  merge_set_cookie!(jar, res)
  doc = Nokogiri::HTML(res.body)
  token = doc.at(%{meta[name="_csrf"]})&.[]("content")
  hdr = doc.at(%{meta[name="_csrf_header"]})&.[]("content") || "X-CSRF-TOKEN"
  raise "no CSRF token" unless token
  raise "no SESSION cookie" unless jar["SESSION"]

  # 2) POST listadvertisements
  uri = URI("#{BASE}/external/advertisement/search/listadvertisements")
  req = Net::HTTP::Post.new(uri)
  req["Content-Type"] = "application/json"
  req["X-Requested-With"] = "XMLHttpRequest"
  req["Origin"] = BASE
  req["Referer"] = PAGE
  req[hdr] = token
  req["Cookie"] = cookie_header(jar)
  req.body = { lgas: [] }.to_json

  res = https_request(uri, req)
  # Fail with the status code instead of an opaque JSON parse error on an
  # HTML error page.
  raise "list request failed: HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)

  js = JSON.parse(res.body)
  # The endpoint is handled both as a bare array and as an object wrapping
  # the array under "items"; anything else is treated as "no items".
  items =
    case js
    when Array then js
    when Hash then js["items"]
    end
  items = [] unless items.is_a?(Array)

  [items, jar, token, hdr]
end

# --- fetch details ---

# Fetches the detail record of one advertisement.
#
# @param uuid [String] advertisement identifier
# @param jar [Hash{String=>String}] cookie jar from fetch_list
# @param token [String] CSRF token
# @param hdr [String] name of the CSRF header
# @return [Hash] the parsed record, or an empty Hash when the body is not
#   valid JSON or does not decode to a JSON object
# @raise network and gzip errors propagate to the caller
def fetch_detail(uuid, jar, token, hdr)
  uri = URI("#{BASE}/external/advertisement/#{uuid}/get")
  req = Net::HTTP::Get.new(uri)
  req["X-Requested-With"] = "XMLHttpRequest"
  req["Referer"] = PAGE
  req[hdr] = token
  req["Cookie"] = cookie_header(jar)

  res = https_request(uri, req)

  # decompress if gzip
  # NOTE(review): Net::HTTP normally decodes gzip itself and removes this
  # header when it added Accept-Encoding on its own, so this branch is
  # probably rarely taken — confirm before removing.
  body =
    if res["Content-Encoding"] == "gzip"
      Zlib::GzipReader.new(StringIO.new(res.body)).read
    else
      res.body
    end

  parsed = JSON.parse(body)
  # Callers index the result by string key, so only a JSON object is usable.
  parsed.is_a?(Hash) ? parsed : {}
rescue JSON::ParserError, TypeError => e
  # Best effort: an unparsable detail record must not stop the run, but it
  # should be visible in the log.
  warn "Detail for #{uuid} is not valid JSON: #{e.class} #{e.message}"
  {}
end

# --- attachments ---

# Downloads every attachment of one advertisement into
# DOWNLOAD_DIR/<council_name>/<sanitised ref>/ and returns the paths written.
# A failed download is logged and skipped so the remaining attachments and
# advertisements are still processed.
#
# @param uuid [String] advertisement identifier
# @param ref [String] reference number, used for the directory name
# @param council_name [String] folder name for the council
# @param attachments [Array<Hash>] entries with "id" and "documentTitle"
# @param jar [Hash{String=>String}] cookie jar from fetch_list
# @return [Array<String>] paths of the files that were saved
def download_attachments(uuid, ref, council_name, attachments, jar)
  dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
  FileUtils.mkdir_p(dir)

  attachments.each_with_object([]) do |att, saved|
    att_id = att["id"]
    begin
      # documentTitle may be missing; fall back to the attachment id so the
      # file still gets a usable name.
      title = att["documentTitle"].to_s
      title = "attachment_#{att_id}" if title.strip.empty?
      path = File.join(dir, "#{title.gsub(/[^\w\-.]+/, '_')}.pdf")

      uri = URI("#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}")
      req = Net::HTTP::Get.new(uri)
      req["Cookie"] = cookie_header(jar)
      req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
      res = https_request(uri, req)

      # Never store an error page under a .pdf name.
      unless res.is_a?(Net::HTTPSuccess)
        warn "Attachment #{att_id} for #{ref} failed: HTTP #{res.code}"
        next
      end

      File.binwrite(path, res.body)
      saved << path
    rescue StandardError => e
      warn "Attachment #{att_id} for #{ref} failed: #{e.class} #{e.message}"
    end
  end
end

# --- main ---

puts "Fetching PlanBuild list…"
items, jar, token, hdr = fetch_list
puts "Found #{items.length} items for #{TABLE}"

items.each do |r|
  ref = r["referenceNumber"]
  addr = r["addressString"]
  desc = r["description"]
  start = Util.parse_epoch_ms(r["startDate"])
  fin = Util.parse_epoch_ms(r["endDate"])
  uuid = r["uuid"]

  next if ref.to_s.strip.empty? || addr.to_s.strip.empty?

  # derive table & download folder from the reference number
  table = Util.ref_to_table(ref)
  council_name = Util.ref_to_folder(ref).downcase # use for file path
  DB.ensure_table!(table)

  # --- fetch detail page ---
  detail = {}
  begin
    detail = fetch_detail(uuid, jar, token, hdr) if uuid
  rescue => e
    warn "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
  end

  attachments = detail["attachments"].is_a?(Array) ? detail["attachments"] : []

  puts "Council: #{table}"
  puts "DETAIL for #{ref}: keys=#{detail.keys}"
  if attachments.any?
    puts "Attachments: #{attachments.map { |a| "id=#{a['id']}, title=#{a['documentTitle']}" }}"
  else
    puts "Attachments: none"
  end

  # --- handle attachments ---
  saved_paths = []
  if DOWNLOAD_ATTACHMENTS && uuid && attachments.any?
    saved_paths = download_attachments(uuid, ref, council_name, attachments, jar)

    # store first PDF relative path in DB
    # NOTE(review): this UPDATE runs before the upsert below, which writes
    # local_document_url again with the full list of absolute paths. Decide
    # which value is intended and drop the other write.
    if saved_paths.any?
      first_web_rel = saved_paths.first.sub(DOWNLOAD_DIR, "/files")
      DB.client.prepare("UPDATE `#{table}` SET local_document_url = ? WHERE council_reference = ?")
        .execute(first_web_rel, ref)
    end
  end

  # geocode
  # NOTE(review): the return value is not used anywhere in this script; the
  # call is kept in case Geocode.format_au has side effects — confirm and
  # remove if it does not.
  begin
    Geocode.format_au(addr)
  rescue => e
    warn "Geocode error for #{ref}: #{e.class} #{e.message}"
  end

  # --- upsert into DB ---
  DB.upsert(table, {
    description: desc,
    date_received: start,
    date_received_raw: start&.strftime("%Y-%m-%d"),
    on_notice_to: fin,
    on_notice_to_raw: fin&.strftime("%Y-%m-%d"),
    address: addr[0, 255],
    council_reference: ref[0, 100],
    applicant: detail["applicant"],
    owner: detail["owner"],
    local_document_url: saved_paths.join(", ")
  })

  enrich_after_upsert!(
    table: table,
    council_reference: ref,
    address: addr
  )

  puts "Upserted #{ref} -> #{addr} into #{table}, PDFs: #{saved_paths.length}"
end

puts "Done #{TABLE}."