# scrapers/planbuild.rb
require "date"
require "fileutils"
require "json"
require "net/http"
require "open-uri"
require "stringio"
require "uri"
require "zlib"

require "nokogiri"

require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/geocode"
require_relative "../lib/enrich"
# --- configuration ---

# Table name for this run; TABLE_NAME must be set or ENV.fetch raises KeyError.
TABLE = ENV.fetch("TABLE_NAME")

# Portal origin and the advertisement search page built on top of it.
BASE = "https://portal.planbuild.tas.gov.au"
PAGE = BASE + "/external/advertisement/search"

# Attachments are only downloaded when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = "1" == ENV["DOWNLOAD_ATTACHMENTS"]

# Root directory for downloaded attachments when DOWNLOAD_DIR is not set.
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# Make sure the run-level table exists before anything is written.
DB.ensure_table!(TABLE)
# --- cookie + csrf helpers ---

# Merges the cookies announced by a response into +jar+ (mutating it).
#
# Each "set-cookie" field is split on commas that are followed by another
# `name=` pair, so several cookies folded into one field are handled while a
# comma inside an `Expires=Wed, 21 Oct ...` date is left alone. Only the
# leading `name=value` pair of every piece is kept; attributes such as Path
# or HttpOnly are ignored.
#
# A cookie sent with an empty value is treated as a removal and is deleted
# from the jar, so a stale value is not replayed on later requests.
#
# @param jar [Hash{String => String}] cookie name => value, updated in place
# @param res [#get_fields] response object; `get_fields("set-cookie")` may
#   return nil when the header is absent
# @return [Hash{String => String}] the same +jar+
def merge_set_cookie!(jar, res)
  Array(res.get_fields("set-cookie")).each do |raw|
    raw.split(/,(?=[^;]+?=)/).each do |cookie|
      pair = cookie.match(/\A\s*(?<name>[^=;,\s]+)\s*=\s*(?<value>[^;,\s]*)/)
      next unless pair

      if pair[:value].empty?
        jar.delete(pair[:name])
      else
        jar[pair[:name]] = pair[:value]
      end
    end
  end
  jar
end
# Builds the value of the Cookie request header.
#
# The three fixed consent/banner cookies always come first; every entry of
# +jar+ is appended after them as `name=value`, all joined with "; ".
#
# @param jar [Hash] cookie name => value pairs
# @return [String] Cookie header value
def cookie_header(jar)
  segments = ["accepted=1; disclaimerAccepted=true; insecureSiteWideBanner=1"]
  from_jar = jar.each_with_object([]) { |(name, value), acc| acc << "#{name}=#{value}" }.join("; ")
  segments << from_jar unless from_jar.empty?
  segments.join("; ")
end
# --- fetch list of advertisements ---

# Fetches the list of advertisements from the portal.
#
# First loads the search page to obtain the CSRF token (from the `_csrf` /
# `_csrf_header` meta tags) and the SESSION cookie, then POSTs an empty LGA
# filter to the `listadvertisements` endpoint.
#
# @return [Array] `[items, jar, token, hdr]` where +items+ is always an Array
#   (the payload itself when it is a JSON array, otherwise its "items" entry
#   or [] when that entry is missing), +jar+ is the cookie Hash, +token+ the
#   CSRF token and +hdr+ the name of the header that carries it
# @raise [RuntimeError] when the CSRF token or SESSION cookie is missing, the
#   POST does not succeed, or the payload is neither an Array nor a Hash
# @raise [JSON::ParserError] when the POST body is not valid JSON
def fetch_list
  jar = {}

  # 1) GET page to grab CSRF + SESSION
  res = Http.request(URI(PAGE), headers: { "Referer" => BASE }, jar: jar)
  merge_set_cookie!(jar, res)
  doc = Nokogiri::HTML(res.body)
  token = doc.at(%{meta[name="_csrf"]})&.[]("content")
  hdr = doc.at(%{meta[name="_csrf_header"]})&.[]("content") || "X-CSRF-TOKEN"
  raise "no CSRF token" unless token
  raise "no SESSION cookie" unless jar["SESSION"]

  # 2) POST listadvertisements
  uri = URI("#{BASE}/external/advertisement/search/listadvertisements")
  req = Net::HTTP::Post.new(uri)
  req["Content-Type"] = "application/json"
  req["X-Requested-With"] = "XMLHttpRequest"
  req["Origin"] = BASE
  req["Referer"] = PAGE
  req[hdr] = token
  req["Cookie"] = cookie_header(jar)
  req.body = { lgas: [] }.to_json
  res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }

  # Fail with the HTTP status instead of a confusing JSON parse error on an
  # HTML error page.
  raise "listadvertisements request failed: HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)

  # Keep any cookies the POST refreshed so follow-up requests send them.
  merge_set_cookie!(jar, res)

  payload = JSON.parse(res.body.to_s)
  items =
    case payload
    when Array then payload
    when Hash then payload["items"] || []
    else raise "unexpected listadvertisements payload: #{payload.class}"
    end

  [items, jar, token, hdr]
end
# --- fetch details ---

# Fetches the detail document of a single advertisement.
#
# @param uuid [String] advertisement identifier, interpolated into the URL
# @param jar [Hash] cookie name => value pairs sent through cookie_header
# @param token [String] CSRF token value
# @param hdr [String] name of the request header that carries the CSRF token
# @return [Hash] the parsed JSON object, or {} when the request does not
#   succeed, the body is not valid JSON, or the JSON is not an object
def fetch_detail(uuid, jar, token, hdr)
  uri = URI("#{BASE}/external/advertisement/#{uuid}/get")
  req = Net::HTTP::Get.new(uri)
  req["X-Requested-With"] = "XMLHttpRequest"
  req["Referer"] = PAGE
  req[hdr] = token
  req["Cookie"] = cookie_header(jar)
  res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }

  unless res.is_a?(Net::HTTPSuccess)
    warn "Detail request for #{uuid} returned HTTP #{res.code}"
    return {}
  end

  body = res.body.to_s
  # Decompress only when the response is still marked as gzip; the block form
  # of GzipReader.wrap closes the reader once the body has been read.
  body = Zlib::GzipReader.wrap(StringIO.new(body), &:read) if res["Content-Encoding"] == "gzip"

  parsed = JSON.parse(body)
  # Callers index the result by string key, so anything but an object is
  # reported as "no detail".
  parsed.is_a?(Hash) ? parsed : {}
rescue JSON::ParserError => e
  warn "Detail response for #{uuid} is not valid JSON: #{e.message}"
  {}
end
# Downloads the attachments of one advertisement into +dir+.
#
# A failure for a single attachment (network error, non-2xx response, file
# error) is logged and skipped so the remaining attachments are still saved.
#
# @param uuid [String] advertisement identifier used in the attachment URLs
# @param attachments [Array<Hash>] entries read through "id" and "documentTitle"
# @param dir [String] target directory; created when missing
# @param jar [Hash] cookie name => value pairs sent with every request
# @return [Array<String>] paths of the files that were written
def download_attachments(uuid, attachments, dir, jar)
  FileUtils.mkdir_p(dir)
  saved = []
  attachments.each do |att|
    begin
      att_id = att["id"]
      # documentTitle may be absent; fall back to the id so the file name
      # can always be built.
      title = att["documentTitle"].to_s
      title = "attachment_#{att_id}" if title.strip.empty?
      path = File.join(dir, "#{title.gsub(/[^\w\-.]+/, '_')}.pdf")

      uri = URI("#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}")
      req = Net::HTTP::Get.new(uri)
      req["Cookie"] = cookie_header(jar)
      req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
      res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }

      # Never store an error page under a .pdf name.
      unless res.is_a?(Net::HTTPSuccess)
        warn "Attachment #{att_id} for #{uuid} returned HTTP #{res.code}"
        next
      end

      File.binwrite(path, res.body)
      saved << path
    rescue StandardError => e
      warn "Attachment download failed for #{uuid}/#{att_id}: #{e.class} #{e.message}"
    end
  end
  saved
end

puts "Fetching PlanBuild list…"
items, jar, token, hdr = fetch_list
items ||= [] # guard against a payload that carried no item list
puts "Found #{items.length} items for #{TABLE}"

items.each do |r|
  ref = r["referenceNumber"]
  addr = r["addressString"]
  # Records without a reference or address cannot be stored; skip them before
  # doing any further work.
  next if ref.to_s.strip.empty? || addr.to_s.strip.empty?

  desc = r["description"]
  start = Util.parse_epoch_ms(r["startDate"])
  fin = Util.parse_epoch_ms(r["endDate"])
  uuid = r["uuid"]

  # derive table & download folder from the reference
  table = Util.ref_to_table(ref)
  council_name = Util.ref_to_folder(ref).downcase # use for file path
  DB.ensure_table!(table)

  # --- fetch detail page ---
  detail = {}
  begin
    detail = fetch_detail(uuid, jar, token, hdr) if uuid
  rescue => e
    warn "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
  end
  detail = {} unless detail.is_a?(Hash)
  attachments = detail["attachments"].is_a?(Array) ? detail["attachments"] : []

  puts "Council: #{table}"
  puts "DETAIL for #{ref}: keys=#{detail.keys}"
  if attachments.any?
    puts "Attachments: #{attachments.map { |a| "id=#{a['id']}, title=#{a['documentTitle']}" }}"
  else
    puts "Attachments: none"
  end

  # --- handle attachments ---
  saved_paths = []
  if DOWNLOAD_ATTACHMENTS && uuid && attachments.any?
    dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
    saved_paths = download_attachments(uuid, attachments, dir, jar)
  end

  # geocode
  # NOTE(review): the return value is not used anywhere below; the call is
  # kept in case Geocode.format_au has side effects — confirm and remove if not.
  begin
    Geocode.format_au(addr)
  rescue => e
    warn "Geocode error for #{ref}: #{e.class} #{e.message}"
  end

  # --- upsert into DB ---
  DB.upsert(table, {
    description: desc,
    date_received: start,
    date_received_raw: start&.strftime("%Y-%m-%d"),
    on_notice_to: fin,
    on_notice_to_raw: fin&.strftime("%Y-%m-%d"),
    address: addr[0, 255],
    council_reference: ref[0, 100],
    applicant: detail["applicant"],
    owner: detail["owner"],
    local_document_url: saved_paths.join(", ")
  })

  # store first PDF relative path in DB
  # This has to run after the upsert: before it, a new record has no row for
  # the UPDATE to match, and the upsert's own local_document_url value would
  # follow it.
  # NOTE(review): `table` is interpolated into the SQL; presumably
  # Util.ref_to_table only yields safe identifiers — confirm.
  if saved_paths.any?
    first_web_rel = saved_paths.first.sub(DOWNLOAD_DIR, "/files")
    stmt = DB.client.prepare("UPDATE `#{table}` SET local_document_url = ? WHERE council_reference = ?")
    begin
      stmt.execute(first_web_rel, ref)
    ensure
      # Release the prepared statement when the driver supports it, instead
      # of accumulating one per record.
      stmt.close if stmt.respond_to?(:close)
    end
  end

  enrich_after_upsert!(
    table: table,
    council_reference: ref,
    address: addr
  )

  puts "Upserted #{ref} -> #{addr} into #{table}, PDFs: #{saved_paths.length}"
end

puts "Done #{TABLE}."