|
|
@@ -4,13 +4,15 @@ require "json"
|
|
|
require "nokogiri"
|
|
|
require "uri"
|
|
|
require "net/http"
|
|
|
-require "open-uri"
|
|
|
+require "zlib"
|
|
|
+require "stringio"
|
|
|
+require "fileutils"
|
|
|
require_relative "../lib/http"
|
|
|
require_relative "../lib/db"
|
|
|
+require_relative "../lib/log"
|
|
|
require_relative "../lib/util"
|
|
|
require_relative "../lib/geocode"
|
|
|
require_relative "../lib/enrich"
|
|
|
-require "fileutils"
|
|
|
|
|
|
TABLE = ENV.fetch("TABLE_NAME")
|
|
|
BASE = "https://portal.planbuild.tas.gov.au"
|
|
|
@@ -68,7 +70,7 @@ def fetch_list
|
|
|
[items, jar, token, hdr]
|
|
|
end
|
|
|
|
|
|
-# --- fetch details ---
|
|
|
+# --- fetch details — always returns a Hash ---
|
|
|
def fetch_detail(uuid, jar, token, hdr)
|
|
|
uri = URI("#{BASE}/external/advertisement/#{uuid}/get")
|
|
|
req = Net::HTTP::Get.new(uri)
|
|
|
@@ -79,9 +81,13 @@ def fetch_detail(uuid, jar, token, hdr)
|
|
|
|
|
|
res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
|
|
|
|
|
|
- # decompress if gzip
|
|
|
- body = res['Content-Encoding'] == 'gzip' ? Zlib::GzipReader.new(StringIO.new(res.body)).read : res.body
|
|
|
- JSON.parse(body) rescue {}
|
|
|
+ body = res["Content-Encoding"] == "gzip" \
|
|
|
+ ? Zlib::GzipReader.new(StringIO.new(res.body)).read \
|
|
|
+ : res.body
|
|
|
+ parsed = JSON.parse(body)
|
|
|
+ parsed.is_a?(Hash) ? parsed : {}
|
|
|
+rescue JSON::ParserError, Zlib::Error
|
|
|
+ {}
|
|
|
end
|
|
|
|
|
|
|
|
|
@@ -100,91 +106,75 @@ items.each do |r|
|
|
|
|
|
|
next if ref.to_s.strip.empty? || addr.to_s.strip.empty?
|
|
|
|
|
|
- # derive council code & table
|
|
|
- council_code = ref.split("-")[1].to_s.upcase # e.g. PLN-HOB-xxxx → HOB
|
|
|
- table = Util.ref_to_table(ref)
|
|
|
- council_name = Util.ref_to_folder(ref).downcase # use for file path
|
|
|
- DB.ensure_table!(table)
|
|
|
-
|
|
|
- # --- fetch detail page ---
|
|
|
- detail = {}
|
|
|
begin
|
|
|
- detail = fetch_detail(uuid, jar, token, hdr) if uuid
|
|
|
- rescue StandardError => e
|
|
|
- Log.warn "scraper", "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
|
|
|
- end
|
|
|
+ # derive council table from reference number (e.g. PLN-HOB-xxxx)
|
|
|
+ table = Util.ref_to_table(ref)
|
|
|
+ council_name = Util.ref_to_folder(ref).downcase
|
|
|
+ DB.ensure_table!(table)
|
|
|
+
|
|
|
+ # fetch detail
|
|
|
+ detail = {}
|
|
|
+ begin
|
|
|
+ detail = fetch_detail(uuid, jar, token, hdr) if uuid
|
|
|
+ rescue StandardError => e
|
|
|
+ Log.warn "planbuild", "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
|
|
|
+ end
|
|
|
|
|
|
- puts "Council: #{table}"
|
|
|
- puts "DETAIL for #{ref}: keys=#{detail.keys}"
|
|
|
- if detail["attachments"]&.any?
|
|
|
- puts "Attachments: #{detail['attachments'].map { |a| "id=#{a['id']}, title=#{a['documentTitle']}" }}"
|
|
|
- else
|
|
|
- puts "Attachments: none"
|
|
|
- end
|
|
|
+ Log.debug "planbuild", "#{ref} -> #{table}, detail keys: #{detail.keys.join(", ")}"
|
|
|
|
|
|
- # --- handle attachments ---
|
|
|
- # --- handle attachments ---
|
|
|
- saved_paths = []
|
|
|
- if DOWNLOAD_ATTACHMENTS && uuid && detail["attachments"]&.any?
|
|
|
- dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
|
|
|
- FileUtils.mkdir_p(dir)
|
|
|
+ # handle attachments
|
|
|
+ saved_paths = []
|
|
|
+ if DOWNLOAD_ATTACHMENTS && uuid && detail["attachments"]&.any?
|
|
|
+ dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
|
|
|
+ FileUtils.mkdir_p(dir)
|
|
|
|
|
|
- (detail["attachments"] || []).each do |att|
|
|
|
- att_id = att["id"]
|
|
|
- title = att["documentTitle"]
|
|
|
+ (detail["attachments"] || []).each do |att|
|
|
|
+ att_id = att["id"]
|
|
|
+ title = att["documentTitle"].to_s.gsub(/[^\w\-.]+/, "_")
|
|
|
|
|
|
- pdf_url = "#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}"
|
|
|
- path = File.join(dir, "#{title.gsub(/[^\w\-.]+/, '_')}.pdf")
|
|
|
+ pdf_url = "#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}"
|
|
|
+ path = File.join(dir, "#{title}.pdf")
|
|
|
|
|
|
- uri = URI(pdf_url)
|
|
|
- req = Net::HTTP::Get.new(uri)
|
|
|
- req["Cookie"] = cookie_header(jar)
|
|
|
- req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
|
|
|
+ att_uri = URI(pdf_url)
|
|
|
+ att_req = Net::HTTP::Get.new(att_uri)
|
|
|
+ att_req["Cookie"] = cookie_header(jar)
|
|
|
+ att_req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
|
|
|
|
|
|
- res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
|
|
|
- File.binwrite(path, res.body)
|
|
|
- saved_paths << path
|
|
|
+ att_res = Net::HTTP.start(att_uri.host, att_uri.port, use_ssl: true) { |h| h.request(att_req) }
|
|
|
+ File.binwrite(path, att_res.body)
|
|
|
+ saved_paths << path
|
|
|
+ rescue StandardError => e
|
|
|
+ Log.warn "planbuild", "Attachment download failed for #{ref} att #{att["id"]}: #{e.class} #{e.message}"
|
|
|
+ end
|
|
|
end
|
|
|
|
|
|
- # store first PDF relative path in DB
|
|
|
- if saved_paths.any?
|
|
|
- first_web_rel = saved_paths.first.sub(DOWNLOAD_DIR, "/files")
|
|
|
- DB.client.prepare("UPDATE `#{table}` SET local_document_url = ? WHERE council_reference = ?")
|
|
|
- .execute(first_web_rel, ref)
|
|
|
- end
|
|
|
- end
|
|
|
+ local_url = saved_paths.empty? ? nil : saved_paths.first.sub(DOWNLOAD_DIR, "/files")
|
|
|
+
|
|
|
+ # upsert
|
|
|
+ DB.upsert(table, {
|
|
|
+ description: desc,
|
|
|
+ date_received: start,
|
|
|
+ date_received_raw: start&.strftime("%Y-%m-%d"),
|
|
|
+ on_notice_to: fin,
|
|
|
+ on_notice_to_raw: fin&.strftime("%Y-%m-%d"),
|
|
|
+ address: addr[0, 255],
|
|
|
+ council_reference: ref[0, 100],
|
|
|
+ applicant: detail["applicant"],
|
|
|
+ owner: detail["owner"],
|
|
|
+ local_document_url: local_url
|
|
|
+ })
|
|
|
|
|
|
+ enrich_after_upsert!(
|
|
|
+ table: table,
|
|
|
+ council_reference: ref,
|
|
|
+ address: addr
|
|
|
+ )
|
|
|
|
|
|
+ Log.info "planbuild", "Upserted #{ref} -> #{addr} into #{table} (PDFs: #{saved_paths.length})"
|
|
|
|
|
|
- # geocode
|
|
|
- geo = nil
|
|
|
- begin
|
|
|
- geo = Geocode.format_au(addr)
|
|
|
rescue StandardError => e
|
|
|
- Log.warn "scraper", "Geocode error for #{ref}: #{e.class} #{e.message}"
|
|
|
+ Log.warn "planbuild", "Skipping #{ref}: #{e.class} #{e.message}"
|
|
|
end
|
|
|
-
|
|
|
- # --- upsert into DB ---
|
|
|
- DB.upsert(table, {
|
|
|
- description: desc,
|
|
|
- date_received: start,
|
|
|
- date_received_raw: start&.strftime("%Y-%m-%d"),
|
|
|
- on_notice_to: fin,
|
|
|
- on_notice_to_raw: fin&.strftime("%Y-%m-%d"),
|
|
|
- address: addr[0,255],
|
|
|
- council_reference: ref[0,100],
|
|
|
- applicant: detail["applicant"],
|
|
|
- owner: detail["owner"],
|
|
|
- local_document_url: saved_paths.join(", ")
|
|
|
- })
|
|
|
-
|
|
|
- enrich_after_upsert!(
|
|
|
- table: table,
|
|
|
- council_reference: ref,
|
|
|
- address: addr
|
|
|
- )
|
|
|
-
|
|
|
- puts "Upserted #{ref} -> #{addr} into #{table}, PDFs: #{saved_paths.length}"
|
|
|
end
|
|
|
|
|
|
puts "Done #{TABLE}."
|