Ver Fonte

Planbuild Update

Benjamin Harris há 2 meses
pai
commit
c4d0f4d4b9
1 ficheiro alterado com 67 adições e 77 exclusões
  1. 67 77
      scrapers/planbuild.rb

+ 67 - 77
scrapers/planbuild.rb

@@ -4,13 +4,15 @@ require "json"
 require "nokogiri"
 require "uri"
 require "net/http"
-require "open-uri"
+require "zlib"
+require "stringio"
+require "fileutils"
 require_relative "../lib/http"
 require_relative "../lib/db"
+require_relative "../lib/log"
 require_relative "../lib/util"
 require_relative "../lib/geocode"
 require_relative "../lib/enrich"
-require "fileutils"
 
 TABLE = ENV.fetch("TABLE_NAME")
 BASE  = "https://portal.planbuild.tas.gov.au"
@@ -68,7 +70,7 @@ def fetch_list
     [items, jar, token, hdr]
 end
 
-# --- fetch details ---
+# --- fetch details — always returns a Hash ---
 def fetch_detail(uuid, jar, token, hdr)
     uri = URI("#{BASE}/external/advertisement/#{uuid}/get")
     req = Net::HTTP::Get.new(uri)
@@ -79,9 +81,13 @@ def fetch_detail(uuid, jar, token, hdr)
 
     res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
 
-    # decompress if gzip
-    body = res['Content-Encoding'] == 'gzip' ? Zlib::GzipReader.new(StringIO.new(res.body)).read : res.body
-    JSON.parse(body) rescue {}
+    body = res["Content-Encoding"] == "gzip" \
+        ? Zlib::GzipReader.new(StringIO.new(res.body)).read \
+        : res.body
+    parsed = JSON.parse(body)
+    parsed.is_a?(Hash) ? parsed : {}
+rescue JSON::ParserError, Zlib::Error
+    {}
 end
 
 
@@ -100,91 +106,75 @@ items.each do |r|
 
     next if ref.to_s.strip.empty? || addr.to_s.strip.empty?
 
-    # derive council code & table
-    council_code = ref.split("-")[1].to_s.upcase   # e.g. PLN-HOB-xxxx → HOB
-    table        = Util.ref_to_table(ref)
-    council_name = Util.ref_to_folder(ref).downcase # use for file path
-    DB.ensure_table!(table)
-
-    # --- fetch detail page ---
-    detail = {}
     begin
-        detail = fetch_detail(uuid, jar, token, hdr) if uuid
-    rescue StandardError => e
-        Log.warn "scraper", "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
-    end
+        # derive council table from reference number (e.g. PLN-HOB-xxxx)
+        table        = Util.ref_to_table(ref)
+        council_name = Util.ref_to_folder(ref).downcase
+        DB.ensure_table!(table)
+
+        # fetch detail
+        detail = {}
+        begin
+            detail = fetch_detail(uuid, jar, token, hdr) if uuid
+        rescue StandardError => e
+            Log.warn "planbuild", "Detail fetch failed for #{ref}: #{e.class} #{e.message}"
+        end
 
-    puts "Council: #{table}"
-    puts "DETAIL for #{ref}: keys=#{detail.keys}"
-    if detail["attachments"]&.any?
-        puts "Attachments: #{detail['attachments'].map { |a| "id=#{a['id']}, title=#{a['documentTitle']}" }}"
-    else
-        puts "Attachments: none"
-    end
+        Log.debug "planbuild", "#{ref} -> #{table}, detail keys: #{detail.keys.join(", ")}"
 
-    # --- handle attachments ---
-    # --- handle attachments ---
-    saved_paths = []
-    if DOWNLOAD_ATTACHMENTS && uuid && detail["attachments"]&.any?
-        dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
-        FileUtils.mkdir_p(dir)
+        # handle attachments
+        saved_paths = []
+        if DOWNLOAD_ATTACHMENTS && uuid && detail["attachments"]&.any?
+            dir = File.join(DOWNLOAD_DIR, council_name, ref.gsub(/[^0-9a-zA-Z_-]/, "_"))
+            FileUtils.mkdir_p(dir)
 
-        (detail["attachments"] || []).each do |att|
-            att_id = att["id"]
-            title  = att["documentTitle"]
+            (detail["attachments"] || []).each do |att|
+                att_id = att["id"]
+                title  = att["documentTitle"].to_s.gsub(/[^\w\-.]+/, "_")
 
-            pdf_url = "#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}"
-            path    = File.join(dir, "#{title.gsub(/[^\w\-.]+/, '_')}.pdf")
+                pdf_url = "#{BASE}/external/advertisement/#{uuid}/attachment/#{att_id}"
+                path    = File.join(dir, "#{title}.pdf")
 
-            uri = URI(pdf_url)
-            req = Net::HTTP::Get.new(uri)
-            req["Cookie"]  = cookie_header(jar)
-            req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
+                att_uri = URI(pdf_url)
+                att_req = Net::HTTP::Get.new(att_uri)
+                att_req["Cookie"]  = cookie_header(jar)
+                att_req["Referer"] = "#{BASE}/external/advertisement/#{uuid}"
 
-            res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
-            File.binwrite(path, res.body)
-            saved_paths << path
+                att_res = Net::HTTP.start(att_uri.host, att_uri.port, use_ssl: true) { |h| h.request(att_req) }
+                File.binwrite(path, att_res.body)
+                saved_paths << path
+            rescue StandardError => e
+                Log.warn "planbuild", "Attachment download failed for #{ref} att #{att["id"]}: #{e.class} #{e.message}"
+            end
         end
 
-        # store first PDF relative path in DB
-        if saved_paths.any?
-            first_web_rel = saved_paths.first.sub(DOWNLOAD_DIR, "/files")
-            DB.client.prepare("UPDATE `#{table}` SET local_document_url = ? WHERE council_reference = ?")
-            .execute(first_web_rel, ref)
-        end
-    end
+        local_url = saved_paths.empty? ? nil : saved_paths.first.sub(DOWNLOAD_DIR, "/files")
+
+        # upsert
+        DB.upsert(table, {
+            description:        desc,
+            date_received:      start,
+            date_received_raw:  start&.strftime("%Y-%m-%d"),
+            on_notice_to:       fin,
+            on_notice_to_raw:   fin&.strftime("%Y-%m-%d"),
+            address:            addr[0, 255],
+            council_reference:  ref[0, 100],
+            applicant:          detail["applicant"],
+            owner:              detail["owner"],
+            local_document_url: local_url
+        })
 
+        enrich_after_upsert!(
+            table:             table,
+            council_reference: ref,
+            address:           addr
+        )
 
+        Log.info "planbuild", "Upserted #{ref} -> #{addr} into #{table} (PDFs: #{saved_paths.length})"
 
-    # geocode
-    geo = nil
-    begin
-        geo = Geocode.format_au(addr)
     rescue StandardError => e
-        Log.warn "scraper", "Geocode error for #{ref}: #{e.class} #{e.message}"
+        Log.warn "planbuild", "Skipping #{ref}: #{e.class} #{e.message}"
     end
-
-    # --- upsert into DB ---
-    DB.upsert(table, {
-        description:       desc,
-        date_received:     start,
-        date_received_raw: start&.strftime("%Y-%m-%d"),
-        on_notice_to:      fin,
-        on_notice_to_raw:  fin&.strftime("%Y-%m-%d"),
-        address:           addr[0,255],
-        council_reference: ref[0,100],
-        applicant:         detail["applicant"],
-        owner:             detail["owner"],
-        local_document_url:      saved_paths.join(", ")
-        })
-
-    enrich_after_upsert!(
-        table: table,
-        council_reference: ref,
-        address: addr
-        )
-
-    puts "Upserted #{ref} -> #{addr} into #{table}, PDFs: #{saved_paths.length}"
 end
 
 puts "Done #{TABLE}."