Ver Fonte

northern mid uploads

Benjamin Harris há 2 meses atrás
pai
commit
66860a3b0b
1 ficheiros alterados com 31 adições e 3 exclusões
  1. 31 3
      scrapers/northernmidlands.rb

+ 31 - 3
scrapers/northernmidlands.rb

@@ -11,16 +11,41 @@
 
 require "nokogiri"
 require "uri"
+require "fileutils"
 
 require_relative "../lib/scraper_helpers"
 require_relative "../lib/util"
 require_relative "../lib/log"
 
-TABLE = ENV.fetch("TABLE_NAME")
-URL   = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
+TABLE                = ENV.fetch("TABLE_NAME")  # ENV.fetch raises KeyError when TABLE_NAME is unset
+URL                  = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
+DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"  # true only when the variable is exactly "1"
+DOWNLOAD_DIR         = ENV["DOWNLOAD_DIR"] || "/app/downloads"  # falls back to "/app/downloads" when the variable is unset
 
 DB.ensure_table!(TABLE)
 
+def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")  # Stringify s, then replace each run of characters other than word chars, "-" and "." with one "_".
+
+def download_pdf(url, council_reference)  # Fetch url, save it under DOWNLOAD_DIR, and return a "/downloads/..." path string; nil when url is blank or anything fails.
+  return nil if url.to_s.strip.empty?  # nil or whitespace-only url: nothing to download
+
+  dir = File.join(DOWNLOAD_DIR, "northernmidlands", safe_name(council_reference))  # one sub-directory per sanitized council reference
+  FileUtils.mkdir_p(dir)  # creates missing parent directories; no error if dir already exists
+
+  fname = safe_name(File.basename(URI.parse(url).path))  # file name is the sanitized last segment of the URL path
+  fname = "document.pdf" if fname.empty?  # fallback when the URL path yields no file name
+  path  = File.join(dir, fname)
+
+  body = Http.get(url)  # Http is defined outside this view; presumably returns the response body as a String - TODO confirm
+  File.binwrite(path, body)  # binary-mode write; an existing file at path is overwritten
+  puts "  saved #{fname} (#{body.bytesize} bytes)"
+
+  "/downloads/northernmidlands/#{safe_name(council_reference)}/#{fname}"  # return value uses a hard-coded "/downloads" prefix, independent of DOWNLOAD_DIR
+rescue StandardError => e  # best-effort: any StandardError raised above is logged as a warning and turned into nil
+  Log.warn "northernmidlands", "Download failed for #{url}: #{e.class} #{e.message}"
+  nil
+end
+
 REF_RX = /\bPLN-\d{2}-\d{4}\b/i
 
 html = Http.get(URL)
@@ -81,6 +106,8 @@ end
 puts "Found #{items.length} item(s) for #{TABLE}"
 
 items.each do |r|
+  local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil
+
   upsert_and_enrich!(
     table: TABLE,
     row: {
@@ -94,7 +121,8 @@ items.each do |r|
       owner:             ""
     },
     extras: {
-      document_url: r[:document_url]
+      document_url:       r[:document_url],
+      local_document_url: local_url
     }
   )
 end