|
@@ -11,16 +11,41 @@
|
|
|
|
|
|
|
|
require "nokogiri"
|
|
require "nokogiri"
|
|
|
require "uri"
|
|
require "uri"
|
|
|
|
|
+require "fileutils"
|
|
|
|
|
|
|
|
require_relative "../lib/scraper_helpers"
|
|
require_relative "../lib/scraper_helpers"
|
|
|
require_relative "../lib/util"
|
|
require_relative "../lib/util"
|
|
|
require_relative "../lib/log"
|
|
require_relative "../lib/log"
|
|
|
|
|
|
|
|
-TABLE = ENV.fetch("TABLE_NAME")
|
|
|
|
|
-URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
|
|
|
|
|
|
|
+TABLE = ENV.fetch("TABLE_NAME")
|
|
|
|
|
+URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
|
|
|
|
|
+DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
|
|
|
|
|
+DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
|
|
|
|
|
|
|
|
DB.ensure_table!(TABLE)
|
|
DB.ensure_table!(TABLE)
|
|
|
|
|
|
|
|
|
|
# Sanitises an arbitrary value into a single filesystem-safe path component.
#
# Every run of characters other than ASCII word characters, "-" and "." is
# collapsed into one "_". A result made up solely of dots ("." or "..") is
# rewritten with underscores so the return value can never be read as the
# current or parent directory once it is joined into a path.
#
# @param s [#to_s] value to sanitise; nil becomes ""
# @return [String] sanitised name (may be empty)
def safe_name(s)
  name = s.to_s.gsub(/[^\w\-.]+/, "_")
  name.match?(/\A\.+\z/) ? name.tr(".", "_") : name
end
|
|
|
|
|
+
|
|
|
|
|
# Downloads the document at +url+ and stores it under
# DOWNLOAD_DIR/northernmidlands/<sanitised council_reference>/.
#
# Best-effort: any StandardError raised while resolving the path, fetching or
# writing is reported through Log.warn and turned into a nil return, so a
# single bad attachment does not abort the caller.
#
# @param url [String, nil] remote document URL; blank values are skipped
# @param council_reference [#to_s] application reference, used as folder name
# @return [String, nil] "/downloads/northernmidlands/<ref>/<file>" for the
#   saved copy, or nil when nothing was saved
def download_pdf(url, council_reference)
  # Trim once and use the trimmed value everywhere: URI.parse rejects
  # surrounding whitespace even though the URL itself is fine.
  url = url.to_s.strip
  return nil if url.empty?

  # A path component without a single letter or digit ("", ".", "..", "_")
  # carries no information and may even point at the current/parent directory.
  usable = ->(component) { component.match?(/[A-Za-z0-9]/) }

  # Sanitise the reference once so the directory on disk and the returned URL
  # always agree.
  ref = safe_name(council_reference)
  ref = "unknown" unless usable.call(ref)
  dir = File.join(DOWNLOAD_DIR, "northernmidlands", ref)

  # URI#path can be nil for opaque URIs, hence the to_s.
  fname = safe_name(File.basename(URI.parse(url).path.to_s))
  fname = "document.pdf" unless usable.call(fname)

  body = Http.get(url)
  # NOTE(review): assumes Http.get returns the response body as a String — confirm.
  if body.to_s.empty?
    # Do not store a zero-byte file and advertise it as a local copy.
    Log.warn "northernmidlands", "Download failed for #{url}: empty response body"
    return nil
  end

  # Create the directory only once there is something to write, so failed
  # downloads do not leave empty folders behind.
  FileUtils.mkdir_p(dir)
  File.binwrite(File.join(dir, fname), body)
  puts "  saved #{fname} (#{body.bytesize} bytes)"

  "/downloads/northernmidlands/#{ref}/#{fname}"
rescue StandardError => e
  Log.warn "northernmidlands", "Download failed for #{url}: #{e.class} #{e.message}"
  nil
end
|
|
|
|
|
+
|
|
|
REF_RX = /\bPLN-\d{2}-\d{4}\b/i
|
|
REF_RX = /\bPLN-\d{2}-\d{4}\b/i
|
|
|
|
|
|
|
|
html = Http.get(URL)
|
|
html = Http.get(URL)
|
|
@@ -81,6 +106,8 @@ end
|
|
|
puts "Found #{items.length} item(s) for #{TABLE}"
|
|
puts "Found #{items.length} item(s) for #{TABLE}"
|
|
|
|
|
|
|
|
items.each do |r|
|
|
items.each do |r|
|
|
|
|
|
+ local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil
|
|
|
|
|
+
|
|
|
upsert_and_enrich!(
|
|
upsert_and_enrich!(
|
|
|
table: TABLE,
|
|
table: TABLE,
|
|
|
row: {
|
|
row: {
|
|
@@ -94,7 +121,8 @@ items.each do |r|
|
|
|
owner: ""
|
|
owner: ""
|
|
|
},
|
|
},
|
|
|
extras: {
|
|
extras: {
|
|
|
- document_url: r[:document_url]
|
|
|
|
|
|
|
+ document_url: r[:document_url],
|
|
|
|
|
+ local_document_url: local_url
|
|
|
}
|
|
}
|
|
|
)
|
|
)
|
|
|
end
|
|
end
|