|
|
@@ -2,26 +2,14 @@
|
|
|
|
|
|
require "nokogiri"
|
|
|
require_relative "../lib/http"
|
|
|
-require_relative "../lib/db"
|
|
|
require_relative "../lib/util"
|
|
|
-require_relative "../lib/enrich"
|
|
|
+require_relative "../lib/scraper_helpers"
|
|
|
|
|
|
TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_glamorgan
|
|
|
URL = "https://gsbc.tas.gov.au/services-facilities/public-notices/"
|
|
|
|
|
|
DB.ensure_table!(TABLE)
|
|
|
|
|
|
-# Optional column to store the PDF or document link
|
|
|
-begin
|
|
|
- DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url VARCHAR(1024) NULL")
|
|
|
-rescue => e
|
|
|
- warn "document_url add skipped: #{e.class} #{e.message}"
|
|
|
-end
|
|
|
-
|
|
|
-def text_or(node, default = "")
|
|
|
- node ? node.text.strip : default
|
|
|
-end
|
|
|
-
|
|
|
# Try to extract a reference from visible text or file names
|
|
|
REF_RX = %r{(DA|PLA|APP|APPLICATION)\s*([0-9]{4})\s*/\s*([A-Za-z0-9\-_.]+)}i
|
|
|
def extract_reference(str)
|
|
|
@@ -32,13 +20,6 @@ def extract_reference(str)
|
|
|
nil
|
|
|
end
|
|
|
|
|
|
-def safe_abs(base, href)
|
|
|
- return "" if href.to_s.strip.empty?
|
|
|
- URI.join(base, href).to_s
|
|
|
-rescue URI::InvalidURIError
|
|
|
- href.to_s
|
|
|
-end
|
|
|
-
|
|
|
html = Http.get(URL)
|
|
|
doc = Nokogiri::HTML(html)
|
|
|
|
|
|
@@ -70,7 +51,7 @@ rows.each_with_index do |row, idx|
|
|
|
application_raw = text_or(tds[2])
|
|
|
on_notice_to_raw = text_or(tds[3])
|
|
|
link_el = tds[4]&.at_css("a")
|
|
|
- document_url = link_el ? safe_abs(URL, link_el["href"]) : ""
|
|
|
+ document_url = link_el ? abs_url(URL, link_el["href"]) : ""
|
|
|
|
|
|
date_received = Util.parse_aus_date(application_raw)
|
|
|
on_notice_to = Util.parse_aus_date(on_notice_to_raw)
|
|
|
@@ -87,30 +68,19 @@ rows.each_with_index do |row, idx|
|
|
|
|
|
|
next if address.empty? || council_reference.empty?
|
|
|
|
|
|
- DB.upsert(TABLE, {
|
|
|
- description: description,
|
|
|
- date_received: date_received,
|
|
|
- date_received_raw: application_raw,
|
|
|
- address: address,
|
|
|
- council_reference: council_reference,
|
|
|
- applicant: "",
|
|
|
- owner: ""
|
|
|
- })
|
|
|
-
|
|
|
- enrich_after_upsert!(
|
|
|
+ upsert_and_enrich!(
|
|
|
table: TABLE,
|
|
|
- council_reference: council_reference,
|
|
|
- address: address
|
|
|
+ row: {
|
|
|
+ description: description,
|
|
|
+ date_received: date_received,
|
|
|
+ date_received_raw: application_raw,
|
|
|
+ address: address,
|
|
|
+ council_reference: council_reference,
|
|
|
+ applicant: "",
|
|
|
+ owner: ""
|
|
|
+ },
|
|
|
+ extras: { document_url: document_url }
|
|
|
)
|
|
|
-
|
|
|
- begin
|
|
|
- upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?")
|
|
|
- upd.execute(document_url, council_reference, address)
|
|
|
- rescue Mysql2::Error => e
|
|
|
- warn "[glamorgan] db update skipped for #{council_reference}: #{e.message}"
|
|
|
- end
|
|
|
-
|
|
|
- puts "Upserted #{council_reference} -> #{address}"
|
|
|
saved += 1
|
|
|
end
|
|
|
|