|
@@ -1,340 +1,136 @@
|
|
|
-# scrapers/dorset.rb
|
|
|
|
|
|
|
+# Dorset Council — Advertised Development Applications
|
|
|
|
|
+#
|
|
|
|
|
+# Source: https://www.dorset.tas.gov.au/online-development-application-enquiry
|
|
|
|
|
+#
|
|
|
|
|
+# Page structure — each application is a <p><a href="PDF_URL">text</a></p>:
|
|
|
|
|
+#
|
|
|
|
|
+# PLA/2026/22: Residential dwelling and carport addition - Chris Triebe
|
|
|
|
|
+# and Associates Town Planning Services - 13 Gladstone Road
|
|
|
|
|
+# Herrick - Closes 18.04.2026
|
|
|
|
|
+#
|
|
|
|
|
+# Text format: REF: DESCRIPTION - APPLICANT - ADDRESS - Closes DD.MM.YYYY
|
|
|
|
|
+#
|
|
|
|
|
+# Note: the old eServices portal (eservices.dorset.tas.gov.au) is still live
|
|
|
|
|
+# and was the previous data source. The council now publishes the advertised
|
|
|
|
|
+# list on their main website with direct PDF links, which is simpler to scrape.
|
|
|
|
|
+
|
|
|
require "date"
|
|
require "date"
|
|
|
require "nokogiri"
|
|
require "nokogiri"
|
|
|
require "uri"
|
|
require "uri"
|
|
|
require "fileutils"
|
|
require "fileutils"
|
|
|
|
|
|
|
|
-require_relative "../lib/enrich"
|
|
|
|
|
-require_relative "../lib/log"
|
|
|
|
|
|
|
+require_relative "../lib/scraper_helpers"
|
|
|
require_relative "../lib/util"
|
|
require_relative "../lib/util"
|
|
|
-TABLE = ENV.fetch("TABLE_NAME")
|
|
|
|
|
-BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
|
|
|
|
|
-BASE_HTTP = "http://eservices.dorset.tas.gov.au"
|
|
|
|
|
-
|
|
|
|
|
-# Pick one
|
|
|
|
|
-LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
|
|
|
|
|
-#LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"
|
|
|
|
|
|
|
+require_relative "../lib/log"
|
|
|
|
|
|
|
|
|
|
# Destination table name. ENV.fetch raises KeyError when TABLE_NAME is unset,
# so a misconfigured run stops here rather than part-way through.
TABLE = ENV.fetch("TABLE_NAME")
# Council page that lists the currently advertised applications.
URL = "https://www.dorset.tas.gov.au/online-development-application-enquiry"
# PDFs are downloaded only when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
# Root directory for downloaded files; falls back to /app/downloads.
DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"

# NOTE(review): DB is defined outside this file; presumably this creates the
# table when it does not exist yet — confirm against lib.
DB.ensure_table!(TABLE)
|
|
|
|
|
|
|
|
-def abs_url(href)
|
|
|
|
|
- return "" if href.to_s.strip.empty?
|
|
|
|
|
- URI.join(BASE_HTTPS, href).to_s
|
|
|
|
|
-rescue URI::InvalidURIError
|
|
|
|
|
- href.to_s
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-def dorset_get(jar, url)
|
|
|
|
|
- [BASE_HTTPS, BASE_HTTP].each do |base|
|
|
|
|
|
- begin
|
|
|
|
|
- Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
|
|
|
|
|
- Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
|
|
|
|
|
- tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
|
|
|
|
|
- res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
|
|
|
|
|
- if res.is_a?(Net::HTTPRedirection) && res["location"]
|
|
|
|
|
- res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
|
|
|
|
|
- end
|
|
|
|
|
- return res if res.is_a?(Net::HTTPSuccess)
|
|
|
|
|
- rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
|
|
|
|
|
- next
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
- raise "Dorset fetch failed for #{url}"
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-def parse_list(html)
|
|
|
|
|
- doc = Nokogiri::HTML(html)
|
|
|
|
|
- out = []
|
|
|
|
|
- doc.css("h4.non_table_headers a").each do |a|
|
|
|
|
|
- address = a.text.to_s.strip
|
|
|
|
|
- href = a["href"].to_s
|
|
|
|
|
- entry = a.ancestors("h4").first&.next_element
|
|
|
|
|
-
|
|
|
|
|
- description = ""
|
|
|
|
|
- date_received_raw = ""
|
|
|
|
|
- council_reference = ""
|
|
|
|
|
- applicant = ""
|
|
|
|
|
- owner = ""
|
|
|
|
|
-
|
|
|
|
|
- if entry
|
|
|
|
|
- entry.css(".rowDataOnly").each do |p|
|
|
|
|
|
- spans = p.css("span")
|
|
|
|
|
- next unless spans.length == 2
|
|
|
|
|
- key = spans[0].text.to_s.strip
|
|
|
|
|
- val = spans[1].text.to_s.strip
|
|
|
|
|
- case key
|
|
|
|
|
- when "Type of Work" then description = val
|
|
|
|
|
- when "Date Lodged" then date_received_raw = val
|
|
|
|
|
- when "Application No." then council_reference = val
|
|
|
|
|
- when "Applicant" then applicant = val
|
|
|
|
|
- when "Owner" then owner = val
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
-
|
|
|
|
|
- lodged_dt = Util.parse_aus_date(date_received_raw)
|
|
|
|
|
- on_to_dt = lodged_dt ? (lodged_dt + 14) : nil
|
|
|
|
|
-
|
|
|
|
|
- out << {
|
|
|
|
|
- address: address,
|
|
|
|
|
- detail_href: href,
|
|
|
|
|
- description: description.empty? ? "Development Application" : description,
|
|
|
|
|
- date_received_raw: date_received_raw,
|
|
|
|
|
- date_received: lodged_dt,
|
|
|
|
|
- on_notice_to: on_to_dt,
|
|
|
|
|
- on_notice_to_raw: on_to_dt ? on_to_dt.strftime("%Y-%m-%d") : "",
|
|
|
|
|
- council_reference: council_reference,
|
|
|
|
|
- applicant: applicant,
|
|
|
|
|
- owner: owner
|
|
|
|
|
- }
|
|
|
|
|
- end
|
|
|
|
|
- out
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-def extract_doc_links(detail_html)
|
|
|
|
|
- doc = Nokogiri::HTML(detail_html)
|
|
|
|
|
- links = []
|
|
|
|
|
- t = doc.css('table[summary]').find { |tbl|
|
|
|
|
|
- tbl["summary"].to_s.downcase.include?("electronic document")
|
|
|
|
|
- }
|
|
|
|
|
- if t
|
|
|
|
|
- links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s }
|
|
|
|
|
- end
|
|
|
|
|
- links = doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
|
|
|
|
|
- links.map { |h| abs_url(h) }.uniq
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-# --- new: parse the tasks/milestones table ---
|
|
|
|
|
-def parse_tasks(detail_html)
|
|
|
|
|
- doc = Nokogiri::HTML(detail_html)
|
|
|
|
|
- t = doc.css('table[summary]').find { |tbl|
|
|
|
|
|
- tbl["summary"].to_s.downcase.include?("tasks associated")
|
|
|
|
|
- }
|
|
|
|
|
- return [] unless t
|
|
|
|
|
-
|
|
|
|
|
- out = []
|
|
|
|
|
- t.css("tr")[1..]&.each do |tr|
|
|
|
|
|
- tds = tr.css("td")
|
|
|
|
|
- next if tds.empty?
|
|
|
|
|
- stage_desc = tds[1]&.text.to_s.strip
|
|
|
|
|
- opened_raw = tds[2]&.text.to_s.strip
|
|
|
|
|
- target_raw = tds[3]&.text.to_s.strip
|
|
|
|
|
- completed_raw = tds[4]&.text.to_s.strip
|
|
|
|
|
- status = tds[5]&.text.to_s.strip
|
|
|
|
|
-
|
|
|
|
|
- out << {
|
|
|
|
|
- stage_description: stage_desc,
|
|
|
|
|
- opened_raw: opened_raw,
|
|
|
|
|
- opened_date: Util.parse_aus_date(opened_raw),
|
|
|
|
|
- target_raw: target_raw,
|
|
|
|
|
- target_date: Util.parse_aus_date(target_raw),
|
|
|
|
|
- completed_raw: completed_raw,
|
|
|
|
|
- completed_date: Util.parse_aus_date(completed_raw),
|
|
|
|
|
- status: status
|
|
|
|
|
- }
|
|
|
|
|
- end
|
|
|
|
|
- out
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-def ensure_stages_table!(table)
|
|
|
|
|
- tn = "#{table}_stages"
|
|
|
|
|
- DB.client.query(<<~SQL)
|
|
|
|
|
- CREATE TABLE IF NOT EXISTS `#{DB.client.escape(tn)}` (
|
|
|
|
|
- id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
|
|
|
|
|
- council_reference VARCHAR(100) NOT NULL,
|
|
|
|
|
- address VARCHAR(255) NOT NULL,
|
|
|
|
|
- stage_description VARCHAR(255) NOT NULL,
|
|
|
|
|
- opened_date DATE NULL,
|
|
|
|
|
- opened_raw VARCHAR(50) NULL,
|
|
|
|
|
- target_date DATE NULL,
|
|
|
|
|
- target_raw VARCHAR(50) NULL,
|
|
|
|
|
- completed_date DATE NULL,
|
|
|
|
|
- completed_raw VARCHAR(50) NULL,
|
|
|
|
|
- status VARCHAR(100) NULL,
|
|
|
|
|
- created_at DATETIME NOT NULL,
|
|
|
|
|
- updated_at DATETIME NOT NULL,
|
|
|
|
|
- PRIMARY KEY (id),
|
|
|
|
|
- UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
|
|
|
|
|
- ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
|
|
|
|
|
- SQL
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-def save_stages(table, ref, addr, stages)
|
|
|
|
|
- return if stages.empty?
|
|
|
|
|
- tn = "#{table}_stages"
|
|
|
|
|
- ensure_stages_table!(table)
|
|
|
|
|
-
|
|
|
|
|
- DB.client.prepare("DELETE FROM `#{DB.client.escape(tn)}` WHERE council_reference = ? AND address = ?")
|
|
|
|
|
- .execute(ref, addr)
|
|
|
|
|
-
|
|
|
|
|
- ins = DB.client.prepare(<<~SQL)
|
|
|
|
|
- INSERT INTO `#{DB.client.escape(tn)}`
|
|
|
|
|
- (council_reference, address, stage_description,
|
|
|
|
|
- opened_date, opened_raw, target_date, target_raw,
|
|
|
|
|
- completed_date, completed_raw, status, created_at, updated_at)
|
|
|
|
|
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
|
|
|
|
|
- SQL
|
|
|
|
|
-
|
|
|
|
|
- stages.each do |s|
|
|
|
|
|
- ins.execute(
|
|
|
|
|
- ref, addr, s[:stage_description][0,255],
|
|
|
|
|
- s[:opened_date], s[:opened_raw][0,50],
|
|
|
|
|
- s[:target_date], s[:target_raw][0,50],
|
|
|
|
|
- s[:completed_date], s[:completed_raw][0,50],
|
|
|
|
|
- s[:status][0,100]
|
|
|
|
|
- )
|
|
|
|
|
- end
|
|
|
|
|
-end
|
|
|
|
|
|
|
# Council reference, e.g. "PLA/2026/22" (matched case-insensitively).
REF_RX   = /\bPLA\/\d{4}\/\d+\b/i
# Closing date, e.g. "Closes 18.04.2026". Capture 1 is the raw date; both "."
# and "-" are accepted as separators, so callers must normalise before parsing.
CLOSE_RX = /\bCloses\s+(\d{1,2}[.\-]\d{1,2}[.\-]\d{4})\b/i
|
|
|
|
|
|
|
|
# Turns an arbitrary value into a string usable as a single path component.
#
# Every run of characters other than word characters, "-" and "." is replaced
# by one underscore. A result made up solely of dots ("." or "..") would be
# interpreted as the current/parent directory when joined into a path, so its
# dots are replaced with underscores as well.
#
# @param s [#to_s] value to sanitise (nil becomes "")
# @return [String] sanitised name; may be empty when the input is empty
def safe_name(s)
  cleaned = s.to_s.gsub(/[^\w\-.]+/, "_")
  cleaned.match?(/\A\.+\z/) ? cleaned.tr(".", "_") : cleaned
end
|
|
|
|
|
|
|
|
-def id_from_url(u)
|
|
|
|
|
- uri = URI.parse(u)
|
|
|
|
|
- q = uri.query.to_s
|
|
|
|
|
- q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
|
|
|
|
|
-rescue URI::InvalidURIError
|
|
|
|
|
- nil
|
|
|
|
|
-end
|
|
|
|
|
|
|
# Downloads one advertised PDF into DOWNLOAD_DIR/dorset/<reference>/.
#
# The file name is taken from the last segment of the URL path (sanitised with
# safe_name); "document.pdf" is used when that segment yields nothing usable.
# Nothing is written, and no directory is created, when the URL cannot be
# parsed or the response body is empty, so a failed download never leaves an
# empty file behind for the returned web path to point at.
#
# @param url [String, nil] absolute URL of the PDF
# @param council_reference [String] reference used for the sub-directory name
# @return [String, nil] web path "/files/dorset/<reference>/<file>" on success,
#   nil when the URL is blank or the download failed (failure is logged)
def download_pdf(url, council_reference)
  return nil if url.to_s.strip.empty?

  # Resolve the file name first: an invalid URL raises here, before any disk I/O.
  fname = safe_name(File.basename(URI.parse(url).path))
  # Empty, ".", ".." and "_" (what a bare "/" path sanitises to) are not real names.
  fname = "document.pdf" if fname.match?(/\A[._]*\z/)

  body = Http.get(url, headers: { "Accept" => "application/pdf,*/*", "Referer" => URL }).to_s
  if body.empty?
    Log.warn "dorset", "Download failed for #{url}: empty response body"
    return nil
  end

  ref_dir = safe_name(council_reference)
  dir = File.join(DOWNLOAD_DIR, "dorset", ref_dir)
  FileUtils.mkdir_p(dir)
  File.binwrite(File.join(dir, fname), body)
  puts "  saved #{fname} (#{body.bytesize} bytes)"

  "/files/dorset/#{ref_dir}/#{fname}"
rescue StandardError => e
  # Best effort: one failed attachment must not abort the whole scrape.
  Log.warn "dorset", "Download failed for #{url}: #{e.class} #{e.message}"
  nil
end
|
|
|
|
|
|
|
|
-puts "Fetching Dorset list…"
|
|
|
|
|
-
|
|
|
|
|
-jar = {}
|
|
|
|
|
-list_res = dorset_get(jar, LIST_URL)
|
|
|
|
|
-list_html = list_res.body
|
|
|
|
|
-list_items = parse_list(list_html)
|
|
|
|
|
-
|
|
|
|
|
-puts "Found #{list_items.length} items for #{TABLE}"
|
|
|
|
|
-
|
|
|
|
|
-list_items.each do |r|
|
|
|
|
|
- next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?
|
|
|
|
|
-
|
|
|
|
|
- detail_url = abs_url(r[:detail_href])
|
|
|
|
|
- doc_urls = []
|
|
|
|
|
- stages = []
|
|
|
|
|
- saved_paths = []
|
|
|
|
|
|
|
# --- Scrape the listing page --------------------------------------------------
# Every <p><a href> whose text contains a PLA reference becomes one item hash.
html = Http.get(URL)
doc = Nokogiri::HTML(html)
items = []

doc.css("p a[href]").each do |a|
  # Collapse whitespace runs so the regexes and the " - " split see single spaces.
  text = a.text.gsub(/[[:space:]]+/, " ").strip
  next unless (ref_m = text.match(REF_RX))

  ref = ref_m[0]

  # Strip "PLA/YYYY/NNN: " prefix
  remainder = text.sub(/\A#{Regexp.escape(ref)}:\s*/i, "")

  # Extract and strip closing date from the end
  close_raw = ""
  on_notice_to = nil
  if (close_m = remainder.match(CLOSE_RX))
    close_raw = close_m[1]
    begin
      # CLOSE_RX accepts "." and "-" separators; normalise to dots so both
      # spellings parse instead of silently turning into nil.
      on_notice_to = Date.strptime(close_raw.tr("-", "."), "%d.%m.%Y")
    rescue ArgumentError
      # Date::Error is an ArgumentError; e.g. "31.02.2026". Keep the raw text.
      Log.warn "dorset", "Unparseable closing date #{close_raw.inspect} for #{ref}"
    end
    # The separating dash is optional so "... Herrick Closes 18.04.2026" does
    # not leave the closing date inside the address.
    remainder = remainder.sub(/[\s-]*#{Regexp.escape(close_m[0])}\s*\z/i, "").strip
  end

  # Remaining text: "Description - Applicant - Address"
  # Split on " - "; last part = address, second-to-last = applicant, rest = description
  parts = remainder.split(/\s+-\s+/).map(&:strip)
  description, applicant, address =
    case parts.length
    when 0, 1 then ["Development Application", "", remainder.strip]
    when 2    then [parts.first, "", parts.last]
    else           [parts[0..-3].join(" - "), parts[-2], parts.last]
    end

  next if address.empty?
  description = "Development Application" if description.empty?

  items << {
    council_reference: ref,
    address: address,
    description: description,
    applicant: applicant,
    on_notice_to: on_notice_to,
    on_notice_to_raw: close_raw,
    # abs_url is not defined in this file — presumably provided by
    # lib/scraper_helpers; it is called with the page URL and the raw href.
    document_url: abs_url(URL, a["href"].to_s.strip)
  }
end

puts "Found #{items.length} item(s) for #{TABLE}"

# --- Persist --------------------------------------------------------------------
items.each do |r|
  # download_pdf returns nil on failure, so local_document_url is nil then too.
  local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil

  # upsert_and_enrich! is not defined in this file — presumably from
  # lib/scraper_helpers. The listing page carries no owner, hence the blank.
  upsert_and_enrich!(
    table: TABLE,
    row: r.slice(:council_reference, :address, :description, :applicant,
                 :on_notice_to, :on_notice_to_raw).merge(owner: ""),
    extras: {
      document_url: r[:document_url],
      local_document_url: local_url
    }
  )
end

puts "Done #{TABLE}. Saved #{items.length} item(s)."
|