# scrapers/dorset.rb
- require "date"
- require "nokogiri"
- require "uri"
- require "fileutils"
- require_relative "../lib/http"
- require_relative "../lib/db"
- require_relative "../lib/util"
- require_relative "../lib/geocode"
- require_relative "../lib/enrich"
# Destination table name; ENV.fetch raises KeyError when TABLE_NAME is unset.
TABLE = ENV.fetch("TABLE_NAME")

# The same host addressed over both schemes (dorset_get tries HTTPS, then HTTP).
BASE_HTTPS, BASE_HTTP = %w[https http].map { |scheme| "#{scheme}://eservices.dorset.tas.gov.au" }

# Listing page to scrape. Keep exactly one LIST_URL active; swap the comment marker to switch.
LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
#LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"

# Attachments are fetched only when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
# Root directory for downloaded attachments.
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# Runs at load time, before any scraping (presumably creates the table when missing — confirm in lib/db).
DB.ensure_table!(TABLE)
# Resolves an href scraped from the site against BASE_HTTPS.
#
# @param href [String, nil] raw href attribute value (may be relative or padded with whitespace)
# @return [String] absolute URL; "" for nil/blank input; the trimmed href itself when it
#   cannot be parsed as a URI
def abs_url(href)
  link = href.to_s.strip
  return "" if link.empty?

  # Join the trimmed value: surrounding whitespace makes URI.join raise, which used to
  # send otherwise valid relative links down the unparseable fallback path.
  URI.join(BASE_HTTPS, link).to_s
rescue URI::InvalidURIError
  link
end
# Fetches +url+ from the Dorset eServices host, trying HTTPS first and then plain HTTP.
#
# For each scheme the site root and /eservice/ are requested before the target, sharing
# +jar+ (presumably so the server can set session cookies — confirm against lib/http).
# The scheme and host of +url+ are rewritten to the base being tried, and a single
# redirect hop is followed.
#
# @param jar [Hash] cookie jar passed through to Http.request
# @param url [String] absolute URL on the Dorset host
# @return [Net::HTTPSuccess] the first successful response
# @raise [RuntimeError] when neither scheme yields a 2xx response; the message now carries
#   the last failure (status code or transport error) instead of discarding it
def dorset_get(jar, url)
  last_failure = nil

  [BASE_HTTPS, BASE_HTTP].each do |base|
    eservice_root = "#{base}/eservice/"
    Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
    Http.request(URI.parse(eservice_root), headers: {}, jar: jar, referer: "#{base}/")

    target = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
    res = Http.request(target, headers: {}, jar: jar, referer: eservice_root)
    if res.is_a?(Net::HTTPRedirection) && res["location"]
      res = Http.request(URI.join(target.to_s, res["location"]), headers: {}, jar: jar, referer: eservice_root)
    end
    return res if res.is_a?(Net::HTTPSuccess)

    status = res.respond_to?(:code) ? "HTTP #{res.code}" : res.class.to_s
    last_failure = "#{target} -> #{status}"
  rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout => e
    # Transport-level failure: remember it and fall through to the next scheme.
    last_failure = "#{base} -> #{e.class}: #{e.message}"
  end

  detail = last_failure ? " (last failure: #{last_failure})" : ""
  raise "Dorset fetch failed for #{url}#{detail}"
end
# Parses the application listing page into one hash per advertised application.
#
# Each application is an <a> inside h4.non_table_headers; the element following that h4
# holds ".rowDataOnly" rows made of a label span and a value span.
#
# @param html [String] listing page markup
# @return [Array<Hash>] hashes with :address, :detail_href, :description,
#   :date_received_raw, :date_received, :on_notice_to, :on_notice_to_raw,
#   :council_reference, :applicant and :owner
def parse_list(html)
  labels = {
    "Type of Work" => :description,
    "Date Lodged" => :date_received_raw,
    "Application No." => :council_reference,
    "Applicant" => :applicant,
    "Owner" => :owner
  }

  Nokogiri::HTML(html).css("h4.non_table_headers a").map do |link|
    fields = labels.values.to_h { |name| [name, ""] }

    details = link.ancestors("h4").first&.next_element
    details&.css(".rowDataOnly")&.each do |row|
      cells = row.css("span")
      # Only label/value pairs are meaningful; anything else is skipped.
      next if cells.length != 2

      field = labels[cells[0].text.to_s.strip]
      fields[field] = cells[1].text.to_s.strip if field
    end

    lodged = Util.parse_aus_date(fields[:date_received_raw])
    # The notice end is taken as 14 units after lodgement (days, assuming
    # Util.parse_aus_date returns a Date — confirm in lib/util).
    notice_end = lodged ? lodged + 14 : nil

    {
      address: link.text.to_s.strip,
      detail_href: link["href"].to_s,
      description: fields[:description].empty? ? "Development Application" : fields[:description],
      date_received_raw: fields[:date_received_raw],
      date_received: lodged,
      on_notice_to: notice_end,
      on_notice_to_raw: notice_end ? notice_end.strftime("%Y-%m-%d") : "",
      council_reference: fields[:council_reference],
      applicant: fields[:applicant],
      owner: fields[:owner]
    }
  end
end
# Collects absolute URLs of the electronic documents linked from a detail page.
#
# Links are taken from the table whose summary mentions "electronic document"; when that
# yields nothing, every matching link on the page is used instead.
#
# @param detail_html [String] detail page markup
# @return [Array<String>] de-duplicated absolute document URLs
def extract_doc_links(detail_html)
  selector = 'a[href*="getElectronicDocumentContents.do"]'
  page = Nokogiri::HTML(detail_html)

  docs_table = page.css('table[summary]').find do |table|
    table["summary"].to_s.downcase.include?("electronic document")
  end

  hrefs = docs_table ? docs_table.css(selector).map { |a| a["href"].to_s } : []
  hrefs = page.css(selector).map { |a| a["href"].to_s } if hrefs.empty?

  hrefs.map { |href| abs_url(href) }.uniq
end
# --- Tasks / milestones table ---
# Parses the table whose summary mentions "tasks associated" into one hash per data row.
#
# The first <tr> is skipped as the header, as is any row without <td> cells. Columns 1-5
# are read as description, opened, target, completed and status; missing cells become "".
#
# @param detail_html [String] detail page markup
# @return [Array<Hash>] hashes with :stage_description, :opened_raw, :opened_date,
#   :target_raw, :target_date, :completed_raw, :completed_date and :status
def parse_tasks(detail_html)
  tasks_table = Nokogiri::HTML(detail_html).css('table[summary]').find do |table|
    table["summary"].to_s.downcase.include?("tasks associated")
  end
  return [] unless tasks_table

  data_rows = tasks_table.css("tr")[1..] || []
  data_rows.filter_map do |row|
    cells = row.css("td")
    next if cells.empty?

    cell_text = ->(index) { cells[index]&.text.to_s.strip }
    opened = cell_text.call(2)
    target = cell_text.call(3)
    completed = cell_text.call(4)

    {
      stage_description: cell_text.call(1),
      opened_raw: opened,
      opened_date: Util.parse_aus_date(opened),
      target_raw: target,
      target_date: Util.parse_aus_date(target),
      completed_raw: completed,
      completed_date: Util.parse_aus_date(completed),
      status: cell_text.call(5)
    }
  end
end
# Creates the "<table>_stages" companion table when it does not exist yet.
#
# @param table [String] name of the main applications table
# @return the result of DB.client.query
def ensure_stages_table!(table)
  stages_table = DB.client.escape("#{table}_stages")

  # uniq_stage allows one row per (reference, address, stage, opened date) combination.
  ddl = <<~SQL
    CREATE TABLE IF NOT EXISTS `#{stages_table}` (
      id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
      council_reference VARCHAR(100) NOT NULL,
      address VARCHAR(255) NOT NULL,
      stage_description VARCHAR(255) NOT NULL,
      opened_date DATE NULL,
      opened_raw VARCHAR(50) NULL,
      target_date DATE NULL,
      target_raw VARCHAR(50) NULL,
      completed_date DATE NULL,
      completed_raw VARCHAR(50) NULL,
      status VARCHAR(100) NULL,
      created_at DATETIME NOT NULL,
      updated_at DATETIME NOT NULL,
      PRIMARY KEY (id),
      UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  SQL

  DB.client.query(ddl)
end
# Replaces the stored stages of one application with +stages+.
#
# Existing rows for (+ref+, +addr+) are deleted and the new ones inserted inside a single
# transaction, so a failed insert no longer leaves the application without stages.
# Stages that would collide on the table's uniq_stage key (same description and opened
# text after truncation) are collapsed to their first occurrence so one duplicate row
# on the page cannot abort the batch.
#
# @param table [String] name of the main applications table
# @param ref [String] council reference of the application
# @param addr [String] address of the application
# @param stages [Array<Hash>] hashes shaped like the output of parse_tasks
# @return [nil] when +stages+ is empty; otherwise the result of the COMMIT query
# @raise [StandardError] re-raises any database error after rolling back
def save_stages(table, ref, addr, stages)
  return if stages.empty?

  ensure_stages_table!(table)

  # Use one client throughout so every statement runs on the transaction's connection.
  client = DB.client
  tn = client.escape("#{table}_stages")

  # Truncate to the column widths first: the unique key compares the stored values.
  rows = stages.map do |s|
    [
      s[:stage_description].to_s[0, 255],
      s[:opened_date], s[:opened_raw].to_s[0, 50],
      s[:target_date], s[:target_raw].to_s[0, 50],
      s[:completed_date], s[:completed_raw].to_s[0, 50],
      s[:status].to_s[0, 100]
    ]
  end
  # Case-folded because MySQL's default utf8mb4 collations compare case-insensitively.
  rows = rows.uniq { |row| [row[0].downcase, row[2].downcase] }

  # CREATE TABLE above implies a commit in MySQL, so the transaction starts after it.
  client.query("START TRANSACTION")
  begin
    client.prepare("DELETE FROM `#{tn}` WHERE council_reference = ? AND address = ?")
          .execute(ref, addr)

    ins = client.prepare(<<~SQL)
      INSERT INTO `#{tn}`
        (council_reference, address, stage_description,
         opened_date, opened_raw, target_date, target_raw,
         completed_date, completed_raw, status, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
    SQL
    rows.each { |row| ins.execute(ref, addr, *row) }

    client.query("COMMIT")
  rescue StandardError => e
    begin
      client.query("ROLLBACK")
    rescue StandardError => rollback_error
      warn "Rollback failed for #{tn}: #{rollback_error.class} #{rollback_error.message}"
    end
    raise e
  end
end
# Turns arbitrary text into a single safe path segment.
#
# Every run of characters other than word characters, "-" and "." becomes one "_".
# A result of "." or ".." is additionally rewritten with underscores: both survive the
# substitution unchanged and would otherwise address the current/parent directory when
# the result is joined onto a download path.
#
# @param s [#to_s] text such as a council reference or a server-supplied file name
# @return [String] sanitised segment ("" for nil or empty input)
def safe_name(s)
  cleaned = s.to_s.gsub(/[^\w\-.]+/, "_")
  cleaned.match?(/\A\.{1,2}\z/) ? cleaned.tr(".", "_") : cleaned
end
# Derives a document identifier from a download URL.
#
# Prefers the value of an "id" query parameter and otherwise the last path segment.
# Returns nil (not "" or "/") when the URL has neither, so that callers' `||` fallbacks
# actually trigger, and tolerates URIs without a path component.
#
# @param u [String] document URL
# @return [String, nil] identifier, or nil when none can be derived or +u+ is not a valid URI
def id_from_url(u)
  uri = URI.parse(u)
  id = uri.query.to_s[/\bid=([^&]+)/, 1]
  return id if id

  # uri.path is "" for a bare host and may be nil for opaque URIs.
  segment = File.basename(uri.path.to_s)
  segment.empty? || segment == "/" ? nil : segment
rescue URI::InvalidURIError
  nil
end
# Chooses a local file name for a downloaded document.
#
# Uses the filename from the Content-Disposition header when one is present. The
# parameter name and the UTF-8'' prefix are matched case-insensitively, and whitespace
# around the value is dropped so it does not turn into stray underscores. Otherwise the
# name is built from +fallback_id+ plus ".pdf" or ".bin" depending on the content type.
#
# @param res [#[]] response-like object exposing headers via res["header-name"]
# @param fallback_id [String, nil] identifier used when no filename is declared
#   ("document" when nil)
# @return [String] sanitised file name
def filename_from_response(res, fallback_id)
  disposition = res["content-disposition"].to_s
  declared = disposition[/filename\*?=(?:UTF-8''|")?([^\";]+)/i, 1].to_s.strip
  return safe_name(declared) unless declared.empty?

  base = safe_name(fallback_id || "document")
  ext = res["content-type"].to_s.downcase.include?("pdf") ? ".pdf" : ".bin"
  "#{base}#{ext}"
end
# Downloads every document URL into DOWNLOAD_DIR/dorset/<reference>/ and records the
# web-relative path of the first saved file on the application's row.
#
# A failure on one URL is reported with warn and does not stop the remaining downloads;
# a failure of the database update is likewise only reported.
#
# @param urls [Array<String>] absolute document URLs
# @param jar [Hash] cookie jar passed to dorset_get
# @param council_reference [String] reference used for the folder name and the row lookup
# @return [Array<String>] local paths of the files that were written
def download_all(urls, jar, council_reference)
  return [] if urls.empty?

  folder = safe_name(council_reference)
  dir = File.join(DOWNLOAD_DIR, "dorset", folder)
  FileUtils.mkdir_p(dir)

  saved = []
  urls.each.with_index(1) do |url, position|
    res = dorset_get(jar, url)
    payload = res.body.to_s
    doc_id = id_from_url(url) || "file#{position}"
    path = File.join(dir, filename_from_response(res, doc_id))
    written = File.binwrite(path, payload)
    puts " saved #{path} (#{written} bytes)"
    saved << path
  rescue StandardError => e
    warn "Download failed for #{url}: #{e.class} #{e.message}"
  end

  # Only the first successfully saved file is exposed through local_document_url.
  first_web_rel = saved.empty? ? nil : "/files/dorset/#{folder}/#{File.basename(saved.first)}"
  return saved unless first_web_rel

  begin
    DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET local_document_url = ? WHERE council_reference = ?")
      .execute(first_web_rel, council_reference)
  rescue StandardError => e
    warn "Failed to set local_document_url for #{council_reference}: #{e.class} #{e.message}"
  end

  saved
end
# --- Script body ---
# Fetches the listing, then for every application with a reference and an address:
# reads the detail page (documents + stages), upserts the main row, optionally downloads
# the attachments, runs enrichment and prints a short probe of the enriched columns.
puts "Fetching Dorset list…"
jar = {}
list_res = dorset_get(jar, LIST_URL)
list_items = parse_list(list_res.body)
puts "Found #{list_items.length} items for #{TABLE}"

list_items.each do |r|
  next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?

  detail_url = abs_url(r[:detail_href])
  doc_urls = []
  stages = []
  saved_paths = []

  unless detail_url.empty?
    begin
      detail_html = dorset_get(jar, detail_url).body
      doc_urls = extract_doc_links(detail_html)
      stages = parse_tasks(detail_html)
      # NOTE(review): the parsed stages are only counted in the log line below; nothing in
      # this script calls save_stages, so they are never persisted — confirm whether that
      # is intentional.

      # Only when the listing gave no notice end date, fall back to the
      # Advertising / Public Notification stage.
      if r[:on_notice_to].nil?
        adv = stages.find do |s|
          label = s[:stage_description].downcase
          label.include?("advertising") || label.include?("public notif")
        end
        if adv
          # Take the date and its raw text from the same column. The raw values are
          # strings (possibly ""), so `completed_raw || target_raw` could never fall back
          # and left the raw text out of step with a date taken from the target column.
          date_key, raw_key = adv[:completed_date] ? %i[completed_date completed_raw] : %i[target_date target_raw]
          r[:on_notice_to] = adv[date_key]
          r[:on_notice_to_raw] = adv[raw_key]
        end
      end
    rescue StandardError => e
      warn "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
    end
  end

  # geocode
  # NOTE(review): the result is not used anywhere in this script; the call is kept in case
  # Geocode.format_au has side effects (e.g. caching) — confirm and remove if it does not.
  begin
    Geocode.format_au(r[:address])
  rescue StandardError => e
    warn "Geocode error for #{r[:council_reference]}: #{e.class} #{e.message}"
  end

  # Truncated to the widths used for these columns in the stages table DDL.
  council_reference = r[:council_reference][0, 100]
  address = r[:address][0, 255]

  # upsert main row
  DB.upsert(TABLE, {
    description: r[:description],
    date_received: r[:date_received],
    date_received_raw: r[:date_received_raw],
    on_notice_to: r[:on_notice_to],
    on_notice_to_raw: r[:on_notice_to_raw],
    address: address,
    council_reference: council_reference,
    applicant: r[:applicant],
    owner: r[:owner]
  })
  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )

  # Download only after the row exists: download_all finishes with
  # `UPDATE ... SET local_document_url ... WHERE council_reference = ?`, which matched
  # nothing on the first scrape of an application when it ran before the upsert.
  # The stored (truncated) reference is passed so that WHERE clause matches the row.
  if DOWNLOAD_ATTACHMENTS
    begin
      saved_paths = download_all(doc_urls, jar, council_reference)
    rescue StandardError => e
      warn "Attachment download failed for #{council_reference}: #{e.class} #{e.message}"
    end
  end

  tn = DB.client.escape(TABLE)
  sql = %Q{
    SELECT address_std, lat, lng
    FROM `#{tn}`
    WHERE council_reference = ? AND address = ?
    LIMIT 1
  }
  begin
    row = DB.client.prepare(sql).execute(council_reference, address).first
    puts " enriched -> #{row ? row.inspect : 'nil'}"
  rescue StandardError => e
    warn " enriched probe failed: #{e.class} #{e.message}"
  end

  puts "Upserted #{r[:council_reference]} -> #{r[:address]} docs: #{doc_urls.length} saved: #{saved_paths.length} stages: #{stages.length}"
end

puts "Done #{TABLE}."
|