# scrapers/dorset.rb
#
# Scrapes the Dorset Council eServices site: reads the application list page,
# follows each application's detail page to collect attached-document links and
# the task/milestone table, upserts one row per application into TABLE and
# (optionally) downloads the attachments to DOWNLOAD_DIR.

require "date"
require "fileutils"
require "net/http" # Net::HTTPRedirection / Net::HTTPSuccess are referenced below
require "nokogiri"
require "openssl"  # OpenSSL::SSL::SSLError is referenced below
require "uri"

require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/geocode"
require_relative "../lib/enrich"

TABLE = ENV.fetch("TABLE_NAME")

BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
BASE_HTTP  = "http://eservices.dorset.tas.gov.au"

# Exactly one LIST_URL must be active; the commented-out line is the alternative listing.
LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
#LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"

DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"

DB.ensure_table!(TABLE)
ensure_extra_columns!(TABLE)

# Resolves +href+ against BASE_HTTPS.
#
# @param href [String, nil] absolute or relative link taken from the page
# @return [String] absolute URL; "" for a blank href; the href unchanged when
#   it cannot be parsed as a URI
def abs_url(href)
  return "" if href.to_s.strip.empty?

  URI.join(BASE_HTTPS, href).to_s
rescue URI::InvalidURIError
  href.to_s
end

# Fetches +url+, trying the HTTPS host first and the HTTP host second.
#
# For each host the site root and /eservice/ are requested first with the same
# cookie jar, then the target itself (with the host part of +url+ swapped for
# the host being tried). A single redirect is followed.
#
# @param jar [Hash] cookie jar passed through to Http.request
# @param url [String] absolute URL on either Dorset host
# @return [Net::HTTPSuccess] the first successful response
# @raise [RuntimeError] when neither host yields a successful response
def dorset_get(jar, url)
  [BASE_HTTPS, BASE_HTTP].each do |base|
    begin
      Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
      Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")

      tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
      res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")

      if res.is_a?(Net::HTTPRedirection) && res["location"]
        res = Http.request(URI.join(tgt.to_s, res["location"]),
                           headers: {}, jar: jar, referer: "#{base}/eservice/")
      end

      return res if res.is_a?(Net::HTTPSuccess)
    rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
      # Transport-level failure on this host: fall through to the next one.
      next
    end
  end

  raise "Dorset fetch failed for #{url}"
end

# Parses the application list page.
#
# Each application is an <h4 class="non_table_headers"> containing a link (the
# address); the element following the <h4> holds ".rowDataOnly" rows made of a
# label span and a value span.
#
# @param html [String] list page markup
# @return [Array<Hash>] one hash per application; :on_notice_to is the lodged
#   date plus 14 days, or nil when the lodged date could not be parsed
def parse_list(html)
  doc = Nokogiri::HTML(html)
  out = []

  doc.css("h4.non_table_headers a").each do |a|
    address = a.text.to_s.strip
    href = a["href"].to_s
    entry = a.ancestors("h4").first&.next_element

    description = ""
    date_received_raw = ""
    council_reference = ""
    applicant = ""
    owner = ""

    if entry
      entry.css(".rowDataOnly").each do |p|
        spans = p.css("span")
        next unless spans.length == 2

        key = spans[0].text.to_s.strip
        val = spans[1].text.to_s.strip

        case key
        when "Type of Work"    then description = val
        when "Date Lodged"     then date_received_raw = val
        when "Application No." then council_reference = val
        when "Applicant"       then applicant = val
        when "Owner"           then owner = val
        end
      end
    end

    lodged_dt = Util.parse_aus_date(date_received_raw)
    on_to_dt = lodged_dt ? (lodged_dt + 14) : nil

    out << {
      address: address,
      detail_href: href,
      description: description.empty? ? "Development Application" : description,
      date_received_raw: date_received_raw,
      date_received: lodged_dt,
      on_notice_to: on_to_dt,
      on_notice_to_raw: on_to_dt ? on_to_dt.strftime("%Y-%m-%d") : "",
      council_reference: council_reference,
      applicant: applicant,
      owner: owner
    }
  end

  out
end

# Collects attachment links from a detail page.
#
# Links inside the table whose summary mentions "electronic document" are
# used when present; otherwise every matching link on the page is taken.
#
# @param detail_html [String] detail page markup
# @return [Array<String>] unique absolute URLs
def extract_doc_links(detail_html)
  doc = Nokogiri::HTML(detail_html)
  links = []

  t = doc.css('table[summary]').find { |tbl| tbl["summary"].to_s.downcase.include?("electronic document") }
  if t
    links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s }
  end

  links = doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?

  links.map { |h| abs_url(h) }.uniq
end

# Parses the tasks/milestones table of a detail page.
#
# The table is located by a summary containing "tasks associated"; the first
# <tr> is skipped and cells 1..5 are read as description, opened, target,
# completed and status.
#
# @param detail_html [String] detail page markup
# @return [Array<Hash>] one hash per task row; [] when the table is absent.
#   The *_raw values are always Strings (possibly empty); the *_date values are
#   whatever Util.parse_aus_date returns for them.
def parse_tasks(detail_html)
  doc = Nokogiri::HTML(detail_html)
  t = doc.css('table[summary]').find { |tbl| tbl["summary"].to_s.downcase.include?("tasks associated") }
  return [] unless t

  out = []
  t.css("tr")[1..]&.each do |tr|
    tds = tr.css("td")
    next if tds.empty?

    stage_desc    = tds[1]&.text.to_s.strip
    opened_raw    = tds[2]&.text.to_s.strip
    target_raw    = tds[3]&.text.to_s.strip
    completed_raw = tds[4]&.text.to_s.strip
    status        = tds[5]&.text.to_s.strip

    out << {
      stage_description: stage_desc,
      opened_raw: opened_raw,
      opened_date: Util.parse_aus_date(opened_raw),
      target_raw: target_raw,
      target_date: Util.parse_aus_date(target_raw),
      completed_raw: completed_raw,
      completed_date: Util.parse_aus_date(completed_raw),
      status: status
    }
  end

  out
end

# Creates the "<table>_stages" table if it does not exist yet.
#
# @param table [String] name of the main applications table
def ensure_stages_table!(table)
  tn = "#{table}_stages"
  DB.client.query(<<~SQL)
    CREATE TABLE IF NOT EXISTS `#{DB.client.escape(tn)}` (
      id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
      council_reference VARCHAR(100) NOT NULL,
      address VARCHAR(255) NOT NULL,
      stage_description VARCHAR(255) NOT NULL,
      opened_date DATE NULL,
      opened_raw VARCHAR(50) NULL,
      target_date DATE NULL,
      target_raw VARCHAR(50) NULL,
      completed_date DATE NULL,
      completed_raw VARCHAR(50) NULL,
      status VARCHAR(100) NULL,
      created_at DATETIME NOT NULL,
      updated_at DATETIME NOT NULL,
      PRIMARY KEY (id),
      UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  SQL
end

# Replaces the stored stages of one application with +stages+.
#
# Does nothing when +stages+ is empty, so previously stored rows are kept in
# that case. Otherwise existing rows for (ref, addr) are deleted and the new
# ones inserted, with text values cut to the column widths.
#
# NOTE(review): nothing in this file calls save_stages, so the stages parsed
# by the main loop are only counted in the log line — confirm whether they are
# meant to be persisted.
#
# @param table [String] name of the main applications table
# @param ref [String] council reference
# @param addr [String] address
# @param stages [Array<Hash>] hashes as produced by parse_tasks
def save_stages(table, ref, addr, stages)
  return if stages.empty?

  tn = "#{table}_stages"
  ensure_stages_table!(table)

  DB.client.prepare("DELETE FROM `#{DB.client.escape(tn)}` WHERE council_reference = ? AND address = ?")
    .execute(ref, addr)

  ins = DB.client.prepare(<<~SQL)
    INSERT INTO `#{DB.client.escape(tn)}`
      (council_reference, address, stage_description, opened_date, opened_raw,
       target_date, target_raw, completed_date, completed_raw, status, created_at, updated_at)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
  SQL

  stages.each do |s|
    ins.execute(
      ref, addr,
      s[:stage_description][0, 255],
      s[:opened_date], s[:opened_raw][0, 50],
      s[:target_date], s[:target_raw][0, 50],
      s[:completed_date], s[:completed_raw][0, 50],
      s[:status][0, 100]
    )
  end
end

# Replaces every run of characters other than word characters, "-" and "."
# with a single underscore.
def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")

# Extracts an identifier for a document URL: the "id" query parameter when
# present, otherwise the last path segment.
#
# @param u [String] document URL
# @return [String, nil] nil when the URL cannot be parsed
def id_from_url(u)
  uri = URI.parse(u)
  q = uri.query.to_s
  q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
rescue URI::InvalidURIError
  nil
end

# Chooses a file name for a downloaded response.
#
# The Content-Disposition filename is used (sanitised) when present; otherwise
# the name is built from +fallback_id+ (or "document") plus ".pdf" when the
# Content-Type mentions pdf, ".bin" otherwise.
#
# @param res [#[]] response exposing headers via []
# @param fallback_id [String, nil] identifier used when no filename is sent
# @return [String]
def filename_from_response(res, fallback_id)
  cd = res["content-disposition"].to_s
  match = cd.match(/filename\*?=(?:UTF-8''|")?([^\";]+)/)
  return safe_name(match[1]) if match

  base = safe_name(fallback_id || "document")
  ct = res["content-type"].to_s
  ext = ct.include?("pdf") ? ".pdf" : ".bin"
  "#{base}#{ext}"
end

# Downloads every URL into DOWNLOAD_DIR/dorset/<reference>/ and records the
# first saved file as local_document_url on the application's row in TABLE.
#
# Individual download failures and a failing UPDATE are logged and skipped.
# The UPDATE matches on council_reference, so the row must already exist for
# it to have any effect.
#
# @param urls [Array<String>] absolute document URLs
# @param jar [Hash] cookie jar passed to dorset_get
# @param council_reference [String] reference as stored in TABLE
# @return [Array<String>] paths of the files written
def download_all(urls, jar, council_reference)
  return [] if urls.empty?

  dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
  FileUtils.mkdir_p(dir)

  saved = []
  first_web_rel = nil

  urls.each_with_index do |u, i|
    begin
      res = dorset_get(jar, u)
      body = res.body.to_s
      fid = id_from_url(u) || "file#{i + 1}"
      name = filename_from_response(res, fid)
      path = File.join(dir, name)
      bytes = File.binwrite(path, body)
      puts " saved #{path} (#{bytes} bytes)"
      saved << path
      first_web_rel ||= "/files/dorset/#{safe_name(council_reference)}/#{File.basename(path)}"
    rescue => e
      warn "Download failed for #{u}: #{e.class} #{e.message}"
    end
  end

  if first_web_rel
    begin
      DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET local_document_url = ? WHERE council_reference = ?")
        .execute(first_web_rel, council_reference)
    rescue => e
      warn "Failed to set local_document_url for #{council_reference}: #{e.class} #{e.message}"
    end
  end

  saved
end

# ---------------------------------------------------------------------------
# Main script
# ---------------------------------------------------------------------------

puts "Fetching Dorset list…"
jar = {}
list_res = dorset_get(jar, LIST_URL)
list_html = list_res.body.to_s
list_items = parse_list(list_html)
puts "Found #{list_items.length} items for #{TABLE}"

list_items.each do |r|
  next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?

  detail_url = abs_url(r[:detail_href])
  doc_urls = []
  stages = []
  saved_paths = []

  unless detail_url.empty?
    begin
      detail_res = dorset_get(jar, detail_url)
      detail_html = detail_res.body.to_s

      # documents (downloaded further down, once the row exists)
      doc_urls = extract_doc_links(detail_html)

      # stages
      stages = parse_tasks(detail_html)

      # Fallback only: when the lodged date could not be parsed, take the
      # notice end date from the Advertising / Public Notification stage.
      if r[:on_notice_to].nil?
        adv = stages.find do |s|
          desc = s[:stage_description].downcase
          desc.include?("advertising") || desc.include?("public notif")
        end

        if adv
          # Pick the date and its raw text as a pair. The raw values are always
          # Strings, so `completed_raw || target_raw` would never fall back and
          # could store an empty raw value next to a target date.
          if adv[:completed_date]
            r[:on_notice_to] = adv[:completed_date]
            r[:on_notice_to_raw] = adv[:completed_raw]
          else
            r[:on_notice_to] = adv[:target_date]
            r[:on_notice_to_raw] = adv[:target_raw]
          end
        end
      end
    rescue => e
      warn "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
    end
  end

  # geocode
  # NOTE(review): the result is not used anywhere below; the call is kept in
  # case Geocode.format_au has side effects — confirm and drop it if not.
  begin
    Geocode.format_au(r[:address])
  rescue => e
    warn "Geocode error for #{r[:council_reference]}: #{e.class} #{e.message}"
  end

  council_reference = r[:council_reference][0, 100]
  address = r[:address][0, 255]

  # upsert main row
  DB.upsert(TABLE, {
    description: r[:description],
    date_received: r[:date_received],
    date_received_raw: r[:date_received_raw],
    on_notice_to: r[:on_notice_to],
    on_notice_to_raw: r[:on_notice_to_raw],
    address: address,
    council_reference: council_reference,
    applicant: r[:applicant],
    owner: r[:owner]
  })

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )

  # Attachments are fetched only now: download_all UPDATEs the row by
  # council_reference, which matches nothing until the upsert above has run.
  # The truncated reference is passed so it equals the stored key.
  if DOWNLOAD_ATTACHMENTS
    begin
      saved_paths = download_all(doc_urls, jar, council_reference)
    rescue => e
      warn "Download step failed for #{council_reference}: #{e.class} #{e.message}"
    end
  end

  # Read back the enriched columns for the log.
  tn = DB.client.escape(TABLE)
  sql = %Q{
    SELECT address_std, lat, lng
    FROM `#{tn}`
    WHERE council_reference = ? AND address = ?
    LIMIT 1
  }
  begin
    row = DB.client.prepare(sql).execute(council_reference, address).first
    puts " enriched -> #{row ? row.inspect : 'nil'}"
  rescue => e
    warn " enriched probe failed: #{e.class} #{e.message}"
  end

  puts "Upserted #{r[:council_reference]} -> #{r[:address]} docs: #{doc_urls.length} saved: #{saved_paths.length} stages: #{stages.length}"
end

puts "Done #{TABLE}."