# tools/backfill_dorset_docs.rb # Fill missing Dorset document links for existing rows. require "nokogiri" require "uri" require "fileutils" require_relative "../lib/http" require_relative "../lib/db" require_relative "../lib/util" TABLE = ENV.fetch("TABLE_NAME", "da_dorset") DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads" BASE_HTTPS = "https://eservices.dorset.tas.gov.au" BASE_HTTP = "http://eservices.dorset.tas.gov.au" LISTS = [ "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534", "#{BASE_HTTPS}/eservice/daEnquiry/recentlyLodged.do?num_days=4000&nodeNum=19533", "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=4000&nodeNum=19535", ] def abs_url(href) return "" if href.to_s.strip.empty? URI.join(BASE_HTTPS, href).to_s rescue href.to_s end # same warm session fetch as in dorset.rb def dorset_get(jar, url) [BASE_HTTPS, BASE_HTTP].each do |base| begin Http.request(URI.parse("#{base}/"), headers: {}, jar: jar) Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/") tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base)) res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/") if res.is_a?(Net::HTTPRedirection) && res["location"] res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/") end return res if res.is_a?(Net::HTTPSuccess) rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout next end end raise "Dorset fetch failed for #{url}" end def parse_list(html) doc = Nokogiri::HTML(html) out = [] doc.css("h4.non_table_headers a").each do |a| address = a.text.to_s.strip href = a["href"].to_s entry = a.ancestors("h4").first&.next_element description = "" date_received_raw = "" council_reference = "" if entry entry.css(".rowDataOnly").each do |p| spans = p.css("span") next unless spans.length == 2 key = spans[0].text.to_s.strip val = spans[1].text.to_s.strip case key when "Type of Work" then description = val when "Date Lodged" then date_received_raw = val when "Application No." then council_reference = val end end end next if council_reference.to_s.strip.empty? out << { council_reference: council_reference, address: address, detail_href: href, description: description, date_received_raw: date_received_raw } end out end def extract_doc_links(detail_html) doc = Nokogiri::HTML(detail_html) links = [] t = doc.css('table[summary]').find { |tbl| tbl["summary"].to_s.downcase.include?("electronic document") } links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if t links += doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty? links.map { |h| abs_url(h) }.uniq end def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_") def id_from_url(u) uri = URI.parse(u) q = uri.query.to_s q[/\bid=([^&]+)/, 1] || File.basename(uri.path) rescue nil end def filename_from_response(res, fallback_id) cd = res["content-disposition"].to_s if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/ return safe_name($1) end base = safe_name(fallback_id || "document") ct = res["content-type"].to_s ext = ct.include?("pdf") ? ".pdf" : ".bin" "#{base}#{ext}" end def download_all(urls, jar, council_reference) return [], [] if urls.empty? dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference)) FileUtils.mkdir_p(dir) saved_abs = [] saved_web = [] urls.each_with_index do |u, i| begin res = dorset_get(jar, u) body = res.body.to_s fid = id_from_url(u) || "file#{i+1}" name = filename_from_response(res, fid) path = File.join(dir, name) File.binwrite(path, body) saved_abs << path # map to web path. Your web exposes downloads at /files saved_web << path.sub(DOWNLOAD_DIR, "/files") puts " saved #{File.basename(path)} (#{body.bytesize} bytes)" rescue StandardError => e warn "Download failed for #{u}: #{e.class} #{e.message}" end end [saved_abs, saved_web] end def column_exists?(table, col) te = DB.client.escape(table) ce = DB.client.escape(col) DB.client.query("SHOW COLUMNS FROM `#{te}` LIKE '#{ce}'").any? end # Build a big index of current and historical list items puts "Building Dorset index from list pages…" jar = {} index_by_ref = {} LISTS.each do |url| begin res = dorset_get(jar, url) items = parse_list(res.body) items.each do |it| index_by_ref[it[:council_reference]] ||= [] index_by_ref[it[:council_reference]] << abs_url(it[:detail_href]) end puts " #{url} -> #{items.length} items" rescue StandardError => e warn "List fetch failed #{url}: #{e.class} #{e.message}" end end puts "Indexed refs: #{index_by_ref.keys.length}" # Find rows needing backfill needs_local_col = !column_exists?(TABLE, "local_document_url") begin # add local_document_url if you want it unless needs_local_col # already exists else te = DB.client.escape(TABLE) DB.client.query("ALTER TABLE `#{te}` ADD COLUMN local_document_url TEXT NULL") needs_local_col = false end rescue StandardError => e warn "Could not add local_document_url column: #{e.class} #{e.message}" end te = DB.client.escape(TABLE) rows = DB.client.query(<<~SQL).to_a SELECT council_reference, address, COALESCE(document_url,'') AS document_url, COALESCE(local_document_url,'') AS local_document_url FROM `#{te}` SQL todo = rows.select { |r| r["local_document_url"].to_s.strip.empty? && (r["document_url"].to_s.strip.empty? || r["document_url"].to_s.start_with?("http")) } puts "Rows in table: #{rows.length}, to backfill: #{todo.length}" todo.each_with_index do |r, i| ref = r["council_reference"].to_s add = r["address"].to_s detail_candidates = index_by_ref[ref] || [] if detail_candidates.empty? puts "[#{i+1}/#{todo.length}] #{ref} -> no detail page found on current lists, skip" next end # Try each candidate until we manage to extract docs doc_urls = [] detail_candidates.uniq.each do |detail_url| begin res = dorset_get(jar, detail_url) doc_urls = extract_doc_links(res.body) break unless doc_urls.empty? rescue StandardError => e warn "Detail fetch failed #{detail_url}: #{e.class} #{e.message}" next end end if doc_urls.empty? puts "[#{i+1}/#{todo.length}] #{ref} -> no documents" next end saved_abs, saved_web = download_all(doc_urls, jar, ref) rep_web = saved_web.first.to_s rep_remote = doc_urls.first.to_s begin sql = if column_exists?(TABLE, "local_document_url") "UPDATE `#{te}` SET local_document_url = ?, document_url = IF(document_url='', ?, document_url) WHERE council_reference = ? AND address = ?" else # fallback: write local into document_url "UPDATE `#{te}` SET document_url = ? WHERE council_reference = ? AND address = ?" end stmt = DB.client.prepare(sql) if sql.include?("local_document_url") stmt.execute(rep_web, rep_remote, ref, add) else stmt.execute(rep_web, ref, add) end puts "[#{i+1}/#{todo.length}] #{ref} -> saved #{saved_abs.length} file(s)" rescue StandardError => e warn "Update failed for #{ref}: #{e.class} #{e.message}" end end puts "Backfill done."