| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249 |
- # tools/backfill_dorset_docs.rb
- # Fill missing Dorset document links for existing rows.
- require "nokogiri"
- require "uri"
- require "fileutils"
- require_relative "../lib/http"
- require_relative "../lib/db"
- require_relative "../lib/util"
- TABLE = ENV.fetch("TABLE_NAME", "da_dorset")
- DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
- BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
- BASE_HTTP = "http://eservices.dorset.tas.gov.au"
- LISTS = [
- "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534",
- "#{BASE_HTTPS}/eservice/daEnquiry/recentlyLodged.do?num_days=4000&nodeNum=19533",
- "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=4000&nodeNum=19535",
- ]
- def abs_url(href)
- return "" if href.to_s.strip.empty?
- URI.join(BASE_HTTPS, href).to_s
- rescue
- href.to_s
- end
- # same warm session fetch as in dorset.rb
- def dorset_get(jar, url)
- [BASE_HTTPS, BASE_HTTP].each do |base|
- begin
- Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
- Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
- tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
- res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
- if res.is_a?(Net::HTTPRedirection) && res["location"]
- res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
- end
- return res if res.is_a?(Net::HTTPSuccess)
- rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
- next
- end
- end
- raise "Dorset fetch failed for #{url}"
- end
- def parse_list(html)
- doc = Nokogiri::HTML(html)
- out = []
- doc.css("h4.non_table_headers a").each do |a|
- address = a.text.to_s.strip
- href = a["href"].to_s
- entry = a.ancestors("h4").first&.next_element
- description = ""
- date_received_raw = ""
- council_reference = ""
- if entry
- entry.css(".rowDataOnly").each do |p|
- spans = p.css("span")
- next unless spans.length == 2
- key = spans[0].text.to_s.strip
- val = spans[1].text.to_s.strip
- case key
- when "Type of Work" then description = val
- when "Date Lodged" then date_received_raw = val
- when "Application No." then council_reference = val
- end
- end
- end
- next if council_reference.to_s.strip.empty?
- out << {
- council_reference: council_reference,
- address: address,
- detail_href: href,
- description: description,
- date_received_raw: date_received_raw
- }
- end
- out
- end
- def extract_doc_links(detail_html)
- doc = Nokogiri::HTML(detail_html)
- links = []
- t = doc.css('table[summary]').find { |tbl|
- tbl["summary"].to_s.downcase.include?("electronic document")
- }
- links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if t
- links += doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
- links.map { |h| abs_url(h) }.uniq
- end
- def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
- def id_from_url(u)
- uri = URI.parse(u)
- q = uri.query.to_s
- q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
- rescue
- nil
- end
- def filename_from_response(res, fallback_id)
- cd = res["content-disposition"].to_s
- if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
- return safe_name($1)
- end
- base = safe_name(fallback_id || "document")
- ct = res["content-type"].to_s
- ext = ct.include?("pdf") ? ".pdf" : ".bin"
- "#{base}#{ext}"
- end
- def download_all(urls, jar, council_reference)
- return [], [] if urls.empty?
- dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
- FileUtils.mkdir_p(dir)
- saved_abs = []
- saved_web = []
- urls.each_with_index do |u, i|
- begin
- res = dorset_get(jar, u)
- body = res.body.to_s
- fid = id_from_url(u) || "file#{i+1}"
- name = filename_from_response(res, fid)
- path = File.join(dir, name)
- File.binwrite(path, body)
- saved_abs << path
- # map to web path. Your web exposes downloads at /files
- saved_web << path.sub(DOWNLOAD_DIR, "/files")
- puts " saved #{File.basename(path)} (#{body.bytesize} bytes)"
- rescue => e
- warn "Download failed for #{u}: #{e.class} #{e.message}"
- end
- end
- [saved_abs, saved_web]
- end
- def column_exists?(table, col)
- te = DB.client.escape(table)
- ce = DB.client.escape(col)
- DB.client.query("SHOW COLUMNS FROM `#{te}` LIKE '#{ce}'").any?
- end
- # Build a big index of current and historical list items
- puts "Building Dorset index from list pages…"
- jar = {}
- index_by_ref = {}
- LISTS.each do |url|
- begin
- res = dorset_get(jar, url)
- items = parse_list(res.body)
- items.each do |it|
- index_by_ref[it[:council_reference]] ||= []
- index_by_ref[it[:council_reference]] << abs_url(it[:detail_href])
- end
- puts " #{url} -> #{items.length} items"
- rescue => e
- warn "List fetch failed #{url}: #{e.class} #{e.message}"
- end
- end
- puts "Indexed refs: #{index_by_ref.keys.length}"
- # Find rows needing backfill
- needs_local_col = !column_exists?(TABLE, "local_document_url")
- begin
- # add local_document_url if you want it
- unless needs_local_col
- # already exists
- else
- te = DB.client.escape(TABLE)
- DB.client.query("ALTER TABLE `#{te}` ADD COLUMN local_document_url TEXT NULL")
- needs_local_col = false
- end
- rescue => e
- warn "Could not add local_document_url column: #{e.class} #{e.message}"
- end
- te = DB.client.escape(TABLE)
- rows = DB.client.query(<<~SQL).to_a
- SELECT council_reference, address, COALESCE(document_url,'') AS document_url,
- COALESCE(local_document_url,'') AS local_document_url
- FROM `#{te}`
- SQL
- todo = rows.select { |r|
- r["local_document_url"].to_s.strip.empty? &&
- (r["document_url"].to_s.strip.empty? || r["document_url"].to_s.start_with?("http"))
- }
- puts "Rows in table: #{rows.length}, to backfill: #{todo.length}"
- todo.each_with_index do |r, i|
- ref = r["council_reference"].to_s
- add = r["address"].to_s
- detail_candidates = index_by_ref[ref] || []
- if detail_candidates.empty?
- puts "[#{i+1}/#{todo.length}] #{ref} -> no detail page found on current lists, skip"
- next
- end
- # Try each candidate until we manage to extract docs
- doc_urls = []
- detail_candidates.uniq.each do |detail_url|
- begin
- res = dorset_get(jar, detail_url)
- doc_urls = extract_doc_links(res.body)
- break unless doc_urls.empty?
- rescue => e
- warn "Detail fetch failed #{detail_url}: #{e.class} #{e.message}"
- next
- end
- end
- if doc_urls.empty?
- puts "[#{i+1}/#{todo.length}] #{ref} -> no documents"
- next
- end
- saved_abs, saved_web = download_all(doc_urls, jar, ref)
- rep_web = saved_web.first.to_s
- rep_remote = doc_urls.first.to_s
- begin
- sql =
- if column_exists?(TABLE, "local_document_url")
- "UPDATE `#{te}` SET local_document_url = ?, document_url = IF(document_url='', ?, document_url) WHERE council_reference = ? AND address = ?"
- else
- # fallback: write local into document_url
- "UPDATE `#{te}` SET document_url = ? WHERE council_reference = ? AND address = ?"
- end
- stmt = DB.client.prepare(sql)
- if sql.include?("local_document_url")
- stmt.execute(rep_web, rep_remote, ref, add)
- else
- stmt.execute(rep_web, ref, add)
- end
- puts "[#{i+1}/#{todo.length}] #{ref} -> saved #{saved_abs.length} file(s)"
- rescue => e
- warn "Update failed for #{ref}: #{e.class} #{e.message}"
- end
- end
- puts "Backfill done."
|