backfill_dorset_docs.rb 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. # tools/backfill_dorset_docs.rb
  2. # Fill missing Dorset document links for existing rows.
  3. require "nokogiri"
  4. require "uri"
  5. require "fileutils"
  6. require_relative "../lib/http"
  7. require_relative "../lib/db"
  8. require_relative "../lib/util"
  9. TABLE = ENV.fetch("TABLE_NAME", "da_dorset")
  10. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  11. BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
  12. BASE_HTTP = "http://eservices.dorset.tas.gov.au"
  13. LISTS = [
  14. "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534",
  15. "#{BASE_HTTPS}/eservice/daEnquiry/recentlyLodged.do?num_days=4000&nodeNum=19533",
  16. "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=4000&nodeNum=19535",
  17. ]
  18. def abs_url(href)
  19. return "" if href.to_s.strip.empty?
  20. URI.join(BASE_HTTPS, href).to_s
  21. rescue
  22. href.to_s
  23. end
  24. # same warm session fetch as in dorset.rb
  25. def dorset_get(jar, url)
  26. [BASE_HTTPS, BASE_HTTP].each do |base|
  27. begin
  28. Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
  29. Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
  30. tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
  31. res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
  32. if res.is_a?(Net::HTTPRedirection) && res["location"]
  33. res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
  34. end
  35. return res if res.is_a?(Net::HTTPSuccess)
  36. rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
  37. next
  38. end
  39. end
  40. raise "Dorset fetch failed for #{url}"
  41. end
  42. def parse_list(html)
  43. doc = Nokogiri::HTML(html)
  44. out = []
  45. doc.css("h4.non_table_headers a").each do |a|
  46. address = a.text.to_s.strip
  47. href = a["href"].to_s
  48. entry = a.ancestors("h4").first&.next_element
  49. description = ""
  50. date_received_raw = ""
  51. council_reference = ""
  52. if entry
  53. entry.css(".rowDataOnly").each do |p|
  54. spans = p.css("span")
  55. next unless spans.length == 2
  56. key = spans[0].text.to_s.strip
  57. val = spans[1].text.to_s.strip
  58. case key
  59. when "Type of Work" then description = val
  60. when "Date Lodged" then date_received_raw = val
  61. when "Application No." then council_reference = val
  62. end
  63. end
  64. end
  65. next if council_reference.to_s.strip.empty?
  66. out << {
  67. council_reference: council_reference,
  68. address: address,
  69. detail_href: href,
  70. description: description,
  71. date_received_raw: date_received_raw
  72. }
  73. end
  74. out
  75. end
  76. def extract_doc_links(detail_html)
  77. doc = Nokogiri::HTML(detail_html)
  78. links = []
  79. t = doc.css('table[summary]').find { |tbl|
  80. tbl["summary"].to_s.downcase.include?("electronic document")
  81. }
  82. links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if t
  83. links += doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
  84. links.map { |h| abs_url(h) }.uniq
  85. end
  86. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  87. def id_from_url(u)
  88. uri = URI.parse(u)
  89. q = uri.query.to_s
  90. q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
  91. rescue
  92. nil
  93. end
  94. def filename_from_response(res, fallback_id)
  95. cd = res["content-disposition"].to_s
  96. if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
  97. return safe_name($1)
  98. end
  99. base = safe_name(fallback_id || "document")
  100. ct = res["content-type"].to_s
  101. ext = ct.include?("pdf") ? ".pdf" : ".bin"
  102. "#{base}#{ext}"
  103. end
  104. def download_all(urls, jar, council_reference)
  105. return [], [] if urls.empty?
  106. dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
  107. FileUtils.mkdir_p(dir)
  108. saved_abs = []
  109. saved_web = []
  110. urls.each_with_index do |u, i|
  111. begin
  112. res = dorset_get(jar, u)
  113. body = res.body.to_s
  114. fid = id_from_url(u) || "file#{i+1}"
  115. name = filename_from_response(res, fid)
  116. path = File.join(dir, name)
  117. File.binwrite(path, body)
  118. saved_abs << path
  119. # map to web path. Your web exposes downloads at /files
  120. saved_web << path.sub(DOWNLOAD_DIR, "/files")
  121. puts " saved #{File.basename(path)} (#{body.bytesize} bytes)"
  122. rescue => e
  123. warn "Download failed for #{u}: #{e.class} #{e.message}"
  124. end
  125. end
  126. [saved_abs, saved_web]
  127. end
  128. def column_exists?(table, col)
  129. te = DB.client.escape(table)
  130. ce = DB.client.escape(col)
  131. DB.client.query("SHOW COLUMNS FROM `#{te}` LIKE '#{ce}'").any?
  132. end
  133. # Build a big index of current and historical list items
  134. puts "Building Dorset index from list pages…"
  135. jar = {}
  136. index_by_ref = {}
  137. LISTS.each do |url|
  138. begin
  139. res = dorset_get(jar, url)
  140. items = parse_list(res.body)
  141. items.each do |it|
  142. index_by_ref[it[:council_reference]] ||= []
  143. index_by_ref[it[:council_reference]] << abs_url(it[:detail_href])
  144. end
  145. puts " #{url} -> #{items.length} items"
  146. rescue => e
  147. warn "List fetch failed #{url}: #{e.class} #{e.message}"
  148. end
  149. end
  150. puts "Indexed refs: #{index_by_ref.keys.length}"
  151. # Find rows needing backfill
  152. needs_local_col = !column_exists?(TABLE, "local_document_url")
  153. begin
  154. # add local_document_url if you want it
  155. unless needs_local_col
  156. # already exists
  157. else
  158. te = DB.client.escape(TABLE)
  159. DB.client.query("ALTER TABLE `#{te}` ADD COLUMN local_document_url TEXT NULL")
  160. needs_local_col = false
  161. end
  162. rescue => e
  163. warn "Could not add local_document_url column: #{e.class} #{e.message}"
  164. end
  165. te = DB.client.escape(TABLE)
  166. rows = DB.client.query(<<~SQL).to_a
  167. SELECT council_reference, address, COALESCE(document_url,'') AS document_url,
  168. COALESCE(local_document_url,'') AS local_document_url
  169. FROM `#{te}`
  170. SQL
  171. todo = rows.select { |r|
  172. r["local_document_url"].to_s.strip.empty? &&
  173. (r["document_url"].to_s.strip.empty? || r["document_url"].to_s.start_with?("http"))
  174. }
  175. puts "Rows in table: #{rows.length}, to backfill: #{todo.length}"
  176. todo.each_with_index do |r, i|
  177. ref = r["council_reference"].to_s
  178. add = r["address"].to_s
  179. detail_candidates = index_by_ref[ref] || []
  180. if detail_candidates.empty?
  181. puts "[#{i+1}/#{todo.length}] #{ref} -> no detail page found on current lists, skip"
  182. next
  183. end
  184. # Try each candidate until we manage to extract docs
  185. doc_urls = []
  186. detail_candidates.uniq.each do |detail_url|
  187. begin
  188. res = dorset_get(jar, detail_url)
  189. doc_urls = extract_doc_links(res.body)
  190. break unless doc_urls.empty?
  191. rescue => e
  192. warn "Detail fetch failed #{detail_url}: #{e.class} #{e.message}"
  193. next
  194. end
  195. end
  196. if doc_urls.empty?
  197. puts "[#{i+1}/#{todo.length}] #{ref} -> no documents"
  198. next
  199. end
  200. saved_abs, saved_web = download_all(doc_urls, jar, ref)
  201. rep_web = saved_web.first.to_s
  202. rep_remote = doc_urls.first.to_s
  203. begin
  204. sql =
  205. if column_exists?(TABLE, "local_document_url")
  206. "UPDATE `#{te}` SET local_document_url = ?, document_url = IF(document_url='', ?, document_url) WHERE council_reference = ? AND address = ?"
  207. else
  208. # fallback: write local into document_url
  209. "UPDATE `#{te}` SET document_url = ? WHERE council_reference = ? AND address = ?"
  210. end
  211. stmt = DB.client.prepare(sql)
  212. if sql.include?("local_document_url")
  213. stmt.execute(rep_web, rep_remote, ref, add)
  214. else
  215. stmt.execute(rep_web, ref, add)
  216. end
  217. puts "[#{i+1}/#{todo.length}] #{ref} -> saved #{saved_abs.length} file(s)"
  218. rescue => e
  219. warn "Update failed for #{ref}: #{e.class} #{e.message}"
  220. end
  221. end
  222. puts "Backfill done."