dorset.rb 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. # scrapers/dorset.rb
  2. require "date"
  3. require "nokogiri"
  4. require "uri"
  5. require "fileutils"
  6. require_relative "../lib/http"
  7. require_relative "../lib/db"
  8. require_relative "../lib/util"
  9. require_relative "../lib/geocode"
  10. require_relative "../lib/enrich"
  11. TABLE = ENV.fetch("TABLE_NAME")
  12. BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
  13. BASE_HTTP = "http://eservices.dorset.tas.gov.au"
  14. # Pick one
  15. LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
  16. #LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"
  17. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  18. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  19. DB.ensure_table!(TABLE)
  20. def abs_url(href)
  21. return "" if href.to_s.strip.empty?
  22. URI.join(BASE_HTTPS, href).to_s
  23. rescue URI::InvalidURIError
  24. href.to_s
  25. end
  26. def dorset_get(jar, url)
  27. [BASE_HTTPS, BASE_HTTP].each do |base|
  28. begin
  29. Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
  30. Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
  31. tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
  32. res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
  33. if res.is_a?(Net::HTTPRedirection) && res["location"]
  34. res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
  35. end
  36. return res if res.is_a?(Net::HTTPSuccess)
  37. rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
  38. next
  39. end
  40. end
  41. raise "Dorset fetch failed for #{url}"
  42. end
  43. def parse_list(html)
  44. doc = Nokogiri::HTML(html)
  45. out = []
  46. doc.css("h4.non_table_headers a").each do |a|
  47. address = a.text.to_s.strip
  48. href = a["href"].to_s
  49. entry = a.ancestors("h4").first&.next_element
  50. description = ""
  51. date_received_raw = ""
  52. council_reference = ""
  53. applicant = ""
  54. owner = ""
  55. if entry
  56. entry.css(".rowDataOnly").each do |p|
  57. spans = p.css("span")
  58. next unless spans.length == 2
  59. key = spans[0].text.to_s.strip
  60. val = spans[1].text.to_s.strip
  61. case key
  62. when "Type of Work" then description = val
  63. when "Date Lodged" then date_received_raw = val
  64. when "Application No." then council_reference = val
  65. when "Applicant" then applicant = val
  66. when "Owner" then owner = val
  67. end
  68. end
  69. end
  70. lodged_dt = Util.parse_aus_date(date_received_raw)
  71. on_to_dt = lodged_dt ? (lodged_dt + 14) : nil
  72. out << {
  73. address: address,
  74. detail_href: href,
  75. description: description.empty? ? "Development Application" : description,
  76. date_received_raw: date_received_raw,
  77. date_received: lodged_dt,
  78. on_notice_to: on_to_dt,
  79. on_notice_to_raw: on_to_dt ? on_to_dt.strftime("%Y-%m-%d") : "",
  80. council_reference: council_reference,
  81. applicant: applicant,
  82. owner: owner
  83. }
  84. end
  85. out
  86. end
  87. def extract_doc_links(detail_html)
  88. doc = Nokogiri::HTML(detail_html)
  89. links = []
  90. t = doc.css('table[summary]').find { |tbl|
  91. tbl["summary"].to_s.downcase.include?("electronic document")
  92. }
  93. if t
  94. links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s }
  95. end
  96. links = doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
  97. links.map { |h| abs_url(h) }.uniq
  98. end
  99. # --- new: parse the tasks/milestones table ---
  100. def parse_tasks(detail_html)
  101. doc = Nokogiri::HTML(detail_html)
  102. t = doc.css('table[summary]').find { |tbl|
  103. tbl["summary"].to_s.downcase.include?("tasks associated")
  104. }
  105. return [] unless t
  106. out = []
  107. t.css("tr")[1..]&.each do |tr|
  108. tds = tr.css("td")
  109. next if tds.empty?
  110. stage_desc = tds[1]&.text.to_s.strip
  111. opened_raw = tds[2]&.text.to_s.strip
  112. target_raw = tds[3]&.text.to_s.strip
  113. completed_raw = tds[4]&.text.to_s.strip
  114. status = tds[5]&.text.to_s.strip
  115. out << {
  116. stage_description: stage_desc,
  117. opened_raw: opened_raw,
  118. opened_date: Util.parse_aus_date(opened_raw),
  119. target_raw: target_raw,
  120. target_date: Util.parse_aus_date(target_raw),
  121. completed_raw: completed_raw,
  122. completed_date: Util.parse_aus_date(completed_raw),
  123. status: status
  124. }
  125. end
  126. out
  127. end
  128. def ensure_stages_table!(table)
  129. tn = "#{table}_stages"
  130. DB.client.query(<<~SQL)
  131. CREATE TABLE IF NOT EXISTS `#{DB.client.escape(tn)}` (
  132. id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
  133. council_reference VARCHAR(100) NOT NULL,
  134. address VARCHAR(255) NOT NULL,
  135. stage_description VARCHAR(255) NOT NULL,
  136. opened_date DATE NULL,
  137. opened_raw VARCHAR(50) NULL,
  138. target_date DATE NULL,
  139. target_raw VARCHAR(50) NULL,
  140. completed_date DATE NULL,
  141. completed_raw VARCHAR(50) NULL,
  142. status VARCHAR(100) NULL,
  143. created_at DATETIME NOT NULL,
  144. updated_at DATETIME NOT NULL,
  145. PRIMARY KEY (id),
  146. UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
  147. ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  148. SQL
  149. end
  150. def save_stages(table, ref, addr, stages)
  151. return if stages.empty?
  152. tn = "#{table}_stages"
  153. ensure_stages_table!(table)
  154. DB.client.prepare("DELETE FROM `#{DB.client.escape(tn)}` WHERE council_reference = ? AND address = ?")
  155. .execute(ref, addr)
  156. ins = DB.client.prepare(<<~SQL)
  157. INSERT INTO `#{DB.client.escape(tn)}`
  158. (council_reference, address, stage_description,
  159. opened_date, opened_raw, target_date, target_raw,
  160. completed_date, completed_raw, status, created_at, updated_at)
  161. VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
  162. SQL
  163. stages.each do |s|
  164. ins.execute(
  165. ref, addr, s[:stage_description][0,255],
  166. s[:opened_date], s[:opened_raw][0,50],
  167. s[:target_date], s[:target_raw][0,50],
  168. s[:completed_date], s[:completed_raw][0,50],
  169. s[:status][0,100]
  170. )
  171. end
  172. end
  173. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  174. def id_from_url(u)
  175. uri = URI.parse(u)
  176. q = uri.query.to_s
  177. q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
  178. rescue URI::InvalidURIError
  179. nil
  180. end
  181. def filename_from_response(res, fallback_id)
  182. cd = res["content-disposition"].to_s
  183. if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
  184. return safe_name($1)
  185. end
  186. base = safe_name(fallback_id || "document")
  187. ct = res["content-type"].to_s
  188. ext = ct.include?("pdf") ? ".pdf" : ".bin"
  189. "#{base}#{ext}"
  190. end
  191. def download_all(urls, jar, council_reference)
  192. return [] if urls.empty?
  193. dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
  194. FileUtils.mkdir_p(dir)
  195. saved = []
  196. first_web_rel = nil
  197. urls.each_with_index do |u, i|
  198. begin
  199. res = dorset_get(jar, u)
  200. body = res.body.to_s
  201. fid = id_from_url(u) || "file#{i+1}"
  202. name = filename_from_response(res, fid)
  203. path = File.join(dir, name)
  204. bytes = File.binwrite(path, body)
  205. puts " saved #{path} (#{bytes} bytes)"
  206. saved << path
  207. first_web_rel ||= "/files/dorset/#{safe_name(council_reference)}/#{File.basename(path)}"
  208. rescue StandardError => e
  209. warn "Download failed for #{u}: #{e.class} #{e.message}"
  210. end
  211. end
  212. if first_web_rel
  213. begin
  214. DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET local_document_url = ? WHERE council_reference = ?")
  215. .execute(first_web_rel, council_reference)
  216. rescue StandardError => e
  217. warn "Failed to set local_document_url for #{council_reference}: #{e.class} #{e.message}"
  218. end
  219. end
  220. saved
  221. end
  222. puts "Fetching Dorset list…"
  223. jar = {}
  224. list_res = dorset_get(jar, LIST_URL)
  225. list_html = list_res.body
  226. list_items = parse_list(list_html)
  227. puts "Found #{list_items.length} items for #{TABLE}"
  228. list_items.each do |r|
  229. next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?
  230. detail_url = abs_url(r[:detail_href])
  231. doc_urls = []
  232. stages = []
  233. saved_paths = []
  234. if !detail_url.empty?
  235. begin
  236. detail_res = dorset_get(jar, detail_url)
  237. detail_html = detail_res.body
  238. # documents
  239. doc_urls = extract_doc_links(detail_html)
  240. saved_paths = DOWNLOAD_ATTACHMENTS ? download_all(doc_urls, jar, r[:council_reference]) : []
  241. # stages
  242. stages = parse_tasks(detail_html)
  243. # prefer Advertising/Public Notif dates if they exist
  244. if r[:on_notice_to].nil?
  245. adv = stages.find { |s| s[:stage_description].downcase.include?("advertising") || s[:stage_description].downcase.include?("public notif") }
  246. if adv
  247. r[:on_notice_to] = adv[:completed_date] || adv[:target_date]
  248. r[:on_notice_to_raw] = adv[:completed_raw] || adv[:target_raw]
  249. end
  250. end
  251. rescue StandardError => e
  252. warn "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
  253. end
  254. end
  255. representative = DOWNLOAD_ATTACHMENTS ? saved_paths.first.to_s : doc_urls.first.to_s
  256. # geocode
  257. geo = nil
  258. begin
  259. geo = Geocode.format_au(r[:address])
  260. rescue StandardError => e
  261. warn "Geocode error for #{r[:council_reference]}: #{e.class} #{e.message}"
  262. end
  263. council_reference = r[:council_reference][0,100]
  264. address = r[:address][0,255]
  265. # upsert main row
  266. DB.upsert(TABLE, {
  267. description: r[:description],
  268. date_received: r[:date_received],
  269. date_received_raw: r[:date_received_raw],
  270. on_notice_to: r[:on_notice_to],
  271. on_notice_to_raw: r[:on_notice_to_raw],
  272. address: address,
  273. council_reference: council_reference,
  274. applicant: r[:applicant],
  275. owner: r[:owner]
  276. })
  277. enrich_after_upsert!(
  278. table: TABLE,
  279. council_reference: council_reference,
  280. address: address
  281. )
  282. tn = DB.client.escape(TABLE)
  283. sql = %Q{
  284. SELECT address_std, lat, lng
  285. FROM `#{tn}`
  286. WHERE council_reference = ? AND address = ?
  287. LIMIT 1
  288. }
  289. begin
  290. row = DB.client.prepare(sql).execute(council_reference, address).first
  291. puts " enriched -> #{row ? row.inspect : 'nil'}"
  292. rescue StandardError => e
  293. warn " enriched probe failed: #{e.class} #{e.message}"
  294. end
  295. puts "Upserted #{r[:council_reference]} -> #{r[:address]} docs: #{doc_urls.length} saved: #{saved_paths.length} stages: #{stages.length}"
  296. end
  297. puts "Done #{TABLE}."