dorset.rb 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. # scrapers/dorset.rb
  2. require "date"
  3. require "nokogiri"
  4. require "uri"
  5. require "fileutils"
  6. require_relative "../lib/http"
  7. require_relative "../lib/db"
  8. require_relative "../lib/util"
  9. require_relative "../lib/geocode"
  10. require_relative "../lib/enrich"
  11. TABLE = ENV.fetch("TABLE_NAME")
  12. BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
  13. BASE_HTTP = "http://eservices.dorset.tas.gov.au"
  14. # Pick one
  15. LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
  16. #LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"
  17. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  18. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  19. DB.ensure_table!(TABLE)
  20. ensure_extra_columns!(TABLE)
  21. def abs_url(href)
  22. return "" if href.to_s.strip.empty?
  23. URI.join(BASE_HTTPS, href).to_s
  24. rescue URI::InvalidURIError
  25. href.to_s
  26. end
  27. def dorset_get(jar, url)
  28. [BASE_HTTPS, BASE_HTTP].each do |base|
  29. begin
  30. Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
  31. Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
  32. tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
  33. res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
  34. if res.is_a?(Net::HTTPRedirection) && res["location"]
  35. res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
  36. end
  37. return res if res.is_a?(Net::HTTPSuccess)
  38. rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
  39. next
  40. end
  41. end
  42. raise "Dorset fetch failed for #{url}"
  43. end
  44. def parse_list(html)
  45. doc = Nokogiri::HTML(html)
  46. out = []
  47. doc.css("h4.non_table_headers a").each do |a|
  48. address = a.text.to_s.strip
  49. href = a["href"].to_s
  50. entry = a.ancestors("h4").first&.next_element
  51. description = ""
  52. date_received_raw = ""
  53. council_reference = ""
  54. applicant = ""
  55. owner = ""
  56. if entry
  57. entry.css(".rowDataOnly").each do |p|
  58. spans = p.css("span")
  59. next unless spans.length == 2
  60. key = spans[0].text.to_s.strip
  61. val = spans[1].text.to_s.strip
  62. case key
  63. when "Type of Work" then description = val
  64. when "Date Lodged" then date_received_raw = val
  65. when "Application No." then council_reference = val
  66. when "Applicant" then applicant = val
  67. when "Owner" then owner = val
  68. end
  69. end
  70. end
  71. lodged_dt = Util.parse_aus_date(date_received_raw)
  72. on_to_dt = lodged_dt ? (lodged_dt + 14) : nil
  73. out << {
  74. address: address,
  75. detail_href: href,
  76. description: description.empty? ? "Development Application" : description,
  77. date_received_raw: date_received_raw,
  78. date_received: lodged_dt,
  79. on_notice_to: on_to_dt,
  80. on_notice_to_raw: on_to_dt ? on_to_dt.strftime("%Y-%m-%d") : "",
  81. council_reference: council_reference,
  82. applicant: applicant,
  83. owner: owner
  84. }
  85. end
  86. out
  87. end
  88. def extract_doc_links(detail_html)
  89. doc = Nokogiri::HTML(detail_html)
  90. links = []
  91. t = doc.css('table[summary]').find { |tbl|
  92. tbl["summary"].to_s.downcase.include?("electronic document")
  93. }
  94. if t
  95. links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s }
  96. end
  97. links = doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
  98. links.map { |h| abs_url(h) }.uniq
  99. end
  100. # --- new: parse the tasks/milestones table ---
  101. def parse_tasks(detail_html)
  102. doc = Nokogiri::HTML(detail_html)
  103. t = doc.css('table[summary]').find { |tbl|
  104. tbl["summary"].to_s.downcase.include?("tasks associated")
  105. }
  106. return [] unless t
  107. out = []
  108. t.css("tr")[1..]&.each do |tr|
  109. tds = tr.css("td")
  110. next if tds.empty?
  111. stage_desc = tds[1]&.text.to_s.strip
  112. opened_raw = tds[2]&.text.to_s.strip
  113. target_raw = tds[3]&.text.to_s.strip
  114. completed_raw = tds[4]&.text.to_s.strip
  115. status = tds[5]&.text.to_s.strip
  116. out << {
  117. stage_description: stage_desc,
  118. opened_raw: opened_raw,
  119. opened_date: Util.parse_aus_date(opened_raw),
  120. target_raw: target_raw,
  121. target_date: Util.parse_aus_date(target_raw),
  122. completed_raw: completed_raw,
  123. completed_date: Util.parse_aus_date(completed_raw),
  124. status: status
  125. }
  126. end
  127. out
  128. end
  129. def ensure_stages_table!(table)
  130. tn = "#{table}_stages"
  131. DB.client.query(<<~SQL)
  132. CREATE TABLE IF NOT EXISTS `#{DB.client.escape(tn)}` (
  133. id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
  134. council_reference VARCHAR(100) NOT NULL,
  135. address VARCHAR(255) NOT NULL,
  136. stage_description VARCHAR(255) NOT NULL,
  137. opened_date DATE NULL,
  138. opened_raw VARCHAR(50) NULL,
  139. target_date DATE NULL,
  140. target_raw VARCHAR(50) NULL,
  141. completed_date DATE NULL,
  142. completed_raw VARCHAR(50) NULL,
  143. status VARCHAR(100) NULL,
  144. created_at DATETIME NOT NULL,
  145. updated_at DATETIME NOT NULL,
  146. PRIMARY KEY (id),
  147. UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
  148. ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  149. SQL
  150. end
  151. def save_stages(table, ref, addr, stages)
  152. return if stages.empty?
  153. tn = "#{table}_stages"
  154. ensure_stages_table!(table)
  155. DB.client.prepare("DELETE FROM `#{DB.client.escape(tn)}` WHERE council_reference = ? AND address = ?")
  156. .execute(ref, addr)
  157. ins = DB.client.prepare(<<~SQL)
  158. INSERT INTO `#{DB.client.escape(tn)}`
  159. (council_reference, address, stage_description,
  160. opened_date, opened_raw, target_date, target_raw,
  161. completed_date, completed_raw, status, created_at, updated_at)
  162. VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
  163. SQL
  164. stages.each do |s|
  165. ins.execute(
  166. ref, addr, s[:stage_description][0,255],
  167. s[:opened_date], s[:opened_raw][0,50],
  168. s[:target_date], s[:target_raw][0,50],
  169. s[:completed_date], s[:completed_raw][0,50],
  170. s[:status][0,100]
  171. )
  172. end
  173. end
  174. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  175. def id_from_url(u)
  176. uri = URI.parse(u)
  177. q = uri.query.to_s
  178. q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
  179. rescue URI::InvalidURIError
  180. nil
  181. end
  182. def filename_from_response(res, fallback_id)
  183. cd = res["content-disposition"].to_s
  184. if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
  185. return safe_name($1)
  186. end
  187. base = safe_name(fallback_id || "document")
  188. ct = res["content-type"].to_s
  189. ext = ct.include?("pdf") ? ".pdf" : ".bin"
  190. "#{base}#{ext}"
  191. end
  192. def download_all(urls, jar, council_reference)
  193. return [] if urls.empty?
  194. dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
  195. FileUtils.mkdir_p(dir)
  196. saved = []
  197. first_web_rel = nil
  198. urls.each_with_index do |u, i|
  199. begin
  200. res = dorset_get(jar, u)
  201. body = res.body.to_s
  202. fid = id_from_url(u) || "file#{i+1}"
  203. name = filename_from_response(res, fid)
  204. path = File.join(dir, name)
  205. bytes = File.binwrite(path, body)
  206. puts " saved #{path} (#{bytes} bytes)"
  207. saved << path
  208. first_web_rel ||= "/files/dorset/#{safe_name(council_reference)}/#{File.basename(path)}"
  209. rescue => e
  210. warn "Download failed for #{u}: #{e.class} #{e.message}"
  211. end
  212. end
  213. if first_web_rel
  214. begin
  215. DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET local_document_url = ? WHERE council_reference = ?")
  216. .execute(first_web_rel, council_reference)
  217. rescue => e
  218. warn "Failed to set local_document_url for #{council_reference}: #{e.class} #{e.message}"
  219. end
  220. end
  221. saved
  222. end
  223. puts "Fetching Dorset list…"
  224. jar = {}
  225. list_res = dorset_get(jar, LIST_URL)
  226. list_html = list_res.body
  227. list_items = parse_list(list_html)
  228. puts "Found #{list_items.length} items for #{TABLE}"
  229. list_items.each do |r|
  230. next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?
  231. detail_url = abs_url(r[:detail_href])
  232. doc_urls = []
  233. stages = []
  234. saved_paths = []
  235. if !detail_url.empty?
  236. begin
  237. detail_res = dorset_get(jar, detail_url)
  238. detail_html = detail_res.body
  239. # documents
  240. doc_urls = extract_doc_links(detail_html)
  241. saved_paths = DOWNLOAD_ATTACHMENTS ? download_all(doc_urls, jar, r[:council_reference]) : []
  242. # stages
  243. stages = parse_tasks(detail_html)
  244. # prefer Advertising/Public Notif dates if they exist
  245. if r[:on_notice_to].nil?
  246. adv = stages.find { |s| s[:stage_description].downcase.include?("advertising") || s[:stage_description].downcase.include?("public notif") }
  247. if adv
  248. r[:on_notice_to] = adv[:completed_date] || adv[:target_date]
  249. r[:on_notice_to_raw] = adv[:completed_raw] || adv[:target_raw]
  250. end
  251. end
  252. rescue => e
  253. warn "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
  254. end
  255. end
  256. representative = DOWNLOAD_ATTACHMENTS ? saved_paths.first.to_s : doc_urls.first.to_s
  257. # geocode
  258. geo = nil
  259. begin
  260. geo = Geocode.format_au(r[:address])
  261. rescue => e
  262. warn "Geocode error for #{r[:council_reference]}: #{e.class} #{e.message}"
  263. end
  264. council_reference = r[:council_reference][0,100]
  265. address = r[:address][0,255]
  266. # upsert main row
  267. DB.upsert(TABLE, {
  268. description: r[:description],
  269. date_received: r[:date_received],
  270. date_received_raw: r[:date_received_raw],
  271. on_notice_to: r[:on_notice_to],
  272. on_notice_to_raw: r[:on_notice_to_raw],
  273. address: address,
  274. council_reference: council_reference,
  275. applicant: r[:applicant],
  276. owner: r[:owner]
  277. })
  278. enrich_after_upsert!(
  279. table: TABLE,
  280. council_reference: council_reference,
  281. address: address
  282. )
  283. tn = DB.client.escape(TABLE)
  284. sql = %Q{
  285. SELECT address_std, lat, lng
  286. FROM `#{tn}`
  287. WHERE council_reference = ? AND address = ?
  288. LIMIT 1
  289. }
  290. begin
  291. row = DB.client.prepare(sql).execute(council_reference, address).first
  292. puts " enriched -> #{row ? row.inspect : 'nil'}"
  293. rescue => e
  294. warn " enriched probe failed: #{e.class} #{e.message}"
  295. end
  296. puts "Upserted #{r[:council_reference]} -> #{r[:address]} docs: #{doc_urls.length} saved: #{saved_paths.length} stages: #{stages.length}"
  297. end
  298. puts "Done #{TABLE}."