dorset.rb 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. # scrapers/dorset.rb
  2. require "date"
  3. require "nokogiri"
  4. require "uri"
  5. require "fileutils"
  6. require_relative "../lib/enrich"
  7. require_relative "../lib/log"
  8. require_relative "../lib/util"
  9. TABLE = ENV.fetch("TABLE_NAME")
  10. BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
  11. BASE_HTTP = "http://eservices.dorset.tas.gov.au"
  12. # Pick one
  13. LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
  14. #LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"
  15. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  16. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  17. DB.ensure_table!(TABLE)
  18. def abs_url(href)
  19. return "" if href.to_s.strip.empty?
  20. URI.join(BASE_HTTPS, href).to_s
  21. rescue URI::InvalidURIError
  22. href.to_s
  23. end
  24. def dorset_get(jar, url)
  25. [BASE_HTTPS, BASE_HTTP].each do |base|
  26. begin
  27. Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
  28. Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
  29. tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
  30. res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
  31. if res.is_a?(Net::HTTPRedirection) && res["location"]
  32. res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
  33. end
  34. return res if res.is_a?(Net::HTTPSuccess)
  35. rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
  36. next
  37. end
  38. end
  39. raise "Dorset fetch failed for #{url}"
  40. end
  41. def parse_list(html)
  42. doc = Nokogiri::HTML(html)
  43. out = []
  44. doc.css("h4.non_table_headers a").each do |a|
  45. address = a.text.to_s.strip
  46. href = a["href"].to_s
  47. entry = a.ancestors("h4").first&.next_element
  48. description = ""
  49. date_received_raw = ""
  50. council_reference = ""
  51. applicant = ""
  52. owner = ""
  53. if entry
  54. entry.css(".rowDataOnly").each do |p|
  55. spans = p.css("span")
  56. next unless spans.length == 2
  57. key = spans[0].text.to_s.strip
  58. val = spans[1].text.to_s.strip
  59. case key
  60. when "Type of Work" then description = val
  61. when "Date Lodged" then date_received_raw = val
  62. when "Application No." then council_reference = val
  63. when "Applicant" then applicant = val
  64. when "Owner" then owner = val
  65. end
  66. end
  67. end
  68. lodged_dt = Util.parse_aus_date(date_received_raw)
  69. on_to_dt = lodged_dt ? (lodged_dt + 14) : nil
  70. out << {
  71. address: address,
  72. detail_href: href,
  73. description: description.empty? ? "Development Application" : description,
  74. date_received_raw: date_received_raw,
  75. date_received: lodged_dt,
  76. on_notice_to: on_to_dt,
  77. on_notice_to_raw: on_to_dt ? on_to_dt.strftime("%Y-%m-%d") : "",
  78. council_reference: council_reference,
  79. applicant: applicant,
  80. owner: owner
  81. }
  82. end
  83. out
  84. end
  85. def extract_doc_links(detail_html)
  86. doc = Nokogiri::HTML(detail_html)
  87. links = []
  88. t = doc.css('table[summary]').find { |tbl|
  89. tbl["summary"].to_s.downcase.include?("electronic document")
  90. }
  91. if t
  92. links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s }
  93. end
  94. links = doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
  95. links.map { |h| abs_url(h) }.uniq
  96. end
  97. # --- new: parse the tasks/milestones table ---
  98. def parse_tasks(detail_html)
  99. doc = Nokogiri::HTML(detail_html)
  100. t = doc.css('table[summary]').find { |tbl|
  101. tbl["summary"].to_s.downcase.include?("tasks associated")
  102. }
  103. return [] unless t
  104. out = []
  105. t.css("tr")[1..]&.each do |tr|
  106. tds = tr.css("td")
  107. next if tds.empty?
  108. stage_desc = tds[1]&.text.to_s.strip
  109. opened_raw = tds[2]&.text.to_s.strip
  110. target_raw = tds[3]&.text.to_s.strip
  111. completed_raw = tds[4]&.text.to_s.strip
  112. status = tds[5]&.text.to_s.strip
  113. out << {
  114. stage_description: stage_desc,
  115. opened_raw: opened_raw,
  116. opened_date: Util.parse_aus_date(opened_raw),
  117. target_raw: target_raw,
  118. target_date: Util.parse_aus_date(target_raw),
  119. completed_raw: completed_raw,
  120. completed_date: Util.parse_aus_date(completed_raw),
  121. status: status
  122. }
  123. end
  124. out
  125. end
  126. def ensure_stages_table!(table)
  127. tn = "#{table}_stages"
  128. DB.client.query(<<~SQL)
  129. CREATE TABLE IF NOT EXISTS `#{DB.client.escape(tn)}` (
  130. id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
  131. council_reference VARCHAR(100) NOT NULL,
  132. address VARCHAR(255) NOT NULL,
  133. stage_description VARCHAR(255) NOT NULL,
  134. opened_date DATE NULL,
  135. opened_raw VARCHAR(50) NULL,
  136. target_date DATE NULL,
  137. target_raw VARCHAR(50) NULL,
  138. completed_date DATE NULL,
  139. completed_raw VARCHAR(50) NULL,
  140. status VARCHAR(100) NULL,
  141. created_at DATETIME NOT NULL,
  142. updated_at DATETIME NOT NULL,
  143. PRIMARY KEY (id),
  144. UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
  145. ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  146. SQL
  147. end
  148. def save_stages(table, ref, addr, stages)
  149. return if stages.empty?
  150. tn = "#{table}_stages"
  151. ensure_stages_table!(table)
  152. DB.client.prepare("DELETE FROM `#{DB.client.escape(tn)}` WHERE council_reference = ? AND address = ?")
  153. .execute(ref, addr)
  154. ins = DB.client.prepare(<<~SQL)
  155. INSERT INTO `#{DB.client.escape(tn)}`
  156. (council_reference, address, stage_description,
  157. opened_date, opened_raw, target_date, target_raw,
  158. completed_date, completed_raw, status, created_at, updated_at)
  159. VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
  160. SQL
  161. stages.each do |s|
  162. ins.execute(
  163. ref, addr, s[:stage_description][0,255],
  164. s[:opened_date], s[:opened_raw][0,50],
  165. s[:target_date], s[:target_raw][0,50],
  166. s[:completed_date], s[:completed_raw][0,50],
  167. s[:status][0,100]
  168. )
  169. end
  170. end
  171. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  172. def id_from_url(u)
  173. uri = URI.parse(u)
  174. q = uri.query.to_s
  175. q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
  176. rescue URI::InvalidURIError
  177. nil
  178. end
  179. def filename_from_response(res, fallback_id)
  180. cd = res["content-disposition"].to_s
  181. if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
  182. return safe_name($1)
  183. end
  184. base = safe_name(fallback_id || "document")
  185. ct = res["content-type"].to_s
  186. ext = ct.include?("pdf") ? ".pdf" : ".bin"
  187. "#{base}#{ext}"
  188. end
  189. def download_all(urls, jar, council_reference)
  190. return [] if urls.empty?
  191. dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
  192. FileUtils.mkdir_p(dir)
  193. saved = []
  194. first_web_rel = nil
  195. urls.each_with_index do |u, i|
  196. begin
  197. res = dorset_get(jar, u)
  198. body = res.body.to_s
  199. fid = id_from_url(u) || "file#{i+1}"
  200. name = filename_from_response(res, fid)
  201. path = File.join(dir, name)
  202. bytes = File.binwrite(path, body)
  203. puts " saved #{path} (#{bytes} bytes)"
  204. saved << path
  205. first_web_rel ||= "/files/dorset/#{safe_name(council_reference)}/#{File.basename(path)}"
  206. rescue StandardError => e
  207. Log.warn "scraper", "Download failed for #{u}: #{e.class} #{e.message}"
  208. end
  209. end
  210. if first_web_rel
  211. begin
  212. DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET local_document_url = ? WHERE council_reference = ?")
  213. .execute(first_web_rel, council_reference)
  214. rescue StandardError => e
  215. Log.warn "scraper", "Failed to set local_document_url for #{council_reference}: #{e.class} #{e.message}"
  216. end
  217. end
  218. saved
  219. end
  220. puts "Fetching Dorset list…"
  221. jar = {}
  222. list_res = dorset_get(jar, LIST_URL)
  223. list_html = list_res.body
  224. list_items = parse_list(list_html)
  225. puts "Found #{list_items.length} items for #{TABLE}"
  226. list_items.each do |r|
  227. next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?
  228. detail_url = abs_url(r[:detail_href])
  229. doc_urls = []
  230. stages = []
  231. saved_paths = []
  232. if !detail_url.empty?
  233. begin
  234. detail_res = dorset_get(jar, detail_url)
  235. detail_html = detail_res.body
  236. # documents
  237. doc_urls = extract_doc_links(detail_html)
  238. saved_paths = DOWNLOAD_ATTACHMENTS ? download_all(doc_urls, jar, r[:council_reference]) : []
  239. # stages
  240. stages = parse_tasks(detail_html)
  241. # prefer Advertising/Public Notif dates if they exist
  242. if r[:on_notice_to].nil?
  243. adv = stages.find { |s| s[:stage_description].downcase.include?("advertising") || s[:stage_description].downcase.include?("public notif") }
  244. if adv
  245. r[:on_notice_to] = adv[:completed_date] || adv[:target_date]
  246. r[:on_notice_to_raw] = adv[:completed_raw] || adv[:target_raw]
  247. end
  248. end
  249. rescue StandardError => e
  250. Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
  251. end
  252. end
  253. representative = DOWNLOAD_ATTACHMENTS ? saved_paths.first.to_s : doc_urls.first.to_s
  254. # geocode
  255. geo = nil
  256. begin
  257. geo = Geocode.format_au(r[:address])
  258. rescue StandardError => e
  259. Log.warn "scraper", "Geocode error for #{r[:council_reference]}: #{e.class} #{e.message}"
  260. end
  261. council_reference = r[:council_reference][0,100]
  262. address = r[:address][0,255]
  263. # upsert main row
  264. DB.upsert(TABLE, {
  265. description: r[:description],
  266. date_received: r[:date_received],
  267. date_received_raw: r[:date_received_raw],
  268. on_notice_to: r[:on_notice_to],
  269. on_notice_to_raw: r[:on_notice_to_raw],
  270. address: address,
  271. council_reference: council_reference,
  272. applicant: r[:applicant],
  273. owner: r[:owner]
  274. })
  275. enrich_after_upsert!(
  276. table: TABLE,
  277. council_reference: council_reference,
  278. address: address
  279. )
  280. tn = DB.client.escape(TABLE)
  281. sql = %Q{
  282. SELECT address_std, lat, lng
  283. FROM `#{tn}`
  284. WHERE council_reference = ? AND address = ?
  285. LIMIT 1
  286. }
  287. begin
  288. row = DB.client.prepare(sql).execute(council_reference, address).first
  289. puts " enriched -> #{row ? row.inspect : 'nil'}"
  290. rescue StandardError => e
  291. Log.warn "scraper", " enriched probe failed: #{e.class} #{e.message}"
  292. end
  293. puts "Upserted #{r[:council_reference]} -> #{r[:address]} docs: #{doc_urls.length} saved: #{saved_paths.length} stages: #{stages.length}"
  294. end
  295. puts "Done #{TABLE}."