dorset.rb 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. # scrapers/dorset.rb
  2. require "date"
  3. require "nokogiri"
  4. require "uri"
  5. require "fileutils"
  6. require_relative "../lib/enrich"
  7. require_relative "../lib/log"
  8. TABLE = ENV.fetch("TABLE_NAME")
  9. BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
  10. BASE_HTTP = "http://eservices.dorset.tas.gov.au"
  11. # Pick one
  12. LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
  13. #LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"
  14. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  15. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  16. DB.ensure_table!(TABLE)
  17. def abs_url(href)
  18. return "" if href.to_s.strip.empty?
  19. URI.join(BASE_HTTPS, href).to_s
  20. rescue URI::InvalidURIError
  21. href.to_s
  22. end
  23. def dorset_get(jar, url)
  24. [BASE_HTTPS, BASE_HTTP].each do |base|
  25. begin
  26. Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
  27. Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
  28. tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
  29. res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
  30. if res.is_a?(Net::HTTPRedirection) && res["location"]
  31. res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
  32. end
  33. return res if res.is_a?(Net::HTTPSuccess)
  34. rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
  35. next
  36. end
  37. end
  38. raise "Dorset fetch failed for #{url}"
  39. end
  40. def parse_list(html)
  41. doc = Nokogiri::HTML(html)
  42. out = []
  43. doc.css("h4.non_table_headers a").each do |a|
  44. address = a.text.to_s.strip
  45. href = a["href"].to_s
  46. entry = a.ancestors("h4").first&.next_element
  47. description = ""
  48. date_received_raw = ""
  49. council_reference = ""
  50. applicant = ""
  51. owner = ""
  52. if entry
  53. entry.css(".rowDataOnly").each do |p|
  54. spans = p.css("span")
  55. next unless spans.length == 2
  56. key = spans[0].text.to_s.strip
  57. val = spans[1].text.to_s.strip
  58. case key
  59. when "Type of Work" then description = val
  60. when "Date Lodged" then date_received_raw = val
  61. when "Application No." then council_reference = val
  62. when "Applicant" then applicant = val
  63. when "Owner" then owner = val
  64. end
  65. end
  66. end
  67. lodged_dt = Util.parse_aus_date(date_received_raw)
  68. on_to_dt = lodged_dt ? (lodged_dt + 14) : nil
  69. out << {
  70. address: address,
  71. detail_href: href,
  72. description: description.empty? ? "Development Application" : description,
  73. date_received_raw: date_received_raw,
  74. date_received: lodged_dt,
  75. on_notice_to: on_to_dt,
  76. on_notice_to_raw: on_to_dt ? on_to_dt.strftime("%Y-%m-%d") : "",
  77. council_reference: council_reference,
  78. applicant: applicant,
  79. owner: owner
  80. }
  81. end
  82. out
  83. end
  84. def extract_doc_links(detail_html)
  85. doc = Nokogiri::HTML(detail_html)
  86. links = []
  87. t = doc.css('table[summary]').find { |tbl|
  88. tbl["summary"].to_s.downcase.include?("electronic document")
  89. }
  90. if t
  91. links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s }
  92. end
  93. links = doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
  94. links.map { |h| abs_url(h) }.uniq
  95. end
  96. # --- new: parse the tasks/milestones table ---
  97. def parse_tasks(detail_html)
  98. doc = Nokogiri::HTML(detail_html)
  99. t = doc.css('table[summary]').find { |tbl|
  100. tbl["summary"].to_s.downcase.include?("tasks associated")
  101. }
  102. return [] unless t
  103. out = []
  104. t.css("tr")[1..]&.each do |tr|
  105. tds = tr.css("td")
  106. next if tds.empty?
  107. stage_desc = tds[1]&.text.to_s.strip
  108. opened_raw = tds[2]&.text.to_s.strip
  109. target_raw = tds[3]&.text.to_s.strip
  110. completed_raw = tds[4]&.text.to_s.strip
  111. status = tds[5]&.text.to_s.strip
  112. out << {
  113. stage_description: stage_desc,
  114. opened_raw: opened_raw,
  115. opened_date: Util.parse_aus_date(opened_raw),
  116. target_raw: target_raw,
  117. target_date: Util.parse_aus_date(target_raw),
  118. completed_raw: completed_raw,
  119. completed_date: Util.parse_aus_date(completed_raw),
  120. status: status
  121. }
  122. end
  123. out
  124. end
  125. def ensure_stages_table!(table)
  126. tn = "#{table}_stages"
  127. DB.client.query(<<~SQL)
  128. CREATE TABLE IF NOT EXISTS `#{DB.client.escape(tn)}` (
  129. id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
  130. council_reference VARCHAR(100) NOT NULL,
  131. address VARCHAR(255) NOT NULL,
  132. stage_description VARCHAR(255) NOT NULL,
  133. opened_date DATE NULL,
  134. opened_raw VARCHAR(50) NULL,
  135. target_date DATE NULL,
  136. target_raw VARCHAR(50) NULL,
  137. completed_date DATE NULL,
  138. completed_raw VARCHAR(50) NULL,
  139. status VARCHAR(100) NULL,
  140. created_at DATETIME NOT NULL,
  141. updated_at DATETIME NOT NULL,
  142. PRIMARY KEY (id),
  143. UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
  144. ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  145. SQL
  146. end
  147. def save_stages(table, ref, addr, stages)
  148. return if stages.empty?
  149. tn = "#{table}_stages"
  150. ensure_stages_table!(table)
  151. DB.client.prepare("DELETE FROM `#{DB.client.escape(tn)}` WHERE council_reference = ? AND address = ?")
  152. .execute(ref, addr)
  153. ins = DB.client.prepare(<<~SQL)
  154. INSERT INTO `#{DB.client.escape(tn)}`
  155. (council_reference, address, stage_description,
  156. opened_date, opened_raw, target_date, target_raw,
  157. completed_date, completed_raw, status, created_at, updated_at)
  158. VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
  159. SQL
  160. stages.each do |s|
  161. ins.execute(
  162. ref, addr, s[:stage_description][0,255],
  163. s[:opened_date], s[:opened_raw][0,50],
  164. s[:target_date], s[:target_raw][0,50],
  165. s[:completed_date], s[:completed_raw][0,50],
  166. s[:status][0,100]
  167. )
  168. end
  169. end
  170. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  171. def id_from_url(u)
  172. uri = URI.parse(u)
  173. q = uri.query.to_s
  174. q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
  175. rescue URI::InvalidURIError
  176. nil
  177. end
  178. def filename_from_response(res, fallback_id)
  179. cd = res["content-disposition"].to_s
  180. if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
  181. return safe_name($1)
  182. end
  183. base = safe_name(fallback_id || "document")
  184. ct = res["content-type"].to_s
  185. ext = ct.include?("pdf") ? ".pdf" : ".bin"
  186. "#{base}#{ext}"
  187. end
  188. def download_all(urls, jar, council_reference)
  189. return [] if urls.empty?
  190. dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
  191. FileUtils.mkdir_p(dir)
  192. saved = []
  193. first_web_rel = nil
  194. urls.each_with_index do |u, i|
  195. begin
  196. res = dorset_get(jar, u)
  197. body = res.body.to_s
  198. fid = id_from_url(u) || "file#{i+1}"
  199. name = filename_from_response(res, fid)
  200. path = File.join(dir, name)
  201. bytes = File.binwrite(path, body)
  202. puts " saved #{path} (#{bytes} bytes)"
  203. saved << path
  204. first_web_rel ||= "/files/dorset/#{safe_name(council_reference)}/#{File.basename(path)}"
  205. rescue StandardError => e
  206. Log.warn "scraper", "Download failed for #{u}: #{e.class} #{e.message}"
  207. end
  208. end
  209. if first_web_rel
  210. begin
  211. DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET local_document_url = ? WHERE council_reference = ?")
  212. .execute(first_web_rel, council_reference)
  213. rescue StandardError => e
  214. Log.warn "scraper", "Failed to set local_document_url for #{council_reference}: #{e.class} #{e.message}"
  215. end
  216. end
  217. saved
  218. end
  219. puts "Fetching Dorset list…"
  220. jar = {}
  221. list_res = dorset_get(jar, LIST_URL)
  222. list_html = list_res.body
  223. list_items = parse_list(list_html)
  224. puts "Found #{list_items.length} items for #{TABLE}"
  225. list_items.each do |r|
  226. next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?
  227. detail_url = abs_url(r[:detail_href])
  228. doc_urls = []
  229. stages = []
  230. saved_paths = []
  231. if !detail_url.empty?
  232. begin
  233. detail_res = dorset_get(jar, detail_url)
  234. detail_html = detail_res.body
  235. # documents
  236. doc_urls = extract_doc_links(detail_html)
  237. saved_paths = DOWNLOAD_ATTACHMENTS ? download_all(doc_urls, jar, r[:council_reference]) : []
  238. # stages
  239. stages = parse_tasks(detail_html)
  240. # prefer Advertising/Public Notif dates if they exist
  241. if r[:on_notice_to].nil?
  242. adv = stages.find { |s| s[:stage_description].downcase.include?("advertising") || s[:stage_description].downcase.include?("public notif") }
  243. if adv
  244. r[:on_notice_to] = adv[:completed_date] || adv[:target_date]
  245. r[:on_notice_to_raw] = adv[:completed_raw] || adv[:target_raw]
  246. end
  247. end
  248. rescue StandardError => e
  249. Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
  250. end
  251. end
  252. representative = DOWNLOAD_ATTACHMENTS ? saved_paths.first.to_s : doc_urls.first.to_s
  253. # geocode
  254. geo = nil
  255. begin
  256. geo = Geocode.format_au(r[:address])
  257. rescue StandardError => e
  258. Log.warn "scraper", "Geocode error for #{r[:council_reference]}: #{e.class} #{e.message}"
  259. end
  260. council_reference = r[:council_reference][0,100]
  261. address = r[:address][0,255]
  262. # upsert main row
  263. DB.upsert(TABLE, {
  264. description: r[:description],
  265. date_received: r[:date_received],
  266. date_received_raw: r[:date_received_raw],
  267. on_notice_to: r[:on_notice_to],
  268. on_notice_to_raw: r[:on_notice_to_raw],
  269. address: address,
  270. council_reference: council_reference,
  271. applicant: r[:applicant],
  272. owner: r[:owner]
  273. })
  274. enrich_after_upsert!(
  275. table: TABLE,
  276. council_reference: council_reference,
  277. address: address
  278. )
  279. tn = DB.client.escape(TABLE)
  280. sql = %Q{
  281. SELECT address_std, lat, lng
  282. FROM `#{tn}`
  283. WHERE council_reference = ? AND address = ?
  284. LIMIT 1
  285. }
  286. begin
  287. row = DB.client.prepare(sql).execute(council_reference, address).first
  288. puts " enriched -> #{row ? row.inspect : 'nil'}"
  289. rescue StandardError => e
  290. Log.warn "scraper", " enriched probe failed: #{e.class} #{e.message}"
  291. end
  292. puts "Upserted #{r[:council_reference]} -> #{r[:address]} docs: #{doc_urls.length} saved: #{saved_paths.length} stages: #{stages.length}"
  293. end
  294. puts "Done #{TABLE}."