launcestoncity.rb 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. # launceston_eproperty.rb
  2. require "nokogiri"
  3. require "uri"
  4. require "fileutils"
  5. require "json"
  6. require "cgi"
  7. require_relative "../lib/http"
  8. require_relative "../lib/db"
  9. require_relative "../lib/util"
  10. require_relative "../lib/enrich"
  # ---- Runtime configuration ----

  # Destination table name. TABLE_NAME is mandatory: ENV.fetch raises KeyError
  # when it is not set.
  TABLE = ENV.fetch("TABLE_NAME")

  # Scheme and host of the council's online-services site.
  BASE_URL = "https://onlineservice.launceston.tas.gov.au"

  # Listing page that is scraped; EPROPERTY_URL overrides the built-in default.
  URL = ENV.fetch(
    "EPROPERTY_URL",
    "#{BASE_URL}/eProperty/P1/PublicNotices/AllPublicNotices.aspx?r=P1.LCC.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
  )

  # Attachments are saved locally only when DOWNLOAD_ATTACHMENTS is exactly "1".
  DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  # Root directory for saved attachments.
  DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

  # Jar object handed to Http.request by the callers in this file.
  SESSION_JAR = {}
  # Current Cookie request header ("name=value; name2=value2").
  # merge_set_cookie! re-binds this constant whenever a response sets cookies.
  COOKIE_HDR = ""

  # Request headers sent with every call made from this file.
  HEADERS = {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language" => "en-AU,en;q=0.9",
    "Accept-Encoding" => "identity"
  }

  # Runs at load time, before any scraping starts.
  DB.ensure_table!(TABLE)
  # Folds the cookies announced by a response into the shared COOKIE_HDR value.
  #
  # Reads res["set-cookie"] (a String, or an Array of Strings), keeps only the
  # leading name=value pair of each cookie (attributes such as Path or Expires
  # are dropped), overlays those pairs on the pairs already held in COOKIE_HDR
  # and re-binds COOKIE_HDR to the merged "name=value; name2=value2" string.
  #
  # @param res [#[]] response-like object that answers header lookups
  # @return [String, nil] the merged header value, or nil when the response
  #   carried no Set-Cookie header (COOKIE_HDR is then left untouched)
  def merge_set_cookie!(res)
    set_cookie = res["set-cookie"]
    return if set_cookie.nil? || set_cookie.empty?

    # A single joined header is split on commas that introduce a new
    # "name=value" pair.
    raw_cookies = set_cookie.is_a?(Array) ? set_cookie : set_cookie.to_s.split(/,(?=[^;]+?=)/)

    # Existing header -> {name => value}; malformed fragments are ignored.
    jar = COOKIE_HDR.split(/;\s*/).filter_map do |pair|
      name_value = pair.split("=", 2)
      name_value if name_value.size == 2
    end.to_h

    raw_cookies.each do |raw|
      name, value = raw.split(";", 2).first.to_s.split("=", 2)
      # A fragment without "=" is not a cookie pair. The comma split above can
      # produce one from the comma inside an "Expires=Wed, 21 Oct ..." date
      # when another cookie follows it; storing it would add a bogus cookie.
      next if value.nil? || name.to_s.strip.empty?

      jar[name.strip] = value
    end

    merged = jar.map { |name, value| "#{name}=#{value}" }.join("; ")
    # COOKIE_HDR is a constant, so it is replaced rather than reassigned.
    Object.send(:remove_const, :COOKIE_HDR) if Object.const_defined?(:COOKIE_HDR, false)
    Object.const_set(:COOKIE_HDR, merged)
  end
  # Fetches +url+ through Http.request and returns the body of the final
  # response, following up to five redirects.
  #
  # Each request carries HEADERS merged with +headers+, plus the current
  # COOKIE_HDR when it is non-empty; every response is passed to
  # merge_set_cookie!. On a redirect the previous URL becomes the referer and
  # the Location value is resolved against it. Once the redirect allowance is
  # used up, the body of the redirect response itself is returned.
  #
  # @param url [String] address to fetch
  # @param referer [String, nil] referer for the first request
  # @param jar [Object, nil] forwarded to Http.request ({} when nil)
  # @param headers [Hash, nil] extra request headers
  # @return the body of the last response received
  def http_get(url, referer: nil, jar: nil, headers: {})
    redirects_left = 5
    target = url

    loop do
      request_headers = HEADERS.merge(headers || {})
      request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

      res = Http.request(
        URI.parse(target),
        headers: request_headers,
        jar: (jar || {}),
        referer: referer
      )
      merge_set_cookie!(res)

      # Any failure while reading the status is treated as a plain 200.
      status =
        begin
          (res.respond_to?(:code) ? res.code : res["status"]).to_i
        rescue StandardError
          200
        end
      location =
        begin
          res["location"]
        rescue StandardError
          nil
        end

      # The allowance is only consumed by an actual redirect with a Location.
      follow = status.between?(300, 399) && location && (redirects_left -= 1) >= 0
      return res.body unless follow

      referer = target
      target = URI.join(target, location).to_s
    end
  end
  # Turns a two-column key/value grid into a Hash.
  #
  # For every <tr>, the label is the stripped text of its "td.headerColumn"
  # cell and the value is the stripped text of its second <td>. Rows without a
  # label cell, with a blank label, or without a second cell are skipped; a
  # label that repeats keeps the value of its last row.
  #
  # @param tbl [#css] table node (Nokogiri element in this script)
  # @return [Hash{String => String}]
  def kv_from_table(tbl)
    tbl.css("tr").each_with_object({}) do |row, pairs|
      label = row.at_css("td.headerColumn")&.text&.strip
      value = row.css("td")[1]&.text&.strip
      pairs[label] = value unless label.nil? || value.nil? || label.empty?
    end
  end
  # Resolves +href+ against +base+ and returns the absolute URL as a String.
  #
  # Surrounding whitespace is stripped and bare spaces are percent-encoded
  # first, because URI.join rejects them and the link would otherwise be lost.
  #
  # @param base [String] absolute URL of the page the link was found on
  # @param href [String, nil] link target as written in the page
  # @return [String, nil] nil when +href+ is blank or cannot be resolved
  def absolute(base, href)
    target = href.to_s.strip
    return nil if target.empty?

    URI.join(base, target.gsub(" ", "%20")).to_s
  rescue URI::Error
    # URI::Error covers both InvalidURIError (unparseable input) and
    # BadURIError (for example, two relative URIs).
    nil
  end
  # Reduces +s+ to a string that is safe to use as a single path segment.
  #
  # Every run of characters other than word characters, "-" and "." becomes
  # one "_". A result made only of dots ("." or "..") would still act as a
  # directory reference when passed to File.join, so its dots are replaced too.
  #
  # @param s [#to_s] arbitrary label (nil becomes "")
  # @return [String]
  def safe_name(s)
    cleaned = s.to_s.gsub(/[^\w\-.]+/, "_")
    cleaned.match?(/\A\.+\z/) ? cleaned.tr(".", "_") : cleaned
  end
  # Chooses a local file name for a downloaded response.
  #
  # A filename (or filename*) parameter in Content-Disposition wins; the
  # "UTF-8''" prefix is matched case-insensitively. Otherwise the name is built
  # from +fallback+ ("document" when nil or empty):
  # * PDF content type: ".pdf" is appended unless the name already ends in it;
  # * any other type: ".bin" is appended only when the name has no extension.
  # Every name is passed through safe_name.
  #
  # @param res [#[]] response-like object that answers header lookups
  # @param fallback [String, nil] name to use when no header provides one
  # @return [String]
  def filename_from_response(res, fallback)
    disposition = res["content-disposition"].to_s
    if (match = disposition.match(/filename\*?=(?:UTF-8''|")?([^";]+)/i))
      return safe_name(match[1])
    end

    base = safe_name(fallback || "document")
    base = "document" if base.empty?

    if res["content-type"].to_s.downcase.include?("pdf")
      # Do not turn "plans.pdf" into "plans.pdf.pdf".
      base.downcase.end_with?(".pdf") ? base : "#{base}.pdf"
    else
      File.extname(base).empty? ? "#{base}.bin" : base
    end
  end
  # Builds the list of alternative URLs to try for a document-list page.
  #
  # Starting from +url+ the variants are, in order:
  # 1. the URL as given, plus (when it has a DANUM parameter) the same URL with
  #    its query string re-encoded by URI.encode_www_form, so a "/" inside
  #    DANUM becomes "%2F";
  # 2. each of those with the other spelling of the /PublicNotices/ segment;
  # 3. each of those with the /P1/ segment after /eProperty removed;
  # 4. each of those with r/f routing parameters added, unless both are
  #    already present.
  # Duplicates are removed and the given URL is always first.
  #
  # @param url [String] document-list URL taken from the details page
  # @return [Array<String>] unique candidate URLs
  # @raise [URI::InvalidURIError] when +url+ cannot be parsed
  def variants_for_doc_list(url)
    u = URI.parse(url)
    q = URI.decode_www_form(u.query || "").to_h

    seeds = [u.to_s]
    if q.key?("DANUM")
      encoded = u.dup
      encoded.query = URI.encode_www_form(q)
      seeds << encoded.to_s
    end

    # Path case variants (/PublicNotices/ and /Publicnotices/).
    cased = seeds.flat_map do |s|
      if s.include?("/PublicNotices/")
        [s, s.sub("/PublicNotices/", "/Publicnotices/")]
      elsif s.include?("/Publicnotices/")
        [s, s.sub("/Publicnotices/", "/PublicNotices/")]
      else
        [s]
      end
    end

    # Variants without the /P1/ segment.
    unprefixed = cased.flat_map do |s|
      s.include?("/eProperty/P1/") ? [s, s.sub("/eProperty/P1/", "/eProperty/")] : [s]
    end

    # Variants carrying the r and f routing parameters.
    routed = unprefixed.flat_map do |s|
      begin
        uri = URI.parse(s)
        params = URI.decode_www_form(uri.query || "").to_h
        next [s] if params.key?("r") && params.key?("f")

        uri.query = URI.encode_www_form(
          params.merge("r" => "P1.LCC.WEBGUEST", "f" => "$P1.ESB.PUBNOT.VIW")
        )
        [s, uri.to_s]
      rescue URI::InvalidURIError
        [s]
      end
    end

    routed.uniq
  end
  # Downloads one attachment and stores it under
  # DOWNLOAD_DIR/launceston/<safe council reference>/.
  #
  # The request uses HEADERS plus the current COOKIE_HDR (only when it is
  # non-empty), and the response is passed to merge_set_cookie!. A response
  # whose status can be read and is outside 2xx is rejected, so an error or
  # redirect body is not saved as if it were the document.
  #
  # @param url [String] absolute URL of the attachment
  # @param referer [String] referer sent with the request
  # @param council_reference [String] application id; names the sub-directory
  # @param jar [Object] forwarded to Http.request
  # @return [String] path of the file that was written
  # @raise [RuntimeError] when the response status is not 2xx
  def download_doc(url, referer:, council_reference:, jar:)
    uri = URI.parse(url)
    request_headers = HEADERS.dup
    request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

    res = Http.request(uri, headers: request_headers, jar: jar, referer: referer)
    merge_set_cookie!(res)

    # Same status probing as http_get. 0 means "status not available" and is
    # let through so responses without a readable status keep working.
    status =
      begin
        (res.respond_to?(:code) ? res.code : res["status"]).to_i
      rescue StandardError
        200
      end
    raise "HTTP #{status} for #{url}" unless status.zero? || status.between?(200, 299)

    dir = File.join(DOWNLOAD_DIR, "launceston", safe_name(council_reference))
    FileUtils.mkdir_p(dir)
    path = File.join(dir, filename_from_response(res, File.basename(uri.path)))
    File.binwrite(path, res.body)
    path
  end
  # Probes a fixed set of well-known attachment file names for one application
  # and returns those that exist.
  #
  # Candidate URLs have the form
  #   <base_url>/eProperty/Publicnotices/<key>/<DANUM with "/" -> "-"> - <name>.pdf
  # with spaces percent-encoded. Each candidate is requested with a
  # "Range: bytes=0-0" header and counts as found when the status is 200 or
  # 206 and the content type mentions "pdf". When DOWNLOAD_ATTACHMENTS is set,
  # found files are also saved through download_doc. A failure on one
  # candidate is logged and the remaining candidates are still tried.
  #
  # @param base_url [String] scheme + host of the site (a trailing "/" is fine)
  # @param key [String] KEY value from the document-list URL
  # @param danum [String] application number, raw or URL-encoded
  # @param referer [String] referer sent with each probe
  # @return [Array<Hash>] entries of the form {name:, url:, local_url:}
  def probe_common_docs(base_url:, key:, danum:, referer:)
    # danum may be URL-encoded; normalise first ("DA0324%2F2025" -> "DA0324/2025").
    danum_raw = CGI.unescape(danum.to_s)
    # "DA0324/2025" -> "DA0324-2025"
    danum_slug = danum_raw.tr("/", "-")
    root = base_url.to_s.chomp("/")

    names = [
      "Advertised plans",
      "Advertised Plans",
      "Onsite Notice",
      "Onsite notice",
      "Onsite Notice " # trailing space variant seen on this site
    ]

    # URI.parse rejects bare spaces, so the file name portion is encoded.
    candidates = names.map do |n|
      encoded = "#{danum_slug} - #{n}.pdf".gsub(" ", "%20")
      "#{root}/eProperty/Publicnotices/#{key}/#{encoded}"
    end

    candidates.each_with_object([]) do |pdf_url, found|
      begin
        # Headers are rebuilt per request because COOKIE_HDR may be re-bound
        # by merge_set_cookie! between probes.
        request_headers = HEADERS.merge("Range" => "bytes=0-0")
        request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.to_s.empty?

        res = Http.request(URI.parse(pdf_url), headers: request_headers, jar: {}, referer: referer)
        merge_set_cookie!(res)

        code =
          begin
            (res.respond_to?(:code) ? res.code : res["status"]).to_i
          rescue StandardError
            200
          end
        next unless [200, 206].include?(code) && res["content-type"].to_s.downcase.include?("pdf")

        local_rel = nil
        if DOWNLOAD_ATTACHMENTS
          begin
            saved = download_doc(pdf_url, referer: referer, council_reference: danum_raw, jar: SESSION_JAR)
            local_rel = "/files/launceston/#{safe_name(danum_raw)}/#{File.basename(saved)}"
          rescue StandardError => e
            warn "DOC download failed (probe) for #{danum_raw} #{File.basename(pdf_url)}: #{e.class} #{e.message}"
          end
        end

        found << { name: File.basename(pdf_url), url: pdf_url, local_url: local_rel }
      rescue StandardError => e
        warn "[launcestoncity] probe failed for #{pdf_url}: #{e.class} #{e.message}"
      end
    end
  end
  # Returns the link nodes of a parsed document-list page: the anchors inside
  # #PNDocumentList when there are any, otherwise anchors inside any <ul> whose
  # id contains "documentlist" (case-insensitive) plus anchors whose href
  # contains ".pdf" (case-insensitive).
  def doc_list_anchors(list_doc)
    anchors = list_doc.css("#PNDocumentList a")
    return anchors unless anchors.empty?

    list_doc.xpath(
      "//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
      "//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
    )
  end

  # Collects the attachments of one application.
  #
  # The published document-list URL is tried first, then every variant from
  # variants_for_doc_list, each with three referers (details page, notices
  # list, site root), until one page yields links. After the first attempt that
  # yields nothing, probe_common_docs is tried once. The first empty response
  # body is dumped to /app/tmp/launceston_doclist_dumps for inspection.
  # Failures are logged with warn and never raised to the caller.
  #
  # @return [Array<Hash>] entries of the form {name:, url:, local_url:}
  def collect_documents(doc_list_url, info_url:, council_reference:)
    documents = []
    # The link exactly as published goes first so its response is actually used.
    candidates = [doc_list_url, *variants_for_doc_list(doc_list_url)].uniq
    referers = [
      info_url,                # details page
      URL,                     # notices list page
      "#{BASE_URL}/eProperty/" # root
    ]
    used_url = nil
    probe_done = false   # probe_common_docs fires at most once per application
    dump_written = false # keep only the first empty response

    candidates.each do |candidate_url|
      break if documents.any?

      referers.each do |ref|
        break if documents.any?

        begin
          list_html = http_get(candidate_url, referer: ref, jar: SESSION_JAR)
          list_doc = Nokogiri::HTML(list_html)

          doc_list_anchors(list_doc).each do |a|
            name = a.text.strip
            href = absolute(candidate_url, a["href"])
            next if href.to_s.strip.empty?

            local_rel = nil
            if DOWNLOAD_ATTACHMENTS
              begin
                saved = download_doc(href, referer: candidate_url, council_reference: council_reference, jar: SESSION_JAR)
                local_rel = "/files/launceston/#{safe_name(council_reference)}/#{File.basename(saved)}"
              rescue StandardError => e
                warn "DOC download failed for #{council_reference} #{name}: #{e.class} #{e.message}"
              end
            end
            documents << { name: (name.empty? ? File.basename(href) : name), url: href, local_url: local_rel }
          end

          # Fallback: probe known file names directly (once per application).
          if documents.empty? && !probe_done
            probe_done = true
            begin
              q = URI.decode_www_form(URI.parse(doc_list_url).query || "").to_h
              key = q["KEY"]
              danum = q["DANUM"] || council_reference
              if key && danum
                documents.concat(
                  probe_common_docs(base_url: BASE_URL, key: key, danum: danum, referer: doc_list_url)
                )
              end
            rescue StandardError => e
              warn "Probe fallback failed for #{council_reference}: #{e.class} #{e.message}"
            end
          end

          if documents.any?
            used_url = candidate_url
            puts "Docs list for #{council_reference}: #{candidate_url} (referer: #{ref})"
          elsif !dump_written
            begin
              dump_dir = "/app/tmp/launceston_doclist_dumps"
              FileUtils.mkdir_p(dump_dir)
              File.write(File.join(dump_dir, "#{safe_name(council_reference)}.html"), list_html[0, 5000])
              dump_written = true
            rescue StandardError => e
              warn "Failed to write dump for #{council_reference}: #{e.class} #{e.message}"
            end
          end
        rescue StandardError => e
          warn "Doc list fetch failed for #{council_reference} at #{candidate_url} (referer: #{ref}): #{e.class} #{e.message}"
        end
      end
    end

    if used_url.nil?
      warn "Docs page had no usable links for #{council_reference} after variants: #{candidates.join(' | ')}"
    end
    documents
  rescue StandardError => e
    warn "Doc list fetch failed for #{council_reference}: #{e.class} #{e.message}"
    documents
  end

  # ---- Main: scrape the listing page, then enrich each application ----
  html = http_get(URL, jar: SESSION_JAR)
  doc = Nokogiri::HTML(html)
  tables = doc.css("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid table.grid")
  kept = 0

  tables.each do |t|
    kv = kv_from_table(t)
    council_reference = kv["Application ID"].to_s.strip
    description = kv["Application Description"].to_s.strip
    address = kv["Property Address"].to_s.strip
    closing_raw = kv["Closing Date"].to_s.strip
    closing_date = Util.parse_aus_date(closing_raw)
    details_rel = t.at_css("a[href*='PublicNoticeDetails.aspx']")&.[]("href")
    info_url = absolute(URL, details_rel)
    next if council_reference.empty? || address.empty?

    # Base upsert: stores the list-page fields; the details page is merged in
    # by the second upsert below.
    DB.upsert(TABLE, {
      council_reference: council_reference,
      description: description,
      address: address,
      closing_date: closing_date,
      closing_date_raw: closing_raw,
      info_url: info_url,
      applicant: "",
      owner: ""
    })

    # Enrich from the details page and collect documents.
    if info_url
      begin
        d_html = http_get(info_url, referer: URL, jar: SESSION_JAR)
        d_doc = Nokogiri::HTML(d_html)

        # Flatten all key/value grids into one map; a non-blank value seen
        # earlier wins over a later one for the same label.
        details_kv = {}
        d_doc.css("#ctl00_Content_cusPageComponents_pnlPageComponents table.grid").each do |grid|
          details_kv.merge!(kv_from_table(grid)) { |_k, old, newv| old.to_s.strip.empty? ? newv : old }
        end
        detail = ->(label) { details_kv[label].to_s.strip }

        applicant_name = detail.call("Applicant Name(s)")
        status_text = detail.call("Status")
        assigned_off = detail.call("Assigned Officer")
        group_text = detail.call("Group")
        category_text = detail.call("Category")
        received_raw = detail.call("Application Received")
        valid_raw = detail.call("Application Valid")
        advertised_raw = detail.call("Advertised On")
        legal_desc = detail.call("Property Legal Description")

        received_date = Util.parse_aus_date(received_raw)
        valid_date = Util.parse_aus_date(valid_raw)
        advertised_date = Util.parse_aus_date(advertised_raw)

        # ---- Document listing page (docget.asp -> PNDocumentList) ----
        doc_list_url = nil
        if (docget = d_doc.at_css("a[href*='docget.asp']"))
          doc_list_url = absolute(info_url, docget["href"])
        end
        # Fallback: a link whose text mentions both "click" and "document".
        if doc_list_url.nil?
          if (alt = d_doc.at_xpath("//a[contains(translate(text(),'CLICK','click'),'click') and contains(translate(text(),'DOCUMENT','document'),'document')]"))
            doc_list_url = absolute(info_url, alt["href"])
          end
        end

        # [{name:, url:, local_url:}, ...]
        documents =
          if doc_list_url
            collect_documents(doc_list_url, info_url: info_url, council_reference: council_reference)
          else
            []
          end

        first_doc_url = documents.dig(0, :url)
        first_local = documents.dig(0, :local_url)
        puts "Docs list for #{council_reference}: #{doc_list_url}" if doc_list_url
        puts "Found #{documents.size} docs for #{council_reference}" if doc_list_url

        DB.upsert(TABLE, {
          # --- base fields again ---
          council_reference: council_reference,
          description: description,
          address: address,
          info_url: info_url,
          on_notice_to: closing_date,
          on_notice_to_raw: closing_raw,
          # --- fields from the details page ---
          applicant: applicant_name,
          status: status_text,
          assigned_officer: assigned_off,
          group: group_text,
          category: category_text,
          date_received: received_date,
          date_received_raw: received_raw,
          application_valid: valid_date,
          application_valid_raw: valid_raw,
          advertised_on: advertised_date,
          advertised_on_raw: advertised_raw,
          property_legal_description: legal_desc,
          # --- documents ---
          pdf_url: first_doc_url,
          local_document_url: first_local,
          documents_json: JSON.generate(documents) # full set
        })
      rescue StandardError => e
        warn "Enrich failed for #{council_reference}: #{e.class} #{e.message}"
      end
    end

    enrich_after_upsert!(
      table: TABLE,
      council_reference: council_reference,
      address: address
    )
    puts "Upserted #{council_reference} | #{address} (closes #{closing_raw})"
    kept += 1
  end

  puts "Done #{TABLE}. Found #{kept}, saved #{kept}."