launcestoncity.rb 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. # launceston_eproperty.rb
  2. require "nokogiri"
  3. require "uri"
  4. require "fileutils"
  5. require "json"
  6. require "cgi"
  7. require_relative "../lib/http"
  8. require_relative "../lib/db"
  9. require_relative "../lib/util"
  10. require_relative "../lib/enrich"
  11. TABLE = ENV.fetch("TABLE_NAME")
  12. BASE_URL = "https://onlineservice.launceston.tas.gov.au"
  13. URL = ENV.fetch(
  14. "EPROPERTY_URL",
  15. "#{BASE_URL}/eProperty/P1/PublicNotices/AllPublicNotices.aspx?r=P1.LCC.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
  16. )
  17. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  18. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  19. SESSION_JAR = {} # shared cookie jar for ASP.NET session across requests
  20. HEADERS = {
  21. "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
  22. "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  23. "Accept-Language" => "en-AU,en;q=0.9",
  24. "Accept-Encoding" => "identity"
  25. }
  26. DB.ensure_table!(TABLE)
  27. COOKIE_HDR = "" # e.g. "ASP.NET_SessionId=xyz; Path=/eProperty; HttpOnly"
  28. def merge_set_cookie!(res)
  29. sc = res["set-cookie"]
  30. return if sc.nil? || sc.empty?
  31. # normalize to an array of cookie strings
  32. parts = sc.is_a?(Array) ? sc : sc.to_s.split(/,(?=[^;]+?=)/)
  33. # current cookie hash -> {name => value}
  34. cur = COOKIE_HDR.split(/;\s*/).map { |p| p.split("=", 2) }.to_h
  35. parts.each do |raw|
  36. kv = raw.split(";", 2).first
  37. name, val = kv.split("=", 2)
  38. next if name.to_s.strip.empty?
  39. cur[name.strip] = val.to_s
  40. end
  41. # rebuild Cookie header (just name=value; name2=value2)
  42. merged = cur.map { |k, v| "#{k}=#{v}" }.join("; ")
  43. Object.send(:remove_const, :COOKIE_HDR) rescue nil
  44. Object.const_set(:COOKIE_HDR, merged)
  45. end
  46. def http_get(url, referer: nil, jar: nil, headers: {})
  47. max_redirects = 5
  48. current_url = url
  49. last_res = nil
  50. loop do
  51. h = HEADERS.merge(headers || {})
  52. h["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?
  53. res = Http.request(
  54. URI.parse(current_url),
  55. headers: h,
  56. jar: (jar || {}), # harmless; we now control cookies explicitly
  57. referer: referer
  58. )
  59. last_res = res
  60. merge_set_cookie!(res) # <-- capture any Set-Cookie
  61. status = (res.respond_to?(:code) ? res.code : res["status"]).to_i rescue 200
  62. loc = res["location"] rescue nil
  63. if status.between?(300, 399) && loc && (max_redirects -= 1) >= 0
  64. referer = current_url
  65. current_url = URI.join(current_url, loc).to_s
  66. next
  67. end
  68. return res.body
  69. end
  70. end
  71. def kv_from_table(tbl)
  72. out = {}
  73. tbl.css("tr").each do |tr|
  74. k = tr.at_css("td.headerColumn")&.text&.strip
  75. v = tr.css("td")[1]&.text&.strip
  76. next if k.nil? || v.nil? || k.empty?
  77. out[k] = v
  78. end
  79. out
  80. end
  81. def absolute(base, href)
  82. return nil if href.to_s.empty?
  83. URI.join(base, href).to_s
  84. rescue URI::InvalidURIError
  85. nil
  86. end
  87. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  88. def filename_from_response(res, fallback)
  89. cd = res["content-disposition"].to_s
  90. if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
  91. return safe_name($1)
  92. end
  93. base = safe_name(fallback || "document")
  94. ct = res["content-type"].to_s.downcase
  95. ext = ct.include?("pdf") ? ".pdf" : ".bin"
  96. "#{base}#{ext}"
  97. end
  98. def variants_for_doc_list(url)
  99. u = URI.parse(url)
  100. q = URI.decode_www_form(u.query || "").to_h
  101. danum = q["DANUM"]
  102. key = q["KEY"]
  103. # base set (raw + encoded DANUM) on original path
  104. seeds = []
  105. unless danum.nil?
  106. # raw
  107. u_raw = u.dup
  108. u_raw.query = URI.encode_www_form(q.merge("DANUM" => danum))
  109. seeds << u_raw.to_s
  110. # encoded (let encode_www_form do it once)
  111. if danum.include?("/")
  112. u_enc = u.dup
  113. u_enc.query = URI.encode_www_form(q.merge("DANUM" => danum))
  114. seeds << u_enc.to_s
  115. end
  116. else
  117. seeds << u.to_s
  118. end
  119. # path case variants (/PublicNotices/ and /Publicnotices/)
  120. paths = seeds.flat_map do |s|
  121. s.include?("/PublicNotices/") ? [s, s.sub("/PublicNotices/", "/Publicnotices/")] :
  122. s.include?("/Publicnotices/") ? [s, s.sub("/Publicnotices/", "/PublicNotices/")] : [s]
  123. end
  124. # remove /P1/ variants
  125. paths2 = paths.flat_map do |s|
  126. s.include?("/eProperty/P1/") ? [s, s.sub("/eProperty/P1/", "/eProperty/")] : [s]
  127. end
  128. # add route params r & f (common ones for this site)
  129. with_routes = paths2.flat_map do |s|
  130. uri = URI.parse(s)
  131. qq = URI.decode_www_form(uri.query || "").to_h
  132. next [s] if qq.key?("r") && qq.key?("f")
  133. [
  134. s,
  135. begin
  136. uri2 = uri.dup
  137. uri2.query = URI.encode_www_form(qq.merge(
  138. "r" => "P1.LCC.WEBGUEST",
  139. "f" => "$P1.ESB.PUBNOT.VIW"
  140. ))
  141. uri2.to_s
  142. rescue URI::InvalidURIError
  143. s
  144. end
  145. ]
  146. end
  147. with_routes.uniq
  148. end
  149. # ---- update download_doc to accept the shared jar ----
  150. def download_doc(url, referer:, council_reference:, jar:)
  151. dir = File.join(DOWNLOAD_DIR, "launceston", safe_name(council_reference))
  152. FileUtils.mkdir_p(dir)
  153. h = { "Cookie" => COOKIE_HDR }.merge(HEADERS) # send the same browser-ish headers
  154. res = Http.request(URI.parse(url), headers: h, jar: jar, referer: referer)
  155. merge_set_cookie!(res)
  156. bytes = res.body
  157. fname = filename_from_response(res, File.basename(URI.parse(url).path))
  158. path = File.join(dir, fname)
  159. File.binwrite(path, bytes)
  160. path
  161. end
  162. def probe_common_docs(base_url:, key:, danum:, referer:)
  163. # danum may be URL-encoded; normalise first
  164. danum_raw = CGI.unescape(danum.to_s)
  165. # "DA0324/2025" -> "DA0324-2025"
  166. danum_slug = danum_raw.gsub("/", "-")
  167. names = [
  168. "Advertised plans",
  169. "Advertised Plans",
  170. "Onsite Notice",
  171. "Onsite notice",
  172. "Onsite Notice ", # trailing space variant seen on this site
  173. ]
  174. prefix = "#{BASE_URL}/eProperty/Publicnotices/#{key}/#{danum_slug} - "
  175. candidates = names.map { |n| "#{prefix}#{n}.pdf" }
  176. found = []
  177. candidates.each do |pdf_url|
  178. begin
  179. h = HEADERS.merge("Cookie" => (COOKIE_HDR || ""), "Range" => "bytes=0-0")
  180. # Use the doclist page itself as referer (some installs care)
  181. res = Http.request(URI.parse(pdf_url), headers: h, jar: {}, referer: referer)
  182. merge_set_cookie!(res)
  183. code = (res.respond_to?(:code) ? res.code : res["status"]).to_i rescue 200
  184. ct = res["content-type"].to_s.downcase
  185. if (code == 200 || code == 206) && ct.include?("pdf")
  186. local_rel = nil
  187. if DOWNLOAD_ATTACHMENTS
  188. begin
  189. saved = download_doc(pdf_url, referer: referer, council_reference: danum_raw, jar: SESSION_JAR)
  190. local_rel = "/files/launceston/#{safe_name(danum_raw)}/#{File.basename(saved)}"
  191. rescue => e
  192. warn "DOC download failed (probe) for #{danum_raw} #{File.basename(pdf_url)}: #{e.class} #{e.message}"
  193. end
  194. end
  195. found << { name: File.basename(pdf_url), url: pdf_url, local_url: local_rel }
  196. end
  197. rescue StandardError => e
  198. warn "[launcestoncity] probe failed for #{pdf_url}: #{e.class} #{e.message}"
  199. next
  200. end
  201. end
  202. found
  203. end
  204. html = http_get(URL, jar: SESSION_JAR)
  205. doc = Nokogiri::HTML(html)
  206. tables = doc.css("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid table.grid")
  207. kept = 0
  208. tables.each do |t|
  209. kv = kv_from_table(t)
  210. council_reference = kv["Application ID"].to_s.strip
  211. description = kv["Application Description"].to_s.strip
  212. address = kv["Property Address"].to_s.strip
  213. closing_raw = kv["Closing Date"].to_s.strip
  214. closing_date = Util.parse_aus_date(closing_raw)
  215. details_rel = t.at_css("a[href*='PublicNoticeDetails.aspx']")&.[]("href")
  216. info_url = absolute(URL, details_rel)
  217. next if council_reference.empty? || address.empty?
  218. # Base upsert (stores list-page fields; date_received comes from details page later)
  219. DB.upsert(TABLE, {
  220. council_reference: council_reference,
  221. description: description,
  222. address: address,
  223. closing_date: closing_date,
  224. closing_date_raw: closing_raw,
  225. info_url: info_url,
  226. applicant: "",
  227. owner: ""
  228. })
  229. # Enrich from details page + collect documents
  230. if info_url
  231. begin
  232. d_html = http_get(info_url, referer: URL, jar: SESSION_JAR)
  233. d_doc = Nokogiri::HTML(d_html)
  234. # Flatten all key/value grids into a single map
  235. details_kv = {}
  236. d_doc.css("#ctl00_Content_cusPageComponents_pnlPageComponents table.grid").each do |grid|
  237. details_kv.merge!(kv_from_table(grid)) { |_k, old, newv| old.to_s.strip.empty? ? newv : old }
  238. end
  239. applicant_name = details_kv["Applicant Name(s)"].to_s.strip
  240. status_text = details_kv["Status"].to_s.strip
  241. assigned_off = details_kv["Assigned Officer"].to_s.strip
  242. group_text = details_kv["Group"].to_s.strip
  243. category_text = details_kv["Category"].to_s.strip
  244. received_raw = details_kv["Application Received"].to_s.strip
  245. valid_raw = details_kv["Application Valid"].to_s.strip
  246. advertised_raw = details_kv["Advertised On"].to_s.strip
  247. legal_desc = details_kv["Property Legal Description"].to_s.strip
  248. received_date = Util.parse_aus_date(received_raw)
  249. valid_date = Util.parse_aus_date(valid_raw)
  250. advertised_date = Util.parse_aus_date(advertised_raw)
  251. # ---- Document listing page (docget.asp -> PNDocumentList) ----
  252. doc_list_url = nil
  253. # primary selector
  254. if (docget = d_doc.at_css("a[href*='docget.asp']"))
  255. doc_list_url = absolute(info_url, docget["href"])
  256. end
  257. # fallback: some instances link text varies or use different casing/paths
  258. if doc_list_url.nil?
  259. if (alt = d_doc.at_xpath("//a[contains(translate(text(),'CLICK','click'),'click') and contains(translate(text(),'DOCUMENT','document'),'document')]"))
  260. doc_list_url = absolute(info_url, alt["href"])
  261. end
  262. end
  263. documents = [] # [{name:, url:, local_url:}, ...]
  264. if doc_list_url
  265. begin
  266. list_html = http_get(doc_list_url, referer: info_url, jar: SESSION_JAR)
  267. list_doc = Nokogiri::HTML(list_html)
  268. doc_anchors = list_doc.css("#PNDocumentList a")
  269. if doc_anchors.empty?
  270. # Fallbacks (case-insensitive) via XPath:
  271. doc_anchors = list_doc.xpath(
  272. "//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
  273. "//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
  274. )
  275. end
  276. documents = [] if documents.nil?
  277. anchors_added = 0
  278. used_url = nil
  279. referers = [
  280. info_url, # details page
  281. URL, # notices list page
  282. "#{BASE_URL}/eProperty/" # root
  283. ]
  284. variants_for_doc_list(doc_list_url).each do |candidate_url|
  285. break if anchors_added > 0
  286. referers.each do |ref|
  287. break if anchors_added > 0
  288. begin
  289. list_html = http_get(candidate_url, referer: ref, jar: SESSION_JAR)
  290. list_doc = Nokogiri::HTML(list_html)
  291. # Strict then fallback selectors
  292. doc_anchors = list_doc.css("#PNDocumentList a")
  293. if doc_anchors.empty?
  294. doc_anchors = list_doc.xpath(
  295. "//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
  296. "//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
  297. )
  298. end
  299. doc_anchors.each do |a|
  300. name = a.text.strip
  301. href = absolute(candidate_url, a["href"])
  302. next if href.nil? || (name.empty? && href.to_s.strip.empty?)
  303. local_rel = nil
  304. if DOWNLOAD_ATTACHMENTS
  305. begin
  306. saved = download_doc(href, referer: candidate_url, council_reference: council_reference, jar: SESSION_JAR)
  307. local_rel = "/files/launceston/#{safe_name(council_reference)}/#{File.basename(saved)}"
  308. rescue => e
  309. warn "DOC download failed for #{council_reference} #{name}: #{e.class} #{e.message}"
  310. end
  311. end
  312. documents << { name: (name.empty? ? File.basename(href) : name), url: href, local_url: local_rel }
  313. anchors_added += 1
  314. end
  315. # Regex fallback
  316. if anchors_added == 0
  317. # Final fallback: probe known filenames directly
  318. # Extract KEY and DANUM from the original doc_list_url
  319. begin
  320. u = URI.parse(doc_list_url)
  321. q = URI.decode_www_form(u.query || "").to_h
  322. key = q["KEY"]
  323. danum = q["DANUM"] || council_reference
  324. if key && danum
  325. probed = probe_common_docs(
  326. base_url: BASE_URL,
  327. key: key,
  328. danum: danum,
  329. referer: doc_list_url # better context for this server
  330. )
  331. documents.concat(probed)
  332. anchors_added = probed.size if probed.any?
  333. end
  334. rescue => e
  335. warn "Probe fallback failed for #{council_reference}: #{e.class} #{e.message}"
  336. end
  337. if anchors_added == 0
  338. warn "Docs page had no usable links for #{council_reference} after variants."
  339. end
  340. end
  341. if anchors_added > 0
  342. used_url = candidate_url
  343. puts "Docs list for #{council_reference}: #{candidate_url} (referer: #{ref})"
  344. break
  345. else
  346. # Save the first empty response body to inspect (once per app)
  347. begin
  348. dump_dir = "/app/tmp/launceston_doclist_dumps"
  349. FileUtils.mkdir_p(dump_dir)
  350. File.write(File.join(dump_dir, "#{safe_name(council_reference)}.html"), list_html[0, 5000])
  351. rescue => e
  352. warn "Failed to write dump for #{council_reference}: #{e.class} #{e.message}"
  353. end
  354. end
  355. rescue => e
  356. warn "Doc list fetch failed for #{council_reference} at #{candidate_url} (referer: #{ref}): #{e.class} #{e.message}"
  357. end
  358. end
  359. end
  360. if used_url.nil?
  361. warn "Docs page had no usable links for #{council_reference} after variants: #{variants_for_doc_list(doc_list_url).join(' | ')}"
  362. end
  363. rescue => e
  364. warn "Doc list fetch failed for #{council_reference}: #{e.class} #{e.message}"
  365. end
  366. end
  367. first_doc_url = documents.first&.dig(:url)
  368. first_local = documents.first&.dig(:local_url)
  369. puts "Docs list for #{council_reference}: #{doc_list_url}" if doc_list_url
  370. puts "Found #{documents.size} docs for #{council_reference}" if doc_list_url
  371. DB.upsert(TABLE, {
  372. # --- always include your base fields again ---
  373. council_reference: council_reference,
  374. description: description,
  375. address: address,
  376. info_url: info_url,
  377. on_notice_to: closing_date,
  378. on_notice_to_raw: closing_raw,
  379. # --- enrich fields from details page ---
  380. applicant: applicant_name,
  381. status: status_text,
  382. assigned_officer: assigned_off,
  383. group: group_text,
  384. category: category_text,
  385. date_received: received_date,
  386. date_received_raw: received_raw,
  387. application_valid: valid_date,
  388. application_valid_raw: valid_raw,
  389. advertised_on: advertised_date,
  390. advertised_on_raw: advertised_raw,
  391. property_legal_description: legal_desc,
  392. # --- documents ---
  393. pdf_url: first_doc_url, # <-- was document_url
  394. local_document_url: first_local, # keep
  395. documents_json: JSON.generate(documents) # full set
  396. })
  397. rescue => e
  398. warn "Enrich failed for #{council_reference}: #{e.class} #{e.message}"
  399. end
  400. end
  401. enrich_after_upsert!(
  402. table: TABLE,
  403. council_reference: council_reference,
  404. address: address
  405. #info_url: info_url
  406. )
  407. puts "Upserted #{council_reference} | #{address} (closes #{closing_raw})"
  408. kept += 1
  409. end
  410. puts "Done #{TABLE}. Found #{kept}, saved #{kept}."