# launcestoncity.rb
  1. # launceston_eproperty.rb
  2. require "nokogiri"
  3. require "uri"
  4. require "fileutils"
  5. require "json"
  6. require "cgi"
  7. require_relative "../lib/enrich"
  8. require_relative "../lib/log"
  9. require_relative "../lib/util"
  # ---- Configuration -------------------------------------------------------

  # Destination table name; ENV.fetch raises KeyError when TABLE_NAME is unset.
  TABLE = ENV.fetch("TABLE_NAME")

  BASE_URL = "https://onlineservice.launceston.tas.gov.au"

  # Public-notices listing page; EPROPERTY_URL overrides the built-in default.
  URL = ENV.fetch("EPROPERTY_URL") do
    "#{BASE_URL}/eProperty/P1/PublicNotices/AllPublicNotices.aspx?r=P1.LCC.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
  end

  # Attachments are only written to disk when DOWNLOAD_ATTACHMENTS is exactly "1".
  DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

  # Shared cookie jar for the ASP.NET session across requests.
  SESSION_JAR = {}

  # Browser-like request headers sent with every request.
  HEADERS = {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language" => "en-AU,en;q=0.9",
    "Accept-Encoding" => "identity"
  }

  DB.ensure_table!(TABLE)

  # Current "name=value; name2=value2" Cookie header; this constant is replaced
  # (not mutated) by merge_set_cookie! whenever a response sets cookies.
  # e.g. "ASP.NET_SessionId=xyz"
  COOKIE_HDR = ""
  # Folds the cookies announced by +res+ into the COOKIE_HDR constant.
  #
  # +res+ must answer <tt>res["set-cookie"]</tt>. When it also answers
  # +get_fields+ (as Net::HTTPResponse does) the individual Set-Cookie values
  # are read from there, which avoids guessing where one cookie ends inside a
  # comma-joined header string. Cookie attributes (Path, Expires, HttpOnly, ...)
  # are discarded; only "name=value" pairs are kept, and a later value for the
  # same name replaces the earlier one.
  #
  # Returns the new header string, or nil when the response set no cookies.
  def merge_set_cookie!(res)
    fields = res.get_fields("set-cookie") if res.respond_to?(:get_fields)
    fields = res["set-cookie"] if fields.nil?
    return if fields.nil? || fields.empty?

    # A single joined string is split on commas that are followed by
    # "name=" before the next ";" so that commas inside Expires dates survive.
    raw_cookies = fields.is_a?(Array) ? fields : fields.to_s.split(/,(?=[^;]+?=)/)

    # Current header -> { name => value }
    cookies = {}
    COOKIE_HDR.split(/;\s*/).each do |pair|
      name, value = pair.split("=", 2)
      cookies[name] = value.to_s unless name.to_s.empty?
    end

    raw_cookies.each do |raw|
      name, value = raw.to_s.split(";", 2).first.to_s.split("=", 2)
      next if name.to_s.strip.empty?

      cookies[name.strip] = value.to_s
    end

    merged = cookies.map { |name, value| "#{name}=#{value}" }.join("; ")

    # COOKIE_HDR is a constant, so it is swapped out rather than reassigned
    # (plain reassignment inside a method is a syntax error).
    Object.send(:remove_const, :COOKIE_HDR) if Object.const_defined?(:COOKIE_HDR, false)
    Object.const_set(:COOKIE_HDR, merged)
  end
  # Fetches +url+ and returns the response body, following redirects by hand so
  # that cookies set on intermediate responses are captured.
  #
  # url     - String URL to request.
  # referer - optional referer for the first request; each followed redirect
  #           uses the URL it came from as referer.
  # jar     - passed through to Http.request (an empty Hash when nil); cookies
  #           are actually carried via COOKIE_HDR.
  # headers - extra request headers merged over HEADERS.
  #
  # At most 5 redirects are followed; once the budget is spent the body of the
  # last (redirect) response is returned as-is. Errors raised by Http.request,
  # URI.parse or URI.join propagate to the caller.
  def http_get(url, referer: nil, jar: nil, headers: {})
    redirects_left = 5
    current_url = url

    loop do
      request_headers = HEADERS.merge(headers || {})
      request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

      res = Http.request(
        URI.parse(current_url),
        headers: request_headers,
        jar: (jar || {}), # harmless; cookies are controlled explicitly
        referer: referer
      )
      merge_set_cookie!(res) # capture any Set-Cookie before deciding what to do

      # The response may expose its status as #code or as a "status" entry;
      # when neither can be read it is treated as a plain 200.
      status = begin
        (res.respond_to?(:code) ? res.code : res["status"]).to_i
      rescue StandardError
        200
      end
      location = begin
        res["location"]
      rescue StandardError
        nil
      end

      is_redirect = status.between?(300, 399) && location
      return res.body unless is_redirect && redirects_left.positive?

      redirects_left -= 1
      referer = current_url
      current_url = URI.join(current_url, location).to_s
    end
  end
  # Turns a two-column grid table into a Hash of label => value.
  #
  # For every <tr> the label is the stripped text of the td.headerColumn cell
  # and the value is the stripped text of the row's second <td>. Rows lacking
  # either cell, or whose label is blank, are ignored; a repeated label keeps
  # the value of the last row that carries it.
  def kv_from_table(tbl)
    tbl.css("tr").each_with_object({}) do |row, pairs|
      label = row.at_css("td.headerColumn")&.text&.strip
      value = row.css("td")[1]&.text&.strip
      pairs[label] = value unless label.nil? || value.nil? || label.empty?
    end
  end
  # Resolves +href+ against +base+ and returns the absolute URL as a String.
  #
  # Surrounding whitespace in +href+ is ignored. Returns nil when +href+ is
  # nil/blank or when the pair cannot be resolved: any URI::Error is treated as
  # "no link", which covers malformed input (URI::InvalidURIError) as well as a
  # relative +base+ combined with a relative +href+ (URI::BadURIError).
  def absolute(base, href)
    target = href.to_s.strip
    return nil if target.empty?

    URI.join(base, target).to_s
  rescue URI::Error
    nil
  end
  # Reduces +s+ to a string that is safe to use as a single path component.
  #
  # Every run of characters other than word characters, "-" and "." collapses
  # to one "_". A result made up solely of dots ("." or "..") has its dots
  # replaced too, so the name can never address the current or parent
  # directory when joined onto a download path. nil yields "".
  def safe_name(s)
    cleaned = s.to_s.gsub(/[^\w\-.]+/, "_")
    cleaned.match?(/\A\.+\z/) ? cleaned.tr(".", "_") : cleaned
  end
  # Chooses a local file name for a downloaded response.
  #
  # res      - response answering res["content-disposition"] and
  #            res["content-type"].
  # fallback - name to use when the response carries no filename, typically
  #            the last path segment of the URL; nil or "" means "document".
  #
  # A filename (or filename*) parameter in Content-Disposition wins. Otherwise
  # the sanitised fallback is used with ".pdf" appended when the content type
  # mentions pdf and ".bin" otherwise; the extension is not appended again when
  # the fallback already ends with it (compared case-insensitively).
  def filename_from_response(res, fallback)
    disposition = res["content-disposition"].to_s
    declared = disposition.match(/filename\*?=(?:UTF-8''|")?([^\";]+)/)
    return safe_name(declared[1]) if declared

    base = safe_name(fallback.to_s.empty? ? "document" : fallback)
    ext = res["content-type"].to_s.downcase.include?("pdf") ? ".pdf" : ".bin"
    base.downcase.end_with?(ext) ? base : "#{base}#{ext}"
  end
  # Builds the list of candidate URLs to try for a document-list page.
  #
  # Starting from +url+ the candidates are expanded in this order:
  #   1. the URL exactly as given, plus (when it has a DANUM parameter) the
  #      same URL with its query string re-encoded, so that a DANUM such as
  #      "DA0324/2025" is tried both with a literal "/" and as "%2F";
  #   2. both spellings of the /PublicNotices/ path segment;
  #   3. with and without the /P1/ segment after /eProperty/;
  #   4. with the r/f route parameters added when the URL lacks either one.
  #
  # Returns an Array of unique URL strings; the URL as given is always first.
  # Raises URI::InvalidURIError when +url+ cannot be parsed.
  def variants_for_doc_list(url)
    uri = URI.parse(url)
    params = URI.decode_www_form(uri.query || "").to_h

    seeds = [uri.to_s]
    if params.key?("DANUM")
      encoded = uri.dup
      encoded.query = URI.encode_www_form(params)
      seeds << encoded.to_s
    end

    # path case variants (/PublicNotices/ and /Publicnotices/)
    cased = seeds.flat_map do |s|
      if s.include?("/PublicNotices/")
        [s, s.sub("/PublicNotices/", "/Publicnotices/")]
      elsif s.include?("/Publicnotices/")
        [s, s.sub("/Publicnotices/", "/PublicNotices/")]
      else
        [s]
      end
    end

    # variants without the /P1/ segment
    unprefixed = cased.flat_map do |s|
      s.include?("/eProperty/P1/") ? [s, s.sub("/eProperty/P1/", "/eProperty/")] : [s]
    end

    # add route params r & f (common ones for this site)
    routed = unprefixed.flat_map do |s|
      parsed = URI.parse(s)
      query = URI.decode_www_form(parsed.query || "").to_h
      next [s] if query.key?("r") && query.key?("f")

      with_route = begin
        copy = parsed.dup
        copy.query = URI.encode_www_form(
          query.merge("r" => "P1.LCC.WEBGUEST", "f" => "$P1.ESB.PUBNOT.VIW")
        )
        copy.to_s
      rescue URI::Error
        s # collapses into the plain variant via uniq below
      end
      [s, with_route]
    end

    routed.uniq
  end
  # Downloads +url+ into DOWNLOAD_DIR/launceston/<sanitised council_reference>/
  # and returns the path of the written file.
  #
  # url               - String URL of the document.
  # referer:          - referer passed to Http.request.
  # council_reference - application id; used (via safe_name) as directory name.
  # jar:              - cookie jar passed through to Http.request.
  #
  # The request carries HEADERS plus the current COOKIE_HDR; as in http_get, no
  # Cookie header is sent while COOKIE_HDR is empty. The file name comes from
  # filename_from_response, falling back to the last path segment of the URL.
  # The response status is not inspected: whatever body comes back is written.
  def download_doc(url, referer:, council_reference:, jar:)
    target_dir = File.join(DOWNLOAD_DIR, "launceston", safe_name(council_reference))
    FileUtils.mkdir_p(target_dir)

    request_headers = HEADERS.dup
    request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

    uri = URI.parse(url)
    res = Http.request(uri, headers: request_headers, jar: jar, referer: referer)
    merge_set_cookie!(res)

    path = File.join(target_dir, filename_from_response(res, File.basename(uri.path)))
    File.binwrite(path, res.body)
    path
  end
  # Probes a handful of conventional PDF file names for an application when the
  # document-list page yields no links.
  #
  # base_url: - site root the candidate URLs are built on (a trailing "/" is
  #             ignored); BASE_URL is used when this is nil or empty.
  # key:      - value of the KEY query parameter; becomes a path segment.
  # danum:    - application number, possibly URL-encoded ("DA0324%2F2025").
  # referer:  - referer sent with each probe (the doc-list page).
  #
  # Each candidate is requested with "Range: bytes=0-0" and counts as found
  # when the status is 200 or 206 and the content type mentions pdf. When
  # DOWNLOAD_ATTACHMENTS is set, found documents are also downloaded through
  # download_doc; a failed download is logged and leaves :local_url nil.
  #
  # Returns an Array of { name:, url:, local_url: } hashes (possibly empty).
  # A failing probe is logged and skipped; nothing is raised from here.
  def probe_common_docs(base_url:, key:, danum:, referer:)
    # danum may be URL-encoded; normalise first
    danum_raw = CGI.unescape(danum.to_s)
    # "DA0324/2025" -> "DA0324-2025"
    danum_slug = danum_raw.tr("/", "-")

    root = base_url.to_s.chomp("/")
    root = BASE_URL if root.empty?

    names = [
      "Advertised plans",
      "Advertised Plans",
      "Onsite Notice",
      "Onsite notice",
      "Onsite Notice " # trailing space variant seen on this site
    ]

    # URI.parse rejects bare spaces, so spaces in the file name are
    # percent-encoded (" " -> "%20").
    candidates = names.map do |label|
      encoded = "#{danum_slug} - #{label}.pdf".gsub(" ", "%20")
      "#{root}/eProperty/Publicnotices/#{key}/#{encoded}"
    end

    candidates.each_with_object([]) do |pdf_url, found|
      # Rebuilt per probe because COOKIE_HDR may change after every response.
      probe_headers = HEADERS.merge("Range" => "bytes=0-0")
      probe_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

      # Use the doclist page itself as referer (some installs care)
      res = Http.request(URI.parse(pdf_url), headers: probe_headers, jar: {}, referer: referer)
      merge_set_cookie!(res)

      code = begin
        (res.respond_to?(:code) ? res.code : res["status"]).to_i
      rescue StandardError
        200 # unreadable status is treated as success; the content type still gates
      end
      content_type = res["content-type"].to_s.downcase
      next unless [200, 206].include?(code) && content_type.include?("pdf")

      local_rel = nil
      if DOWNLOAD_ATTACHMENTS
        begin
          saved = download_doc(pdf_url, referer: referer, council_reference: danum_raw, jar: SESSION_JAR)
          local_rel = "/files/launceston/#{safe_name(danum_raw)}/#{File.basename(saved)}"
        rescue StandardError => e
          Log.warn "scraper", "DOC download failed (probe) for #{danum_raw} #{File.basename(pdf_url)}: #{e.class} #{e.message}"
        end
      end

      found << { name: File.basename(pdf_url), url: pdf_url, local_url: local_rel }
    rescue StandardError => e
      Log.warn "scraper", "[launcestoncity] probe failed for #{pdf_url}: #{e.class} #{e.message}"
    end
  end
  # ---- Main scrape ----------------------------------------------------------
  # 1. Fetch the public-notices list and upsert one row per application grid.
  # 2. For each application with a details link, read the details page, find
  #    its document-list link and collect the documents behind it.
  # 3. Upsert the enriched fields and hand the row to enrich_after_upsert!.
  html = http_get(URL, jar: SESSION_JAR)
  doc = Nokogiri::HTML(html)
  tables = doc.css("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid table.grid")
  kept = 0

  tables.each do |t|
    kv = kv_from_table(t)
    council_reference = kv["Application ID"].to_s.strip
    description = kv["Application Description"].to_s.strip
    address = kv["Property Address"].to_s.strip
    closing_raw = kv["Closing Date"].to_s.strip
    next if council_reference.empty? || address.empty?

    closing_date = Util.parse_aus_date(closing_raw)
    details_rel = t.at_css("a[href*='PublicNoticeDetails.aspx']")&.[]("href")
    info_url = absolute(URL, details_rel)

    # Base upsert (stores list-page fields; date_received comes from the
    # details page further down)
    DB.upsert(TABLE, {
      council_reference: council_reference,
      description: description,
      address: address,
      on_notice_to: closing_date,
      on_notice_to_raw: closing_raw,
      applicant: "",
      owner: ""
    })

    # Enrich from details page + collect documents
    if info_url
      begin
        d_html = http_get(info_url, referer: URL, jar: SESSION_JAR)
        d_doc = Nokogiri::HTML(d_html)

        # Flatten all key/value grids into a single map; the first non-blank
        # value seen for a label wins.
        details_kv = {}
        d_doc.css("#ctl00_Content_cusPageComponents_pnlPageComponents table.grid").each do |grid|
          details_kv.merge!(kv_from_table(grid)) { |_k, old, newv| old.to_s.strip.empty? ? newv : old }
        end

        # Only the applicant and received date are stored. The details page
        # also carries "Status", "Assigned Officer", "Group", "Category",
        # "Application Valid", "Advertised On" and "Property Legal Description".
        applicant_name = details_kv["Applicant Name(s)"].to_s.strip
        received_raw = details_kv["Application Received"].to_s.strip
        received_date = Util.parse_aus_date(received_raw)

        # ---- Document listing page (docget.asp -> PNDocumentList) ----
        doc_list_url = nil
        # primary selector
        if (docget = d_doc.at_css("a[href*='docget.asp']"))
          doc_list_url = absolute(info_url, docget["href"])
        end
        # fallback: some instances vary the link text or use different casing/paths
        if doc_list_url.nil?
          if (alt = d_doc.at_xpath("//a[contains(translate(text(),'CLICK','click'),'click') and contains(translate(text(),'DOCUMENT','document'),'document')]"))
            doc_list_url = absolute(info_url, alt["href"])
          end
        end

        documents = [] # [{name:, url:, local_url:}, ...]
        if doc_list_url
          begin
            anchors_added = 0
            used_url = nil
            probe_done = false  # ensure probe_common_docs fires at most once per DA
            dump_written = false # ensure the debug dump is written at most once per DA
            referers = [
              info_url,                # details page
              URL,                     # notices list page
              "#{BASE_URL}/eProperty/" # root
            ]
            variants = variants_for_doc_list(doc_list_url)

            # Try every URL variant with every referer until one yields documents.
            variants.each do |candidate_url|
              break if anchors_added > 0

              referers.each do |ref|
                break if anchors_added > 0

                begin
                  list_html = http_get(candidate_url, referer: ref, jar: SESSION_JAR)
                  list_doc = Nokogiri::HTML(list_html)

                  # Strict then fallback selectors (case-insensitive via XPath)
                  doc_anchors = list_doc.css("#PNDocumentList a")
                  if doc_anchors.empty?
                    doc_anchors = list_doc.xpath(
                      "//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
                      "//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
                    )
                  end

                  doc_anchors.each do |a|
                    name = a.text.strip
                    href = absolute(candidate_url, a["href"])
                    next if href.nil?

                    local_rel = nil
                    if DOWNLOAD_ATTACHMENTS
                      begin
                        saved = download_doc(href, referer: candidate_url, council_reference: council_reference, jar: SESSION_JAR)
                        local_rel = "/files/launceston/#{safe_name(council_reference)}/#{File.basename(saved)}"
                      rescue StandardError => e
                        Log.warn "scraper", "DOC download failed for #{council_reference} #{name}: #{e.class} #{e.message}"
                      end
                    end

                    documents << { name: (name.empty? ? File.basename(href) : name), url: href, local_url: local_rel }
                    anchors_added += 1
                  end

                  # Fallback: probe known file names directly (runs at most once per DA,
                  # on the first attempt that produced no links)
                  if anchors_added == 0 && !probe_done
                    probe_done = true
                    begin
                      u = URI.parse(doc_list_url)
                      q = URI.decode_www_form(u.query || "").to_h
                      key = q["KEY"]
                      danum = q["DANUM"] || council_reference
                      if key && danum
                        probed = probe_common_docs(
                          base_url: BASE_URL,
                          key: key,
                          danum: danum,
                          referer: doc_list_url
                        )
                        documents.concat(probed)
                        anchors_added = probed.size if probed.any?
                      end
                    rescue StandardError => e
                      Log.warn "scraper", "Probe fallback failed for #{council_reference}: #{e.class} #{e.message}"
                    end
                  end

                  if anchors_added > 0
                    used_url = candidate_url
                    puts "Docs list for #{council_reference}: #{candidate_url} (referer: #{ref})"
                    break
                  elsif !dump_written
                    # Save the first empty response body to inspect (once per app)
                    dump_written = true
                    begin
                      dump_dir = "/app/tmp/launceston_doclist_dumps"
                      FileUtils.mkdir_p(dump_dir)
                      File.write(File.join(dump_dir, "#{safe_name(council_reference)}.html"), list_html[0, 5000])
                    rescue StandardError => e
                      Log.warn "scraper", "Failed to write dump for #{council_reference}: #{e.class} #{e.message}"
                    end
                  end
                rescue StandardError => e
                  Log.warn "scraper", "Doc list fetch failed for #{council_reference} at #{candidate_url} (referer: #{ref}): #{e.class} #{e.message}"
                end
              end
            end

            if used_url.nil?
              Log.warn "scraper", "Docs page had no usable links for #{council_reference} after variants: #{variants.join(' | ')}"
            end
          rescue StandardError => e
            Log.warn "scraper", "Doc list fetch failed for #{council_reference}: #{e.class} #{e.message}"
          end
        end

        first_doc_url = documents.first&.dig(:url)
        first_local = documents.first&.dig(:local_url)
        puts "Docs list for #{council_reference}: #{doc_list_url}" if doc_list_url
        puts "Found #{documents.size} docs for #{council_reference}" if doc_list_url

        DB.upsert(TABLE, {
          council_reference: council_reference,
          address: address,
          applicant: applicant_name,
          date_received: received_date,
          date_received_raw: received_raw,
          document_url: first_doc_url,
          local_document_url: first_local,
          documents_json: documents.empty? ? nil : JSON.generate(documents)
        })
      rescue StandardError => e
        # Enrichment is best effort: the base row from the list page stays.
        Log.warn "scraper", "Enrich failed for #{council_reference}: #{e.class} #{e.message}"
      end
    end

    # info_url is deliberately not passed on here.
    enrich_after_upsert!(
      table: TABLE,
      council_reference: council_reference,
      address: address
    )

    puts "Upserted #{council_reference} | #{address} (closes #{closing_raw})"
    kept += 1
  end

  puts "Done #{TABLE}. Saved #{kept} item(s)."