| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487 |
- # launceston_eproperty.rb
- require "nokogiri"
- require "uri"
- require "fileutils"
- require "json"
- require "cgi"
- require_relative "../lib/http"
- require_relative "../lib/db"
- require_relative "../lib/util"
- require_relative "../lib/enrich"
# ---- Runtime configuration ----

# Table name handed to DB.ensure_table! / DB.upsert.
# ENV.fetch raises KeyError when TABLE_NAME is not set.
TABLE = ENV.fetch("TABLE_NAME")

# Scheme and host of the council's online-services site.
BASE_URL = "https://onlineservice.launceston.tas.gov.au"

# Public-notices listing page; can be overridden with EPROPERTY_URL.
URL = ENV.fetch(
  "EPROPERTY_URL",
  "#{BASE_URL}/eProperty/P1/PublicNotices/AllPublicNotices.aspx?r=P1.LCC.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
)

# Attachments are downloaded only when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# Shared cookie jar for the ASP.NET session across requests. Left mutable on
# purpose: it is passed to Http.request as the jar: argument.
SESSION_JAR = {}

# Browser-like request headers. Frozen because every user merges a copy
# (HEADERS.merge / Hash#merge) instead of mutating this hash.
HEADERS = {
  "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.9",
  "Accept-Encoding" => "identity"
}.freeze

DB.ensure_table!(TABLE)

# Current Cookie request header ("name=value; name2=value2").
# merge_set_cookie! replaces this constant whenever a response sets cookies.
COOKIE_HDR = ""
# Folds the cookies of a response's Set-Cookie header into COOKIE_HDR.
#
# Only the leading name=value pair of each cookie is kept; attributes such as
# Path, Expires or HttpOnly are dropped. A cookie that is already present is
# overwritten with the new value.
#
# @param res [#[]] response-like object; res["set-cookie"] may be nil, a
#   String (several cookies joined with commas) or an Array of Strings
# @return [String, nil] the rebuilt Cookie header, or nil when the response
#   carried no Set-Cookie header (COOKIE_HDR is then left untouched)
def merge_set_cookie!(res)
  set_cookie = res["set-cookie"]
  return if set_cookie.nil? || set_cookie.empty?

  # Split a comma-joined header only where the comma is followed by a new
  # "name=" pair, so the comma inside "Expires=Wed, 21 Oct ..." is preserved.
  raw_cookies = set_cookie.is_a?(Array) ? set_cookie : set_cookie.to_s.split(/,(?=[^;]+?=)/)

  # Current header -> { name => value }
  cookies = COOKIE_HDR.split(/;\s*/).each_with_object({}) do |pair, acc|
    name, value = pair.split("=", 2)
    acc[name] = value.to_s unless name.to_s.empty?
  end

  raw_cookies.each do |raw|
    # An empty entry yields no pair at all; the to_s calls keep that from raising.
    name, value = raw.to_s.split(";", 2).first.to_s.split("=", 2)
    name = name.to_s.strip
    next if name.empty?

    cookies[name] = value.to_s.strip
  end

  merged = cookies.map { |name, value| "#{name}=#{value}" }.join("; ")

  # Plain `COOKIE_HDR = ...` is a syntax error inside a method (dynamic
  # constant assignment), so the constant is removed and defined again.
  Object.send(:remove_const, :COOKIE_HDR) if Object.const_defined?(:COOKIE_HDR, false)
  Object.const_set(:COOKIE_HDR, merged)
end
# GETs +url+ with the shared browser-like HEADERS and the current COOKIE_HDR,
# following up to five redirects by hand so that cookies set along the way are
# captured and each hop is sent with the previous URL as its referer.
#
# @param url [String] absolute URL to fetch
# @param referer [String, nil] referer for the first request
# @param jar [Hash, nil] passed through to Http.request ({} when nil); cookies
#   are otherwise managed explicitly through COOKIE_HDR
# @param headers [Hash, nil] extra headers merged over HEADERS
# @return [Object] body of the final response (whatever res.body returns)
def http_get(url, referer: nil, jar: nil, headers: {})
  redirects_left = 5
  current_url = url

  loop do
    request_headers = HEADERS.merge(headers || {})
    request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

    res = Http.request(
      URI.parse(current_url),
      headers: request_headers,
      jar: jar || {},
      referer: referer
    )
    merge_set_cookie!(res) # capture any Set-Cookie before deciding what to do next

    # The response type comes from the project Http wrapper; if the status
    # cannot be read, treat the response as a plain 200.
    status = begin
      (res.respond_to?(:code) ? res.code : res["status"]).to_i
    rescue StandardError
      200
    end
    location = begin
      res["location"]
    rescue StandardError
      nil
    end

    is_redirect = status.between?(300, 399) && !location.to_s.empty?
    return res.body unless is_redirect

    if redirects_left <= 0
      warn "http_get: redirect limit reached at #{current_url}; returning last response body"
      return res.body
    end

    redirects_left -= 1
    referer = current_url
    current_url = URI.join(current_url, location).to_s
  end
end
# Turns a key/value grid into a Hash of label => value.
#
# For every <tr>, the label is the stripped text of its td.headerColumn cell
# and the value is the stripped text of the row's second <td>. Rows with no
# header cell, no second cell, or a blank label are skipped; a repeated label
# keeps the value from the last row that carries it.
#
# @param tbl [#css] table node (responds to css("tr"))
# @return [Hash{String => String}]
def kv_from_table(tbl)
  tbl.css("tr").each_with_object({}) do |row, pairs|
    label = row.at_css("td.headerColumn")&.text&.strip
    value = row.css("td")[1]&.text&.strip
    pairs[label] = value unless label.nil? || label.empty? || value.nil?
  end
end
# Resolves +href+ against +base+ and returns the absolute URL as a String.
#
# Surrounding whitespace is stripped and inner spaces are percent-encoded
# first, because URI.join rejects bare spaces and document links on this site
# can contain them (e.g. "DA0324-2025 - Advertised plans.pdf").
#
# @param base [String] absolute URL the link was found on
# @param href [String, nil] link target, relative or absolute
# @return [String, nil] absolute URL, or nil when href is blank or the two
#   cannot be combined into a valid URI
def absolute(base, href)
  target = href.to_s.strip
  return nil if target.empty?

  URI.join(base, target.gsub(" ", "%20")).to_s
rescue URI::Error
  # URI::Error covers InvalidURIError as well as BadURIError (both relative).
  nil
end
# Makes +s+ safe to use as a file or directory name: every run of characters
# other than word characters, "-" and "." collapses to a single underscore.
# nil becomes the empty string.
def safe_name(s)
  s.to_s.gsub(/[^\w\-.]+/, "_")
end
# Chooses a local file name for a downloaded response.
#
# Preference order:
# 1. the filename / filename* parameter of Content-Disposition (the extended
#    filename* form is percent-decoded before sanitising);
# 2. +fallback+ ("document" when nil or empty), with ".pdf" appended when the
#    Content-Type mentions pdf and ".bin" otherwise. The extension is not
#    appended a second time when the fallback already ends with it.
#
# @param res [#[]] response-like object exposing headers by lower-case name
# @param fallback [String, nil] name to use when the header gives none
# @return [String] sanitised file name (see safe_name)
def filename_from_response(res, fallback)
  disposition = res["content-disposition"].to_s
  if (match = disposition.match(/filename(\*)?=(?:UTF-8''|")?([^";]+)/i))
    raw = match[2].strip
    if match[1]
      raw = begin
        # scrub: decoded bytes are not guaranteed to be valid UTF-8
        URI.decode_www_form_component(raw).scrub("_")
      rescue ArgumentError
        raw # malformed %-escape: keep the value as sent
      end
    end
    from_header = safe_name(raw)
    return from_header unless from_header.empty?
  end

  base = safe_name(fallback.to_s.empty? ? "document" : fallback)
  ext = res["content-type"].to_s.downcase.include?("pdf") ? ".pdf" : ".bin"
  base.downcase.end_with?(ext) ? base : "#{base}#{ext}"
end
# Builds the list of URLs worth trying for an application's document-list
# page, most literal first.
#
# Variants are produced in four stages, each expanding the previous list:
# 1. the URL exactly as given, plus (when it has a DANUM parameter) a copy
#    whose query string is re-encoded with URI.encode_www_form, so a raw
#    "DA0324/2025" also appears as "DA0324%2F2025";
# 2. both spellings of the path segment: /PublicNotices/ and /Publicnotices/;
# 3. the same path with and without the /P1/ segment after /eProperty/;
# 4. a copy with the route parameters r and f added, unless both are present.
#
# @param url [String] document-list URL taken from the details page
# @return [Array<String>] de-duplicated candidate URLs, starting with +url+
# @raise [URI::InvalidURIError] when +url+ cannot be parsed
def variants_for_doc_list(url)
  uri = URI.parse(url)
  params = URI.decode_www_form(uri.query || "").to_h

  # Stage 1: raw URL first, then the canonically encoded query.
  seeds = [uri.to_s]
  if params.key?("DANUM")
    encoded = uri.dup
    encoded.query = URI.encode_www_form(params)
    seeds << encoded.to_s
  end

  # Stage 2: path case variants.
  cased = seeds.flat_map do |s|
    if s.include?("/PublicNotices/")
      [s, s.sub("/PublicNotices/", "/Publicnotices/")]
    elsif s.include?("/Publicnotices/")
      [s, s.sub("/Publicnotices/", "/PublicNotices/")]
    else
      [s]
    end
  end

  # Stage 3: with and without /P1/.
  unprefixed = cased.flat_map do |s|
    s.include?("/eProperty/P1/") ? [s, s.sub("/eProperty/P1/", "/eProperty/")] : [s]
  end

  # Stage 4: add the route parameters commonly used on this site.
  routed = unprefixed.flat_map do |s|
    candidate = URI.parse(s)
    query = URI.decode_www_form(candidate.query || "").to_h
    next [s] if query.key?("r") && query.key?("f")

    with_route = candidate.dup
    with_route.query = URI.encode_www_form(
      query.merge("r" => "P1.LCC.WEBGUEST", "f" => "$P1.ESB.PUBNOT.VIW")
    )
    [s, with_route.to_s]
  end

  routed.uniq
end
# Downloads one attachment and stores it under
# DOWNLOAD_DIR/launceston/<sanitised council reference>/.
#
# The request carries the same browser-like HEADERS as page fetches plus the
# current COOKIE_HDR (omitted while it is empty, matching http_get).
#
# @param url [String] absolute URL of the document
# @param referer [String, nil] referer to send
# @param council_reference [String] application id; sanitised for the
#   directory name
# @param jar [Hash] cookie jar passed through to Http.request
# @return [String] path of the file that was written
# @raise [RuntimeError] when the response status is 400 or above, so an error
#   page is never saved as if it were the document (callers rescue
#   StandardError and log)
def download_doc(url, referer:, council_reference:, jar:)
  dir = File.join(DOWNLOAD_DIR, "launceston", safe_name(council_reference))
  FileUtils.mkdir_p(dir)

  uri = URI.parse(url)
  request_headers = HEADERS.merge({})
  request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

  res = Http.request(uri, headers: request_headers, jar: jar, referer: referer)
  merge_set_cookie!(res)

  # A status that cannot be read comes out as 0 and is let through.
  status = (res.respond_to?(:code) ? res.code : res["status"]).to_i
  raise "HTTP #{status} for #{url}" if status >= 400

  path = File.join(dir, filename_from_response(res, File.basename(uri.path)))
  File.binwrite(path, res.body)
  path
end
# Last-resort document discovery: requests a handful of file names that this
# site is known to use and keeps the ones that answer as a PDF.
#
# Candidate URLs have the form
#   <base_url>/eProperty/Publicnotices/<key>/<DANUM with "/" -> "-"> - <name>.pdf
# and are probed with a one-byte Range request so the whole file is not
# transferred just to test for existence.
#
# @param base_url [String] scheme + host of the site
# @param key [String] KEY query parameter from the document-list URL
# @param danum [String] application number; may be URL-encoded
# @param referer [String, nil] referer to send (the doc-list page)
# @return [Array<Hash>] one {name:, url:, local_url:} per document found;
#   local_url is nil unless DOWNLOAD_ATTACHMENTS is on and the download worked
def probe_common_docs(base_url:, key:, danum:, referer:)
  # danum may be URL-encoded; normalise first
  danum_raw = CGI.unescape(danum.to_s)
  # "DA0324/2025" -> "DA0324-2025"
  danum_slug = danum_raw.tr("/", "-")

  names = [
    "Advertised plans",
    "Advertised Plans",
    "Onsite Notice",
    "Onsite notice",
    "Onsite Notice " # trailing space variant seen on this site
  ]

  # URI.parse rejects bare spaces, so the file name portion is percent-encoded.
  root = base_url.to_s.chomp("/")
  candidates = names.map do |n|
    encoded = "#{danum_slug} - #{n}.pdf".gsub(" ", "%20")
    "#{root}/eProperty/Publicnotices/#{key}/#{encoded}"
  end

  found = []
  candidates.each do |pdf_url|
    begin
      # Built per request: COOKIE_HDR can change after every response.
      probe_headers = HEADERS.merge("Range" => "bytes=0-0")
      probe_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

      # Use the doclist page itself as referer (some installs care)
      res = Http.request(URI.parse(pdf_url), headers: probe_headers, jar: {}, referer: referer)
      merge_set_cookie!(res)

      code = begin
        (res.respond_to?(:code) ? res.code : res["status"]).to_i
      rescue StandardError
        200
      end
      content_type = res["content-type"].to_s.downcase
      next unless [200, 206].include?(code) && content_type.include?("pdf")

      local_rel = nil
      if DOWNLOAD_ATTACHMENTS
        begin
          saved = download_doc(pdf_url, referer: referer, council_reference: danum_raw, jar: SESSION_JAR)
          local_rel = "/files/launceston/#{safe_name(danum_raw)}/#{File.basename(saved)}"
        rescue StandardError => e
          warn "DOC download failed (probe) for #{danum_raw} #{File.basename(pdf_url)}: #{e.class} #{e.message}"
        end
      end

      found << { name: File.basename(pdf_url), url: pdf_url, local_url: local_rel }
    rescue StandardError => e
      # Best effort: one failing candidate must not stop the remaining probes.
      warn "[launcestoncity] probe failed for #{pdf_url}: #{e.class} #{e.message}"
    end
  end
  found
end
# ---- Main scrape ----
# Per application: list-page row -> base upsert -> details page -> document
# list (tried through several URL/referer combinations) -> enriched upsert.
html = http_get(URL, jar: SESSION_JAR)
doc = Nokogiri::HTML(html)
tables = doc.css("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid table.grid")
kept = 0

tables.each do |t|
  kv = kv_from_table(t)
  council_reference = kv["Application ID"].to_s.strip
  description = kv["Application Description"].to_s.strip
  address = kv["Property Address"].to_s.strip
  closing_raw = kv["Closing Date"].to_s.strip
  closing_date = Util.parse_aus_date(closing_raw)
  details_rel = t.at_css("a[href*='PublicNoticeDetails.aspx']")&.[]("href")
  info_url = absolute(URL, details_rel)
  next if council_reference.empty? || address.empty?

  # Base upsert (stores list-page fields; date_received comes from details page later)
  # NOTE(review): this writes closing_date / closing_date_raw while the
  # enriched upsert below writes on_notice_to / on_notice_to_raw - confirm
  # both column pairs are intended.
  DB.upsert(TABLE, {
    council_reference: council_reference,
    description: description,
    address: address,
    closing_date: closing_date,
    closing_date_raw: closing_raw,
    info_url: info_url,
    applicant: "",
    owner: ""
  })

  # Enrich from details page + collect documents
  if info_url
    begin
      d_html = http_get(info_url, referer: URL, jar: SESSION_JAR)
      d_doc = Nokogiri::HTML(d_html)

      # Flatten all key/value grids into a single map; on a duplicate label the
      # first non-blank value wins.
      details_kv = {}
      d_doc.css("#ctl00_Content_cusPageComponents_pnlPageComponents table.grid").each do |grid|
        details_kv.merge!(kv_from_table(grid)) { |_k, old, newv| old.to_s.strip.empty? ? newv : old }
      end

      applicant_name = details_kv["Applicant Name(s)"].to_s.strip
      status_text = details_kv["Status"].to_s.strip
      assigned_off = details_kv["Assigned Officer"].to_s.strip
      group_text = details_kv["Group"].to_s.strip
      category_text = details_kv["Category"].to_s.strip
      received_raw = details_kv["Application Received"].to_s.strip
      valid_raw = details_kv["Application Valid"].to_s.strip
      advertised_raw = details_kv["Advertised On"].to_s.strip
      legal_desc = details_kv["Property Legal Description"].to_s.strip
      received_date = Util.parse_aus_date(received_raw)
      valid_date = Util.parse_aus_date(valid_raw)
      advertised_date = Util.parse_aus_date(advertised_raw)

      # ---- Document listing page (docget.asp -> PNDocumentList) ----
      doc_list_url = nil
      # primary selector
      if (docget = d_doc.at_css("a[href*='docget.asp']"))
        doc_list_url = absolute(info_url, docget["href"])
      end
      # fallback: some instances link text varies or use different casing/paths
      if doc_list_url.nil?
        if (alt = d_doc.at_xpath("//a[contains(translate(text(),'CLICK','click'),'click') and contains(translate(text(),'DOCUMENT','document'),'document')]"))
          doc_list_url = absolute(info_url, alt["href"])
        end
      end

      documents = [] # [{name:, url:, local_url:}, ...]
      if doc_list_url
        begin
          anchors_added = 0
          used_url = nil
          probe_done = false # ensure probe_common_docs fires at most once per DA
          dump_written = false # only the first empty response is dumped per DA
          referers = [
            info_url, # details page
            URL, # notices list page
            "#{BASE_URL}/eProperty/" # root
          ]

          # Try the URL exactly as linked first so its anchors are actually
          # used, then the generated variants. uniq keeps it from being
          # requested twice when the variants already contain it.
          candidates = ([doc_list_url] + variants_for_doc_list(doc_list_url)).uniq

          candidates.each do |candidate_url|
            break if anchors_added > 0

            referers.each do |ref|
              break if anchors_added > 0

              begin
                list_html = http_get(candidate_url, referer: ref, jar: SESSION_JAR)
                list_doc = Nokogiri::HTML(list_html)

                # Strict then fallback selectors
                doc_anchors = list_doc.css("#PNDocumentList a")
                if doc_anchors.empty?
                  doc_anchors = list_doc.xpath(
                    "//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
                    "//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
                  )
                end

                doc_anchors.each do |a|
                  name = a.text.strip
                  href = absolute(candidate_url, a["href"])
                  next if href.nil? || (name.empty? && href.to_s.strip.empty?)

                  local_rel = nil
                  if DOWNLOAD_ATTACHMENTS
                    begin
                      saved = download_doc(href, referer: candidate_url, council_reference: council_reference, jar: SESSION_JAR)
                      local_rel = "/files/launceston/#{safe_name(council_reference)}/#{File.basename(saved)}"
                    rescue StandardError => e
                      warn "DOC download failed for #{council_reference} #{name}: #{e.class} #{e.message}"
                    end
                  end
                  documents << { name: (name.empty? ? File.basename(href) : name), url: href, local_url: local_rel }
                  anchors_added += 1
                end

                # Final fallback: probe known filenames directly (runs at most once per DA)
                if anchors_added == 0 && !probe_done
                  probe_done = true
                  begin
                    u = URI.parse(doc_list_url)
                    q = URI.decode_www_form(u.query || "").to_h
                    key = q["KEY"]
                    danum = q["DANUM"] || council_reference
                    if key && danum
                      probed = probe_common_docs(
                        base_url: BASE_URL,
                        key: key,
                        danum: danum,
                        referer: doc_list_url
                      )
                      documents.concat(probed)
                      anchors_added = probed.size if probed.any?
                    end
                  rescue StandardError => e
                    warn "Probe fallback failed for #{council_reference}: #{e.class} #{e.message}"
                  end
                end

                if anchors_added > 0
                  used_url = candidate_url
                  puts "Docs list for #{council_reference}: #{candidate_url} (referer: #{ref})"
                  break
                elsif !dump_written
                  # Save the first empty response body to inspect (once per app)
                  dump_written = true
                  begin
                    dump_dir = "/app/tmp/launceston_doclist_dumps"
                    FileUtils.mkdir_p(dump_dir)
                    File.write(File.join(dump_dir, "#{safe_name(council_reference)}.html"), list_html[0, 5000])
                  rescue StandardError => e
                    warn "Failed to write dump for #{council_reference}: #{e.class} #{e.message}"
                  end
                end
              rescue StandardError => e
                warn "Doc list fetch failed for #{council_reference} at #{candidate_url} (referer: #{ref}): #{e.class} #{e.message}"
              end
            end
          end

          if used_url.nil?
            warn "Docs page had no usable links for #{council_reference} after variants: #{candidates.join(' | ')}"
          end
        rescue StandardError => e
          warn "Doc list fetch failed for #{council_reference}: #{e.class} #{e.message}"
        end
      end

      first_doc_url = documents.first&.dig(:url)
      first_local = documents.first&.dig(:local_url)

      puts "Docs list for #{council_reference}: #{doc_list_url}" if doc_list_url
      puts "Found #{documents.size} docs for #{council_reference}" if doc_list_url

      DB.upsert(TABLE, {
        # --- always include the base fields again ---
        council_reference: council_reference,
        description: description,
        address: address,
        info_url: info_url,
        on_notice_to: closing_date,
        on_notice_to_raw: closing_raw,
        # --- enrich fields from details page ---
        applicant: applicant_name,
        status: status_text,
        assigned_officer: assigned_off,
        group: group_text,
        category: category_text,
        date_received: received_date,
        date_received_raw: received_raw,
        application_valid: valid_date,
        application_valid_raw: valid_raw,
        advertised_on: advertised_date,
        advertised_on_raw: advertised_raw,
        property_legal_description: legal_desc,
        # --- documents ---
        pdf_url: first_doc_url, # first document found
        local_document_url: first_local,
        documents_json: JSON.generate(documents) # full set
      })
    rescue StandardError => e
      # Best effort: the base row is already stored, so a failed enrichment
      # is logged and the scrape moves on.
      warn "Enrich failed for #{council_reference}: #{e.class} #{e.message}"
    end
  end

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )
  puts "Upserted #{council_reference} | #{address} (closes #{closing_raw})"
  kept += 1
end

# "Found" is the number of grids on the list page; "saved" counts only the rows
# that had both an application id and an address.
puts "Done #{TABLE}. Found #{tables.size}, saved #{kept}."
|