# launceston_eproperty.rb
#
# Scrapes the Launceston eProperty "All Public Notices" listing, upserts each
# notice into TABLE, enriches it from its details page, and collects (and
# optionally downloads) the documents linked from the notice's document list.
#
# NOTE(review): Http, DB, Log, Util and enrich_after_upsert! are not defined in
# this file; presumably they come from the require_relative'd lib files or the
# surrounding runtime - confirm.
require "nokogiri"
require "uri"
require "fileutils"
require "json"
require "cgi"

require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"

TABLE = ENV.fetch("TABLE_NAME")
BASE_URL = "https://onlineservice.launceston.tas.gov.au"
URL = ENV.fetch(
  "EPROPERTY_URL",
  "#{BASE_URL}/eProperty/P1/PublicNotices/AllPublicNotices.aspx?r=P1.LCC.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
)
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
SESSION_JAR = {} # shared cookie jar for ASP.NET session across requests

HEADERS = {
  "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.9",
  "Accept-Encoding" => "identity"
}.freeze

DB.ensure_table!(TABLE)

# Current "Cookie" request header value, e.g. "ASP.NET_SessionId=xyz".
# Deliberately a mutable String: merge_set_cookie! updates it in place, so the
# constant binding itself is never reassigned.
COOKIE_HDR = +""

# Builds the request headers for one call: the browser-like HEADERS, then any
# per-call extras, then the current Cookie header (omitted while it is empty).
def request_headers(extra = {})
  merged = HEADERS.merge(extra || {})
  # dup so a later in-place cookie update cannot alter headers already handed out
  merged["Cookie"] = COOKIE_HDR.dup unless COOKIE_HDR.empty?
  merged
end

# Returns the response status as an Integer. Uses res.code when the response
# has it, otherwise res["status"]; falls back to 200 if reading it raises.
def response_status(res)
  (res.respond_to?(:code) ? res.code : res["status"]).to_i
rescue StandardError
  200
end

# Folds any Set-Cookie value(s) on +res+ into COOKIE_HDR.
# Only "name=value" is kept per cookie; attributes (Path, HttpOnly, ...) are dropped.
# Returns nil when the response carries no Set-Cookie, else the updated header.
def merge_set_cookie!(res)
  sc = res["set-cookie"]
  return if sc.nil? || sc.empty?

  # Normalise to an array of cookie strings. The lookahead only splits on a
  # comma that is followed by "name=" before the next ";", so commas inside
  # an Expires date do not split a cookie.
  parts = sc.is_a?(Array) ? sc : sc.to_s.split(/,(?=[^;]+?=)/)

  # Current cookie header -> {name => value}; segments without "=" are skipped.
  jar = COOKIE_HDR.split(/;\s*/).each_with_object({}) do |pair, acc|
    name, val = pair.split("=", 2)
    acc[name] = val unless name.nil? || val.nil?
  end

  parts.each do |raw|
    name, val = raw.split(";", 2).first.to_s.split("=", 2)
    next if name.to_s.strip.empty?

    jar[name.strip] = val.to_s
  end

  # Rebuild the Cookie header (just name=value; name2=value2).
  COOKIE_HDR.replace(jar.map { |k, v| "#{k}=#{v}" }.join("; "))
end

# GETs +url+ and returns the response body, following up to 5 redirects.
# Cookies from every hop are captured through merge_set_cookie!.
#
# url     - String URL to fetch.
# referer - passed to Http.request; on a redirect it becomes the previous URL.
# jar     - passed through to Http.request ({} when nil); cookies are handled
#           explicitly via COOKIE_HDR, so this is effectively a no-op here.
# headers - extra request headers merged over HEADERS.
#
# Raises whatever URI.parse / Http.request raise.
def http_get(url, referer: nil, jar: nil, headers: {})
  redirects_left = 5
  current_url = url

  loop do
    res = Http.request(
      URI.parse(current_url),
      headers: request_headers(headers),
      jar: (jar || {}),
      referer: referer
    )
    merge_set_cookie!(res)

    status = response_status(res)
    loc =
      begin
        res["location"]
      rescue StandardError
        nil
      end

    # Once the redirect budget is exhausted the redirect response's own body is returned.
    return res.body unless status.between?(300, 399) && loc && (redirects_left -= 1) >= 0

    referer = current_url
    current_url = URI.join(current_url, loc).to_s
  end
end

# Reads a two-column key/value grid into a Hash: the key is the text of the
# row's td.headerColumn cell, the value is the text of the row's second <td>.
# Rows missing either cell, or with an empty key, are skipped.
def kv_from_table(tbl)
  tbl.css("tr").each_with_object({}) do |tr, out|
    k = tr.at_css("td.headerColumn")&.text&.strip
    v = tr.css("td")[1]&.text&.strip
    next if k.nil? || v.nil? || k.empty?

    out[k] = v
  end
end

# Resolves +href+ against +base+. Returns nil for an empty href or when the
# join raises URI::InvalidURIError.
def absolute(base, href)
  return nil if href.to_s.empty?

  URI.join(base, href).to_s
rescue URI::InvalidURIError
  nil
end

# Replaces every run of characters other than word characters, "-" and "."
# with a single "_".
def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")

# Picks a local file name for a downloaded response: the Content-Disposition
# filename if present, otherwise +fallback+ (or "document") plus ".pdf" when
# the content type mentions pdf, else ".bin". The result goes through safe_name.
def filename_from_response(res, fallback)
  cd = res["content-disposition"].to_s
  match = cd.match(/filename\*?=(?:UTF-8''|")?([^\";]+)/)
  return safe_name(match[1]) if match

  base = safe_name(fallback || "document")
  ext = res["content-type"].to_s.downcase.include?("pdf") ? ".pdf" : ".bin"
  "#{base}#{ext}"
end

# Returns the list of URL variants to try for a document-list link, in order:
#   1. the URL exactly as given,
#   2. the same URL with its query re-encoded (so "/" in DANUM becomes %2F),
#   3. when DANUM contains "/", a form that keeps that slash literal,
# each then expanded with /PublicNotices/ <-> /Publicnotices/ path casing,
# with and without the /P1/ path segment, and with the r/f route parameters
# added when either is missing. Duplicates are removed.
#
# Raises URI::InvalidURIError if +url+ cannot be parsed.
def variants_for_doc_list(url)
  u = URI.parse(url)
  q = URI.decode_www_form(u.query || "").to_h
  danum = q["DANUM"]

  seeds = [url.to_s]
  if danum.nil?
    seeds << u.to_s
  else
    # Encoded form (encode_www_form percent-encodes the slash once).
    u_enc = u.dup
    u_enc.query = URI.encode_www_form(q)
    seeds << u_enc.to_s

    # Raw form: identical query, but the slash in DANUM stays literal.
    if danum.include?("/")
      u_raw = u.dup
      u_raw.query = q.map do |k, v|
        ev = URI.encode_www_form_component(v)
        ev = ev.gsub("%2F", "/") if k == "DANUM"
        "#{URI.encode_www_form_component(k)}=#{ev}"
      end.join("&")
      seeds << u_raw.to_s
    end
  end

  # Path case variants (/PublicNotices/ and /Publicnotices/).
  cased = seeds.flat_map do |s|
    if s.include?("/PublicNotices/")
      [s, s.sub("/PublicNotices/", "/Publicnotices/")]
    elsif s.include?("/Publicnotices/")
      [s, s.sub("/Publicnotices/", "/PublicNotices/")]
    else
      [s]
    end
  end

  # Variants with the /P1/ segment removed.
  without_p1 = cased.flat_map do |s|
    s.include?("/eProperty/P1/") ? [s, s.sub("/eProperty/P1/", "/eProperty/")] : [s]
  end

  # Add route params r & f (common ones for this site) when not both present.
  with_routes = without_p1.flat_map do |s|
    uri = URI.parse(s)
    params = URI.decode_www_form(uri.query || "").to_h
    next [s] if params.key?("r") && params.key?("f")

    routed =
      begin
        uri2 = uri.dup
        uri2.query = URI.encode_www_form(params.merge(
          "r" => "P1.LCC.WEBGUEST",
          "f" => "$P1.ESB.PUBNOT.VIW"
        ))
        uri2.to_s
      rescue URI::InvalidURIError
        s
      end
    [s, routed]
  end

  with_routes.uniq
end

# Downloads +url+ into DOWNLOAD_DIR/launceston/<safe council_reference>/ and
# returns the path written. The directory is created if needed. The response
# status is not inspected; whatever body comes back is written.
#
# jar - passed through to Http.request (callers pass the shared SESSION_JAR).
#
# Raises on request or file-system errors; callers are expected to rescue.
def download_doc(url, referer:, council_reference:, jar:)
  dir = File.join(DOWNLOAD_DIR, "launceston", safe_name(council_reference))
  FileUtils.mkdir_p(dir)

  # Same browser-like headers (and cookies) as page fetches.
  res = Http.request(URI.parse(url), headers: request_headers, jar: jar, referer: referer)
  merge_set_cookie!(res)

  fname = filename_from_response(res, File.basename(URI.parse(url).path))
  path = File.join(dir, fname)
  File.binwrite(path, res.body)
  path
end

# Downloads one attachment when DOWNLOAD_ATTACHMENTS is on and returns its
# "/files/launceston/..." relative URL. Returns nil when downloads are off or
# the download fails (the failure is logged, not raised).
#
# label - what to call the document in the warning message.
# tag   - optional suffix after "DOC download failed" in the warning message.
def fetch_local_copy(url, referer:, council_reference:, label:, tag: "")
  return nil unless DOWNLOAD_ATTACHMENTS

  saved = download_doc(url, referer: referer, council_reference: council_reference, jar: SESSION_JAR)
  "/files/launceston/#{safe_name(council_reference)}/#{File.basename(saved)}"
rescue StandardError => e
  Log.warn "scraper", "DOC download failed#{tag} for #{council_reference} #{label}: #{e.class} #{e.message}"
  nil
end

# Probes a fixed set of well-known PDF names under
# <base_url>/eProperty/Publicnotices/<key>/ using a one-byte Range request.
# Returns an Array of {name:, url:, local_url:} for every candidate that
# answers 200/206 with a PDF content type (empty when none do).
#
# base_url - site root the candidate URLs are built on.
# key      - KEY query value from the document-list link.
# danum    - application number; may be URL-encoded.
# referer  - Referer sent with each probe.
def probe_common_docs(base_url:, key:, danum:, referer:)
  # danum may be URL-encoded; normalise first
  danum_raw = CGI.unescape(danum.to_s)
  # "DA0324/2025" -> "DA0324-2025"
  danum_slug = danum_raw.gsub("/", "-")

  names = [
    "Advertised plans",
    "Advertised Plans",
    "Onsite Notice",
    "Onsite notice",
    "Onsite Notice " # trailing space variant seen on this site
  ]

  # URI.parse rejects bare spaces, so spaces in the filename become %20.
  candidates = names.map do |n|
    encoded = "#{danum_slug} - #{n}.pdf".gsub(" ", "%20")
    "#{base_url}/eProperty/Publicnotices/#{key}/#{encoded}"
  end

  candidates.each_with_object([]) do |pdf_url, found|
    # Use the doclist page itself as referer (some installs care).
    res = Http.request(
      URI.parse(pdf_url),
      headers: request_headers("Range" => "bytes=0-0"),
      jar: {},
      referer: referer
    )
    merge_set_cookie!(res)

    code = response_status(res)
    ct = res["content-type"].to_s.downcase
    next unless (code == 200 || code == 206) && ct.include?("pdf")

    local_rel = fetch_local_copy(
      pdf_url,
      referer: referer,
      council_reference: danum_raw,
      label: File.basename(pdf_url),
      tag: " (probe)"
    )
    found << { name: File.basename(pdf_url), url: pdf_url, local_url: local_rel }
  rescue StandardError => e
    Log.warn "scraper", "[launcestoncity] probe failed for #{pdf_url}: #{e.class} #{e.message}"
  end
end

# Finds document anchors on a parsed document-list page: first the strict
# "#PNDocumentList a" selector, then a case-insensitive fallback (any <ul>
# whose id contains "documentlist", or any link whose href contains ".pdf").
def find_doc_anchors(list_doc)
  anchors = list_doc.css("#PNDocumentList a")
  return anchors unless anchors.empty?

  list_doc.xpath(
    "//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
    "//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
  )
end

# Last-resort lookup: takes KEY/DANUM from the document-list URL (DANUM falls
# back to the council reference) and probes the well-known file names.
# Returns the documents found, or [] when KEY is missing or anything raises.
def probe_from_doc_list(doc_list_url, council_reference)
  q = URI.decode_www_form(URI.parse(doc_list_url).query || "").to_h
  key = q["KEY"]
  danum = q["DANUM"] || council_reference
  return [] unless key && danum

  probe_common_docs(base_url: BASE_URL, key: key, danum: danum, referer: doc_list_url)
rescue StandardError => e
  Log.warn "scraper", "Probe fallback failed for #{council_reference}: #{e.class} #{e.message}"
  []
end

# Writes the first 5000 characters of an empty document-list response to
# /app/tmp/launceston_doclist_dumps/<safe council_reference>.html for inspection.
def dump_doc_list(council_reference, list_html)
  dump_dir = "/app/tmp/launceston_doclist_dumps"
  FileUtils.mkdir_p(dump_dir)
  File.write(File.join(dump_dir, "#{safe_name(council_reference)}.html"), list_html.to_s[0, 5000])
rescue StandardError => e
  Log.warn "scraper", "Failed to write dump for #{council_reference}: #{e.class} #{e.message}"
end

# Collects the documents for one application from its document-list link.
# Tries every URL variant with every referer until one page yields links; the
# first page that loads but has no links also triggers a one-off probe of the
# well-known file names. Never raises: failures are logged and [] (or whatever
# was collected so far) is returned.
#
# Returns an Array of {name:, url:, local_url:}.
def collect_documents(doc_list_url, info_url:, council_reference:)
  documents = []
  candidates = variants_for_doc_list(doc_list_url)
  referers = [
    info_url,                  # details page
    URL,                       # notices list page
    "#{BASE_URL}/eProperty/"   # root
  ]
  used_url = nil
  probe_done = false # ensure the probe fires at most once per DA
  dumped = false     # keep only the first empty response body per DA

  candidates.each do |candidate_url|
    break unless documents.empty?

    referers.each do |ref|
      break unless documents.empty?

      begin
        list_html = http_get(candidate_url, referer: ref, jar: SESSION_JAR)
        list_doc = Nokogiri::HTML(list_html)

        find_doc_anchors(list_doc).each do |a|
          name = a.text.strip
          href = absolute(candidate_url, a["href"])
          next if href.nil?

          local_rel = fetch_local_copy(
            href,
            referer: candidate_url,
            council_reference: council_reference,
            label: name
          )
          documents << { name: (name.empty? ? File.basename(href) : name), url: href, local_url: local_rel }
        end

        # Final fallback: probe known filenames directly (runs at most once per DA).
        if documents.empty? && !probe_done
          probe_done = true
          documents.concat(probe_from_doc_list(doc_list_url, council_reference))
        end

        if documents.empty?
          unless dumped
            dumped = true
            dump_doc_list(council_reference, list_html)
          end
        else
          used_url = candidate_url
          puts "Docs list for #{council_reference}: #{candidate_url} (referer: #{ref})"
        end
      rescue StandardError => e
        Log.warn "scraper", "Doc list fetch failed for #{council_reference} at #{candidate_url} (referer: #{ref}): #{e.class} #{e.message}"
      end
    end
  end

  if used_url.nil?
    Log.warn "scraper", "Docs page had no usable links for #{council_reference} after variants: #{candidates.join(' | ')}"
  end

  documents
rescue StandardError => e
  Log.warn "scraper", "Doc list fetch failed for #{council_reference}: #{e.class} #{e.message}"
  documents || []
end

# ---- Main: list page -> per-notice upsert -> details enrichment ----

html = http_get(URL, jar: SESSION_JAR)
doc = Nokogiri::HTML(html)

tables = doc.css("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid table.grid")

kept = 0
tables.each do |t|
  kv = kv_from_table(t)

  council_reference = kv["Application ID"].to_s.strip
  description       = kv["Application Description"].to_s.strip
  address           = kv["Property Address"].to_s.strip
  closing_raw       = kv["Closing Date"].to_s.strip
  closing_date      = Util.parse_aus_date(closing_raw)

  details_rel = t.at_css("a[href*='PublicNoticeDetails.aspx']")&.[]("href")
  info_url    = absolute(URL, details_rel)

  next if council_reference.empty? || address.empty?

  # Base upsert (stores list-page fields; date_received comes from details page later)
  DB.upsert(TABLE, {
    council_reference: council_reference,
    description: description,
    address: address,
    on_notice_to: closing_date,
    on_notice_to_raw: closing_raw,
    applicant: "",
    owner: ""
  })

  # Enrich from details page + collect documents
  if info_url
    begin
      d_html = http_get(info_url, referer: URL, jar: SESSION_JAR)
      d_doc  = Nokogiri::HTML(d_html)

      # Flatten all key/value grids into a single map; on a duplicate key the
      # first non-blank value wins.
      details_kv = {}
      d_doc.css("#ctl00_Content_cusPageComponents_pnlPageComponents table.grid").each do |grid|
        details_kv.merge!(kv_from_table(grid)) { |_k, old, newv| old.to_s.strip.empty? ? newv : old }
      end

      applicant_name = details_kv["Applicant Name(s)"].to_s.strip
      status_text    = details_kv["Status"].to_s.strip
      assigned_off   = details_kv["Assigned Officer"].to_s.strip
      group_text     = details_kv["Group"].to_s.strip
      category_text  = details_kv["Category"].to_s.strip
      received_raw   = details_kv["Application Received"].to_s.strip
      valid_raw      = details_kv["Application Valid"].to_s.strip
      advertised_raw = details_kv["Advertised On"].to_s.strip
      legal_desc     = details_kv["Property Legal Description"].to_s.strip

      received_date   = Util.parse_aus_date(received_raw)
      valid_date      = Util.parse_aus_date(valid_raw)
      advertised_date = Util.parse_aus_date(advertised_raw)

      # ---- Document listing page (docget.asp -> PNDocumentList) ----
      doc_list_url = nil

      # primary selector
      if (docget = d_doc.at_css("a[href*='docget.asp']"))
        doc_list_url = absolute(info_url, docget["href"])
      end

      # fallback: any link whose text contains both "click" and "document"
      if doc_list_url.nil?
        if (alt = d_doc.at_xpath("//a[contains(translate(text(),'CLICK','click'),'click') and contains(translate(text(),'DOCUMENT','document'),'document')]"))
          doc_list_url = absolute(info_url, alt["href"])
        end
      end

      # [{name:, url:, local_url:}, ...]
      documents =
        if doc_list_url
          collect_documents(doc_list_url, info_url: info_url, council_reference: council_reference)
        else
          []
        end

      first_doc_url = documents.first&.dig(:url)
      first_local   = documents.first&.dig(:local_url)

      puts "Docs list for #{council_reference}: #{doc_list_url}" if doc_list_url
      puts "Found #{documents.size} docs for #{council_reference}" if doc_list_url

      DB.upsert(TABLE, {
        council_reference: council_reference,
        address: address,
        applicant: applicant_name,
        date_received: received_date,
        date_received_raw: received_raw,
        document_url: first_doc_url,
        local_document_url: first_local,
        documents_json: documents.empty? ? nil : JSON.generate(documents),
        status: status_text,
        assigned_officer: assigned_off,
        group: group_text,
        category: category_text,
        application_valid: valid_date,
        application_valid_raw: valid_raw,
        advertised_on: advertised_date,
        advertised_on_raw: advertised_raw,
        property_legal_description: legal_desc
      })
    rescue StandardError => e
      Log.warn "scraper", "Enrich failed for #{council_reference}: #{e.class} #{e.message}"
    end
  end

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
    # info_url: info_url
  )

  puts "Upserted #{council_reference} | #{address} (closes #{closing_raw})"
  kept += 1
end

puts "Done #{TABLE}. Saved #{kept} item(s)."