benjamin.harris
/
tas_councils


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
							# launceston_eproperty.rb
require "nokogiri"
require "uri"
require "fileutils"
require "json"
require "cgi"

require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"
# ---- Configuration ----

# Destination table name; ENV.fetch raises KeyError when TABLE_NAME is unset.
TABLE = ENV.fetch("TABLE_NAME")

# Scheme + host of the council's online services site (no trailing slash).
BASE_URL = "https://onlineservice.launceston.tas.gov.au"

# Public-notices list page; EPROPERTY_URL overrides the built-in default.
URL = ENV.fetch("EPROPERTY_URL") do
  "#{BASE_URL}/eProperty/P1/PublicNotices/AllPublicNotices.aspx?r=P1.LCC.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
end

# Attachments are downloaded only when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = (ENV["DOWNLOAD_ATTACHMENTS"] == "1")

# Root directory for saved attachments.
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# Jar object handed to Http.request on every call that shares the session.
SESSION_JAR = {}

# Current Cookie request header ("name=value; name2=value2").
# merge_set_cookie! replaces this constant after each response.
COOKIE_HDR = ""

# Browser-like request headers sent with every request.
HEADERS = {
  "User-Agent"      => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
  "Accept"          => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.9",
  "Accept-Encoding" => "identity"
}

DB.ensure_table!(TABLE)

# Folds the Set-Cookie header(s) of +res+ into the global COOKIE_HDR constant.
#
# +res+ only needs to answer res["set-cookie"] with nil, a String (several
# cookies may be joined with commas) or an Array of cookie strings. Only the
# leading "name=value" pair of each cookie is kept; attributes such as Path
# or Expires are discarded. A cookie that is set again replaces the old value.
#
# Returns the new Cookie header string, or nil when the response set nothing.
def merge_set_cookie!(res)
  sc = res["set-cookie"]
  return if sc.nil? || sc.empty?

  # Split a comma-joined header only where a new "name=" token starts right
  # after the comma. The comma inside "Expires=Wed, 21 Oct 2015 ..." is
  # followed by "21 Oct", which is not such a token, so dates stay intact.
  parts = sc.is_a?(Array) ? sc : sc.to_s.split(/,\s*(?=[^;,=\s]+=)/)

  # Current cookie header -> {name => value}
  cur = COOKIE_HDR.split(/;\s*/).each_with_object({}) do |pair, acc|
    name, val = pair.split("=", 2)
    next if name.to_s.strip.empty?
    acc[name.strip] = val.to_s
  end

  parts.each do |raw|
    kv = raw.to_s.split(";", 2).first.to_s
    # A fragment without "=" is not a cookie (e.g. a stray date piece).
    next unless kv.include?("=")

    name, val = kv.split("=", 2)
    name = name.strip
    next if name.empty?
    cur[name] = val.to_s
  end

  # Rebuild the Cookie header (just name=value; name2=value2) and swap the
  # constant; removing it first avoids the "already initialized" warning.
  merged = cur.map { |k, v| "#{k}=#{v}" }.join("; ")
  Object.send(:remove_const, :COOKIE_HDR) if Object.const_defined?(:COOKIE_HDR, false)
  Object.const_set(:COOKIE_HDR, merged)
end


# GETs +url+ and returns the body of the final response.
#
# Every request carries HEADERS merged with +headers+, plus the current
# COOKIE_HDR when it is non-empty. Redirects (3xx with a Location header) are
# followed by hand, at most five times, so that Set-Cookie headers on the
# intermediate responses are captured via merge_set_cookie!. Each hop uses the
# previous URL as its referer. Once the redirect budget is spent, the body of
# the last (redirect) response is returned as-is.
#
# referer: - Referer for the first request (optional)
# jar:     - jar object passed through to Http.request ({} when nil)
# headers: - extra request headers, merged over HEADERS
def http_get(url, referer: nil, jar: nil, headers: {})
  redirects_left = 5
  current_url = url

  loop do
    request_headers = HEADERS.merge(headers || {})
    request_headers["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?

    res = Http.request(
      URI.parse(current_url),
      headers: request_headers,
      jar: jar || {}, # cookies are managed explicitly through COOKIE_HDR
      referer: referer
    )
    merge_set_cookie!(res) # capture any Set-Cookie before deciding what to do

    # If the status cannot be read, treat the response as a plain 200.
    status =
      begin
        (res.respond_to?(:code) ? res.code : res["status"]).to_i
      rescue StandardError
        200
      end
    location =
      begin
        res["location"]
      rescue StandardError
        nil
      end

    is_redirect = status.between?(300, 399) && location
    return res.body unless is_redirect && redirects_left.positive?

    redirects_left -= 1
    referer = current_url
    current_url = URI.join(current_url, location).to_s
  end
end


# Turns a two-column key/value grid into a Hash.
#
# For every <tr> in +tbl+ the key is the stripped text of the
# td.headerColumn cell and the value is the stripped text of the row's second
# <td>. Rows with no header cell, an empty key, or no second cell are skipped.
# When a key repeats, the later row wins.
def kv_from_table(tbl)
  tbl.css("tr").each_with_object({}) do |row, pairs|
    label = row.at_css("td.headerColumn")&.text&.strip
    value = row.css("td")[1]&.text&.strip
    pairs[label] = value unless label.nil? || value.nil? || label.empty?
  end
end

# Resolves +href+ against +base+ and returns the absolute URL as a String.
#
# Returns nil when +href+ is nil/blank or when the pair cannot be joined
# (malformed href, or a relative +base+). Surrounding whitespace in +href+ is
# ignored, as browsers do for href attributes.
def absolute(base, href)
  target = href.to_s.strip
  return nil if target.empty?

  URI.join(base, target).to_s
rescue URI::Error
  # URI::Error covers both InvalidURIError (unparseable input) and
  # BadURIError (raised by URI.join when both parts are relative).
  nil
end

# Makes +s+ usable as a single file-system path component: every run of
# characters other than word characters, "-" and "." becomes one "_".
# nil is treated as an empty string.
def safe_name(s)
  s.to_s.gsub(/[^\w\-.]+/, "_")
end

# Picks a local file name for a downloaded response.
#
# +res+ only needs to answer res["content-disposition"] and
# res["content-type"]. Order of preference:
#   1. the filename / filename* parameter of Content-Disposition;
#   2. +fallback+ ("document" when nil or empty), with ".pdf" appended when
#      the content type mentions pdf and ".bin" otherwise.
# The extension is not appended when the name already ends with it, and
# ".bin" is not appended to a name that already carries some extension, so
# "plans.pdf" stays "plans.pdf" rather than becoming "plans.pdf.pdf".
#
# The result is always passed through safe_name.
def filename_from_response(res, fallback)
  disposition = res["content-disposition"].to_s
  # Header parameter names are case-insensitive, hence the /i flag.
  if (m = disposition.match(/filename\*?=(?:UTF-8''|")?([^\";]+)/i))
    return safe_name(m[1])
  end

  base = safe_name(fallback)
  base = "document" if base.empty?

  ext = res["content-type"].to_s.downcase.include?("pdf") ? ".pdf" : ".bin"
  return base if base.downcase.end_with?(ext)
  return base if ext == ".bin" && !File.extname(base).empty?

  "#{base}#{ext}"
end

# Builds the list of alternative URLs to try for a document-list page.
#
# Starting from +url+, the result contains (duplicates removed, +url+ first):
#   * the URL exactly as given (raw query string);
#   * when the query has a DANUM parameter, the same URL with the query
#     re-encoded by URI.encode_www_form (so "/" in DANUM becomes "%2F");
#   * for each of those, the other spelling of the /PublicNotices/ path
#     segment (/Publicnotices/ and vice versa);
#   * for each of those, the path without the /P1/ segment after /eProperty/;
#   * for each of those lacking an "r" or "f" query parameter, a copy with
#     the site's route parameters added. These copies are rebuilt with
#     URI.encode_www_form, so their query is always in encoded form.
#
# Returns an Array of URL strings. Raises URI::InvalidURIError when +url+
# itself cannot be parsed.
def variants_for_doc_list(url)
  u = URI.parse(url)
  q = URI.decode_www_form(u.query || "").to_h

  # Seed with the URL as given, then add the re-encoded form if DANUM exists.
  seeds = [u.to_s]
  if q.key?("DANUM")
    encoded = u.dup
    encoded.query = URI.encode_www_form(q)
    seeds << encoded.to_s
  end

  # Path case variants (/PublicNotices/ and /Publicnotices/)
  cased = seeds.flat_map do |s|
    if s.include?("/PublicNotices/")
      [s, s.sub("/PublicNotices/", "/Publicnotices/")]
    elsif s.include?("/Publicnotices/")
      [s, s.sub("/Publicnotices/", "/PublicNotices/")]
    else
      [s]
    end
  end

  # Variants with the /P1/ segment removed
  unprefixed = cased.flat_map do |s|
    s.include?("/eProperty/P1/") ? [s, s.sub("/eProperty/P1/", "/eProperty/")] : [s]
  end

  # Add route params r & f (common ones for this site) where they are missing
  routed = unprefixed.flat_map do |s|
    uri = URI.parse(s)
    params = URI.decode_www_form(uri.query || "").to_h
    next [s] if params.key?("r") && params.key?("f")

    begin
      with_route = uri.dup
      with_route.query = URI.encode_www_form(
        params.merge("r" => "P1.LCC.WEBGUEST", "f" => "$P1.ESB.PUBNOT.VIW")
      )
      [s, with_route.to_s]
    rescue URI::Error
      [s]
    end
  end

  routed.uniq
end

# ---- update download_doc to accept the shared jar ----
# Downloads +url+ and stores it under
# DOWNLOAD_DIR/launceston/<safe council_reference>/<file name>.
#
# The request uses the same browser-like HEADERS as page fetches, plus the
# current COOKIE_HDR when it is non-empty. Redirects are not followed here.
#
# referer:           - Referer header value for the request
# council_reference: - application id; sanitised with safe_name for the dir
# jar:               - jar object passed through to Http.request
#
# Returns the path of the written file.
# Raises RuntimeError when the response status is 300 or above, or when the
# body is empty, so that an error page is never saved as a document. Both
# callers in this file rescue StandardError and log a warning.
def download_doc(url, referer:, council_reference:, jar:)
  request_headers = HEADERS.merge(COOKIE_HDR.empty? ? {} : { "Cookie" => COOKIE_HDR })
  res = Http.request(URI.parse(url), headers: request_headers, jar: jar, referer: referer)
  merge_set_cookie!(res)

  # A status that cannot be read yields 0 here and is let through.
  status = (res.respond_to?(:code) ? res.code : res["status"]).to_i
  raise "HTTP #{status} while downloading #{url}" if status >= 300

  bytes = res.body
  raise "Empty body while downloading #{url}" if bytes.to_s.empty?

  dir = File.join(DOWNLOAD_DIR, "launceston", safe_name(council_reference))
  FileUtils.mkdir_p(dir)

  fname = filename_from_response(res, File.basename(URI.parse(url).path))
  path  = File.join(dir, fname)
  File.binwrite(path, bytes)
  path
end

# Last-resort document discovery: requests a handful of well-known PDF file
# names directly and reports the ones that exist.
#
# Candidate URLs have the form
#   <base_url>/eProperty/Publicnotices/<key>/<DANUM with "/" -> "-"> - <name>.pdf
# with spaces percent-encoded. Each candidate is requested with
# "Range: bytes=0-0"; it counts as found when the status is 200 or 206 and
# the content type mentions pdf. When DOWNLOAD_ATTACHMENTS is on, found files
# are also downloaded through download_doc.
#
# base_url: - scheme + host to probe; BASE_URL is used when nil or empty
# key:      - KEY value from the document-list URL
# danum:    - application number, possibly URL-encoded
# referer:  - Referer header for the probes (the document-list page)
#
# Returns an Array of {name:, url:, local_url:} hashes (possibly empty).
# Failures of individual probes are logged and skipped, never raised.
def probe_common_docs(base_url:, key:, danum:, referer:)
  # danum may be URL-encoded; normalise first
  danum_raw  = CGI.unescape(danum.to_s)
  # "DA0324/2025" -> "DA0324-2025"
  danum_slug = danum_raw.tr("/", "-")

  names = [
    "Advertised plans",
    "Advertised Plans",
    "Onsite Notice",
    "Onsite notice",
    "Onsite Notice ",  # trailing space variant seen on this site
  ]

  root = base_url.to_s.empty? ? BASE_URL : base_url.to_s.chomp("/")

  # URI.parse rejects bare spaces, so the filename portion must be encoded.
  candidates = names.map do |n|
    filename = "#{danum_slug} - #{n}.pdf"
    "#{root}/eProperty/Publicnotices/#{key}/#{filename.gsub(" ", "%20")}"
  end

  found = []
  candidates.each do |pdf_url|
    begin
      # Ask for a single byte: enough to learn status and content type.
      h = HEADERS.merge("Range" => "bytes=0-0")
      h["Cookie"] = COOKIE_HDR unless COOKIE_HDR.empty?
      res = Http.request(URI.parse(pdf_url), headers: h, jar: {}, referer: referer)
      merge_set_cookie!(res)

      # If the status cannot be read, treat the response as a plain 200.
      code =
        begin
          (res.respond_to?(:code) ? res.code : res["status"]).to_i
        rescue StandardError
          200
        end
      ct = res["content-type"].to_s.downcase
      next unless [200, 206].include?(code) && ct.include?("pdf")

      local_rel = nil
      if DOWNLOAD_ATTACHMENTS
        begin
          saved = download_doc(pdf_url, referer: referer, council_reference: danum_raw, jar: SESSION_JAR)
          local_rel = "/files/launceston/#{safe_name(danum_raw)}/#{File.basename(saved)}"
        rescue StandardError => e
          Log.warn "scraper", "DOC download failed (probe) for #{danum_raw} #{File.basename(pdf_url)}: #{e.class} #{e.message}"
        end
      end
      found << { name: File.basename(pdf_url), url: pdf_url, local_url: local_rel }
    rescue StandardError => e
      Log.warn "scraper", "[launcestoncity] probe failed for #{pdf_url}: #{e.class} #{e.message}"
    end
  end

  found
end

# ---- Main script ----
# Fetches the public-notices list page, upserts one row per notice grid, then
# tries to enrich each row from its details page and document list. Failures
# while enriching a notice are logged and do not stop the run.
html = http_get(URL, jar: SESSION_JAR)
doc  = Nokogiri::HTML(html)

# One upsert is attempted per table.grid found under the results panel.
tables = doc.css("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid table.grid")

kept = 0
tables.each do |t|
  kv = kv_from_table(t)

  council_reference = kv["Application ID"].to_s.strip
  description       = kv["Application Description"].to_s.strip
  address           = kv["Property Address"].to_s.strip
  closing_raw       = kv["Closing Date"].to_s.strip
  closing_date      = Util.parse_aus_date(closing_raw)

  # Link to the per-application details page (nil when the grid has none).
  details_rel = t.at_css("a[href*='PublicNoticeDetails.aspx']")&.[]("href")
  info_url    = absolute(URL, details_rel)

  # A row without an application id or an address is skipped entirely.
  next if council_reference.empty? || address.empty?

  # Base upsert (stores list-page fields; date_received comes from details page later)
  DB.upsert(TABLE, {
    council_reference:   council_reference,
    description:         description,
    address:             address,
    on_notice_to:        closing_date,
    on_notice_to_raw:    closing_raw,
    applicant:           "",
    owner:               ""
  })

  # Enrich from details page + collect documents
  if info_url
    begin
      d_html = http_get(info_url, referer: URL, jar: SESSION_JAR)
	  d_doc  = Nokogiri::HTML(d_html)

      # Flatten all key/value grids into a single map
      # (on a duplicate key the first non-blank value is kept).
      details_kv = {}
      d_doc.css("#ctl00_Content_cusPageComponents_pnlPageComponents table.grid").each do |grid|
        details_kv.merge!(kv_from_table(grid)) { |_k, old, newv| old.to_s.strip.empty? ? newv : old }
      end

      # NOTE(review): of the values read below, only applicant_name,
      # received_raw and received_date reach the second upsert; the others are
      # not referenced again in the visible script.
      applicant_name  = details_kv["Applicant Name(s)"].to_s.strip
      status_text     = details_kv["Status"].to_s.strip
      assigned_off    = details_kv["Assigned Officer"].to_s.strip
      group_text      = details_kv["Group"].to_s.strip
      category_text   = details_kv["Category"].to_s.strip

      received_raw    = details_kv["Application Received"].to_s.strip
      valid_raw       = details_kv["Application Valid"].to_s.strip
      advertised_raw  = details_kv["Advertised On"].to_s.strip
      legal_desc      = details_kv["Property Legal Description"].to_s.strip

      received_date   = Util.parse_aus_date(received_raw)
      valid_date      = Util.parse_aus_date(valid_raw)
      advertised_date = Util.parse_aus_date(advertised_raw)

      # ---- Document listing page (docget.asp -> PNDocumentList) ----
		doc_list_url = nil

		# primary selector
		if (docget = d_doc.at_css("a[href*='docget.asp']"))
		  doc_list_url = absolute(info_url, docget["href"])
		end

		# fallback: some instances link text varies or use different casing/paths
		# (matches an anchor whose text contains both "click" and "document",
		# case-insensitively for those letters)
		if doc_list_url.nil?
		  if (alt = d_doc.at_xpath("//a[contains(translate(text(),'CLICK','click'),'click') and contains(translate(text(),'DOCUMENT','document'),'document')]"))
			doc_list_url = absolute(info_url, alt["href"])
		  end
		end

		documents = [] # [{name:, url:, local_url:}, ...]

		if doc_list_url
		  begin
			# NOTE(review): list_html, list_doc and doc_anchors from this first
			# fetch are reassigned inside the variants loop below before they are
			# read, so its only visible effect is the request itself (and any
			# cookies it sets). Looks redundant - confirm before removing.
			list_html = http_get(doc_list_url, referer: info_url, jar: SESSION_JAR)
			list_doc  = Nokogiri::HTML(list_html)

			doc_anchors = list_doc.css("#PNDocumentList a")
			if doc_anchors.empty?
			  # Fallbacks (case-insensitive) via XPath:
			  doc_anchors = list_doc.xpath(
				"//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
				"//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
			  )
			end

			# No-op here: documents was initialised to [] above.
			documents = [] if documents.nil?
			anchors_added = 0
			used_url = nil
			probe_done = false  # ensure probe_common_docs fires at most once per DA

			# Referers tried, in order, for every candidate URL.
			referers = [
			  info_url, # details page
			  URL,      # notices list page
			  "#{BASE_URL}/eProperty/" # root
			]

			# Try each URL variant with each referer until one yields documents.
			variants_for_doc_list(doc_list_url).each do |candidate_url|
			  break if anchors_added > 0
			  referers.each do |ref|
				break if anchors_added > 0
				begin
				  list_html = http_get(candidate_url, referer: ref, jar: SESSION_JAR)
				  list_doc  = Nokogiri::HTML(list_html)

				  # Strict then fallback selectors
				  doc_anchors = list_doc.css("#PNDocumentList a")
				  if doc_anchors.empty?
					doc_anchors = list_doc.xpath(
					  "//ul[contains(translate(@id,'DOCUMENTLIST','documentlist'),'documentlist')]//a | " \
					  "//a[contains(translate(@href,'PDF','pdf'),'.pdf')]"
					)
				  end

				  doc_anchors.each do |a|
					name = a.text.strip
					href = absolute(candidate_url, a["href"])
					# Anchors whose href cannot be resolved are skipped.
					next if href.nil? || (name.empty? && href.to_s.strip.empty?)

					local_rel = nil
					if DOWNLOAD_ATTACHMENTS
					  begin
						saved = download_doc(href, referer: candidate_url, council_reference: council_reference, jar: SESSION_JAR)
						local_rel = "/files/launceston/#{safe_name(council_reference)}/#{File.basename(saved)}"
					  rescue StandardError => e
						# A failed download still records the document, with local_url nil.
						Log.warn "scraper", "DOC download failed for #{council_reference} #{name}: #{e.class} #{e.message}"
					  end
					end

					documents << { name: (name.empty? ? File.basename(href) : name), url: href, local_url: local_rel }
					anchors_added += 1
				  end

				  # Final fallback: probe known filenames directly (runs at most once per DA)
				  # It fires after the first candidate/referer attempt that yields no
				  # anchors, and always uses the original doc_list_url for KEY/DANUM.
				  if anchors_added == 0 && !probe_done
					probe_done = true
				  begin
					u = URI.parse(doc_list_url)
					q = URI.decode_www_form(u.query || "").to_h
					key   = q["KEY"]
					danum = q["DANUM"] || council_reference
					if key && danum
					  probed = probe_common_docs(
						  base_url: BASE_URL,
						  key: key,
						  danum: danum,
						  referer: doc_list_url
						)
					  documents.concat(probed)
					  anchors_added = probed.size if probed.any?
					end
				  rescue StandardError => e
					Log.warn "scraper", "Probe fallback failed for #{council_reference}: #{e.class} #{e.message}"
				  end
				  end

				  if anchors_added > 0
					used_url = candidate_url
					puts "Docs list for #{council_reference}: #{candidate_url} (referer: #{ref})"
					break
				  else
					# Save the start of the empty response body for inspection. The
					# same file is rewritten on every empty attempt, so it ends up
					# holding the last one tried for this application.
					begin
					  dump_dir = "/app/tmp/launceston_doclist_dumps"
					  FileUtils.mkdir_p(dump_dir)
					  File.write(File.join(dump_dir, "#{safe_name(council_reference)}.html"), list_html[0, 5000])
					rescue StandardError => e
					  Log.warn "scraper", "Failed to write dump for #{council_reference}: #{e.class} #{e.message}"
					end
				  end

				rescue StandardError => e
				  Log.warn "scraper", "Doc list fetch failed for #{council_reference} at #{candidate_url} (referer: #{ref}): #{e.class} #{e.message}"
				end
			  end
			end

			# used_url stays nil when no candidate/referer pair (nor the probe) found anything.
			if used_url.nil?
			  Log.warn "scraper", "Docs page had no usable links for #{council_reference} after variants: #{variants_for_doc_list(doc_list_url).join(' | ')}"
			end


		  rescue StandardError => e
			Log.warn "scraper", "Doc list fetch failed for #{council_reference}: #{e.class} #{e.message}"
		  end
		end

		# The first document doubles as the row's primary document link.
		first_doc_url = documents.first&.dig(:url)
		first_local   = documents.first&.dig(:local_url)
		
		# NOTE(review): this prints the original doc_list_url even when a
		# different variant was the one that worked (that one is printed above).
		puts "Docs list for #{council_reference}: #{doc_list_url}" if doc_list_url
		puts "Found #{documents.size} docs for #{council_reference}" if doc_list_url

		# Second upsert: details-page fields plus document links.
		DB.upsert(TABLE, {
		  council_reference:  council_reference,
		  address:            address,
		  applicant:          applicant_name,
		  date_received:      received_date,
		  date_received_raw:  received_raw,
		  document_url:       first_doc_url,
		  local_document_url: first_local,
		  documents_json:     documents.empty? ? nil : JSON.generate(documents)
		})

    rescue StandardError => e
      # Any failure while enriching leaves the base row from the first upsert in place.
      Log.warn "scraper", "Enrich failed for #{council_reference}: #{e.class} #{e.message}"
    end
  end

  # enrich_after_upsert! is not defined in this file; presumably it comes from
  # ../lib/enrich - verify. It runs whether or not the details page succeeded.
  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
    #info_url: info_url
  )

  puts "Upserted #{council_reference} | #{address} (closes #{closing_raw})"
  kept += 1
end

puts "Done #{TABLE}. Saved #{kept} item(s)."