# benjamin.harris / tas_councils


			
				
					
						
						
# West Tamar Council — Advertised Planning Applications
#
# Source: https://www.wtc.tas.gov.au/advertised-planning-applications/
#
# Page structure — all entries on one page, grouped by h2 headings:
#
#   <h2>92 Sunset Boulevard, Clarence Point</h2>
#   <p>
#     <strong>APPLICANT:</strong> J & E West<br>
#     <strong>PROPOSAL:</strong> Residential - Dwelling & Outbuilding<br>
#     <strong>LOCATION:</strong> 92 Sunset Boulevard, Clarence Point<br>
#     <strong>CLOSES:</strong> 5pm on 16 April 2026
#   </p>
#   <ul>
#     <li>Application Number: PA NO: 2025065</li>
#     <li>Closes 16 April 2026</li>
#   </ul>
#   <p><a href="https://assets.wtc.tas.gov.au/...PA2025065...pdf">Proposal description</a></p>

require "nokogiri"
require "uri"
require "fileutils"

require_relative "../lib/scraper_helpers"
require_relative "../lib/util"
require_relative "../lib/log"

# Destination table name. ENV.fetch raises KeyError when TABLE_NAME is unset,
# so the script stops before doing any work.
TABLE = ENV.fetch("TABLE_NAME")

# Listing page that is fetched and parsed below.
URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"

# Attachments are downloaded only when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"

# Root directory for downloaded files; falls back to /app/downloads.
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# DB is not defined in this file — presumably provided by the required helpers.
DB.ensure_table!(TABLE)

# Turn an arbitrary value into a string usable as a single path component.
#
# Every run of characters other than word characters, "-" and "." is replaced
# by one underscore.
#
# @param s [#to_s] value to sanitize (nil becomes "")
# @return [String] the sanitized string
def safe_name(s)
  text = s.to_s
  text.gsub(/[^\w\-.]+/, "_")
end

# Download one document into DOWNLOAD_DIR/westtamar/<reference>/ and return the
# "/files/westtamar/..." path under which it was stored.
#
# Any StandardError raised along the way is logged as a warning and turned
# into a nil result, so one failed download does not stop the run.
#
# @param url [String, nil] absolute URL of the document
# @param council_reference [String] reference used to name the sub-directory
# @return [String, nil] "/files/westtamar/<ref>/<file>" on success; nil when
#   the URL is blank or the download fails
def download_pdf(url, council_reference)
  return nil if url.to_s.strip.empty?

  ref_dir    = safe_name(council_reference)
  target_dir = File.join(DOWNLOAD_DIR, "westtamar", ref_dir)
  FileUtils.mkdir_p(target_dir)

  # File name comes from the last segment of the URL path.
  fname = safe_name(File.basename(URI.parse(url).path))
  fname = "document.pdf" if fname.empty?

  request_headers = { "Accept" => "application/pdf,*/*", "Referer" => URL }
  body = Http.get(url, headers: request_headers)
  File.binwrite(File.join(target_dir, fname), body)
  puts "  saved #{fname} (#{body.bytesize} bytes)"

  "/files/westtamar/#{ref_dir}/#{fname}"
rescue StandardError => e
  Log.warn "westtamar", "Download failed for #{url}: #{e.class} #{e.message}"
  nil
end

# Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node.
#
# Each <br>-separated line of the paragraph that looks like "Label: value"
# becomes one entry. Labels are matched case-insensitively and stored
# upper-cased, so "Applicant:" and "APPLICANT:" both end up under "APPLICANT".
# Lines that do not look like a label/value pair are ignored.
#
# @param p_node [#inner_html, nil] paragraph node; nil yields an empty hash
# @return [Hash{String => String}] upper-cased label => trimmed value
def parse_strong_labels(p_node)
  return {} unless p_node

  # Replace <br> with newlines so each label/value pair sits on its own line.
  html = p_node.inner_html.gsub(/<br\s*\/?>/i, "\n")

  Nokogiri::HTML.fragment(html).text.split("\n").each_with_object({}) do |line, kv|
    # Collapse any mixed run of whitespace / non-breaking spaces into a single
    # space, so "&nbsp; " does not leave a double space inside a value.
    line = line.gsub(/[\u00a0\s]+/, " ").strip
    next if line.empty?

    # The label is the 2-21 letters/spaces before the first colon. The /i flag
    # keeps this in step with the callers' case-insensitive paragraph lookup;
    # the label is upper-cased below so lookups stay uniform.
    m = line.match(/\A(?<label>[A-Z][A-Z\s]{1,20}):\s*(?<value>.+)\z/i)
    kv[m[:label].strip.upcase] = m[:value].strip if m
  end
end

# Fetch and parse the listing page, then build one item hash per <h2> section
# that yields a council reference.
html  = Http.get(URL)
doc   = Nokogiri::HTML(html)
items = []

doc.css("h2").each do |heading|
  # Gather the element siblings after this heading, stopping at the next <h2>.
  section = []
  node = heading.next_sibling
  until node.nil? || (node.element? && node.name == "h2")
    section << node if node.element?
    node = node.next_sibling
  end
  next if section.empty?

  # Paragraph carrying the APPLICANT/PROPOSAL/LOCATION/CLOSES labels.
  label_p = section.find do |el|
    el.name == "p" && el.text.match?(/APPLICANT|PROPOSAL|LOCATION|CLOSES/i)
  end
  labels = parse_strong_labels(label_p)

  # First <ul> of the section; its text is searched for the application number.
  list_node = section.find { |el| el.name == "ul" }
  list_text = list_node&.text.to_s.gsub(/\u00a0|\s+/, " ")

  # Prefer a PDF link inside the list; otherwise take the first PDF link found
  # anywhere in the section.
  pdf_anchor = ->(a) { a["href"].to_s.match?(/\.pdf/i) }
  pdf_link   = list_node&.css("li a[href]")&.find(&pdf_anchor)
  pdf_link ||= section.flat_map { |el| el.css("a[href]").to_a }.find(&pdf_anchor)

  # Reference number: "PA NO: 2025065" in the list text, else "PA<digits>" in
  # the PDF href. Sections without either are skipped.
  number = list_text[/PA\s*(?:NO:?)?\s*(\d{5,})/i, 1]
  number ||= pdf_link["href"].to_s[/PA(\d{5,})/i, 1] if pdf_link
  next unless number

  ref = "PA #{number}"

  # Address: LOCATION label, then ADDRESS label, then the heading text.
  address = labels["LOCATION"] || labels["ADDRESS"] || heading.text.gsub(/\u00a0|\s+/, " ").strip
  next if address.empty?

  applicant   = labels["APPLICANT"].to_s
  proposal    = labels["PROPOSAL"].to_s
  description = proposal.empty? ? "Development Application" : proposal

  # "5pm on 16 April 2026" → "16 April 2026"; when the label gives nothing,
  # fall back to the "Closes 16 April 2026" list item.
  closes_raw = labels["CLOSES"].to_s.sub(/\A.*?\bon\s+/i, "").strip
  if closes_raw.empty?
    listed = list_text[/Closes?\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})/i, 1]
    closes_raw = listed if listed
  end
  on_notice_to = Util.parse_aus_date(closes_raw)

  document_url =
    if pdf_link
      abs_url(URL, pdf_link["href"].to_s)
    else
      ""
    end

  items << {
    council_reference: ref,
    address:           address,
    description:       description,
    applicant:         applicant,
    on_notice_to:      on_notice_to,
    on_notice_to_raw:  closes_raw,
    document_url:      document_url
  }
end

puts "Found #{items.length} item(s) for #{TABLE}"

# Store every scraped item; the attachment is downloaded first, but only when
# DOWNLOAD_ATTACHMENTS is enabled.
items.each do |item|
  doc_url  = item[:document_url]
  saved_at = (download_pdf(doc_url, item[:council_reference]) if DOWNLOAD_ATTACHMENTS)

  row = {
    council_reference: item[:council_reference],
    address:           item[:address],
    description:       item[:description],
    applicant:         item[:applicant],
    on_notice_to:      item[:on_notice_to],
    on_notice_to_raw:  item[:on_notice_to_raw],
    owner:             ""
  }
  extras = {
    document_url:       doc_url,
    local_document_url: saved_at
  }

  upsert_and_enrich!(table: TABLE, row: row, extras: extras)
end

puts "Done #{TABLE}. Saved #{items.length} item(s)."