# benjamin.harris / tas_councils
# Kentish Council — Advertised / Planning Applications (site page, not PlanBuild)

require "nokogiri"
require "uri"
require "cgi"

require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"
# Destination table name, read from the environment. ENV.fetch raises KeyError
# when TABLE_NAME is unset, so the script stops before doing any work.
TABLE = ENV.fetch("TABLE_NAME")              # run_all.sh -> da_kentish
# Council listing page that this script fetches and parses.
URL   = "https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp"

# NOTE(review): DB is not required directly in this file — presumably it is
# loaded by one of the lib requires above; confirm.
DB.ensure_table!(TABLE)

# Resolves a link target against the URL of the page it was found on.
#
# @param base [String] absolute URL of the page containing the link
# @param href [String, nil] raw href attribute value
# @return [String, nil] nil for a blank href; otherwise the absolute URL, or the
#   stripped href unchanged when it cannot be resolved at all
def abs_url(base, href)
  h = href.to_s.strip
  return nil if h.empty?
  return h if h.start_with?("http://", "https://")

  begin
    URI.join(base, h).to_s
  rescue URI::InvalidURIError
    # Hrefs derived from file names often contain spaces or non-ASCII
    # characters, which URI rejects. Percent-encode those bytes and retry so the
    # stored link is still absolute. "%" is inside the kept range, so escapes
    # that are already present survive untouched.
    escaped = h.gsub(/[^!-~]/) { |ch| ch.bytes.map { |b| format("%%%02X", b) }.join }
    URI.join(base, escaped).to_s
  end
rescue URI::Error
  # Best effort: covers a still-unparseable href as well as an unusable base
  # (URI::BadURIError); hand back the raw href rather than aborting the run.
  h
end

# Kentish council references use the K-DA{number}/{year} format, e.g. K-DA016/2026.
# The pattern is case-insensitive and only accepts years of the form 20xx.
REF_RX = /\bK-DA\d+\/20\d{2}\b/i

# Extracts advertised development applications from the council listing page.
#
# Each DA is a <li class="generic-list__item"> whose title link points at the PDF.
# Link text: "K-DA016/2026 41 George Road, Nook - proposed 2 Lot Subdivision (submissions by 21/04/2026)"
#
# @param doc [#css] parsed page (a Nokogiri document at the call site)
# @param base_url [String] page URL, used to make the PDF link absolute
# @return [Array<Hash>] one hash per listing that carries a council reference and
#   a non-empty address, with keys :council_reference, :address, :description,
#   :on_notice_to, :on_notice_to_raw and :document_url
def parse_items(doc, base_url)
  doc.css("li.generic-list__item").each_with_object([]) do |li, rows|
    link = li.at_css("h3.generic-list__title a, a[href$='.pdf']")
    next unless link

    # Drop the "(PDF File ...)" suffix and collapse whitespace. [[:space:]] also
    # matches non-breaking spaces, which \s misses and which would otherwise
    # defeat the \s-based patterns further down.
    title = link.text.gsub(/\(PDF File[^)]*\)/i, "").gsub(/[[:space:]]+/, " ").strip

    ref_match = title.match(REF_RX)
    next unless ref_match

    # Everything except the reference. A separator left directly after the
    # reference ("K-DA016/2026 - 41 ...") is not part of the address.
    rest = "#{ref_match.pre_match}#{ref_match.post_match}".sub(/\A[\s:\-\u2013\u2014]+/, "").strip

    # On-notice clause: "(submissions by 21/04/2026)"
    notice_rx = /\s*\(submissions\s+by\s+([^)]+)\)/i
    on_raw    = rest[notice_rx, 1]&.strip || ""
    body      = rest.sub(notice_rx, "").strip

    # Split "address - description" on the first spaced dash; en and em dashes
    # are accepted because web editors frequently substitute them for "-".
    address, description =
      if (m = body.match(/\A(.+?)\s+[-\u2013\u2014]\s+(.+)\z/))
        [m[1].strip, m[2].strip]
      else
        [body, "Development Application"]
      end
    next if address.empty?

    rows << {
      council_reference: ref_match[0],
      address:           address[0, 255],
      description:       description,
      on_notice_to:      Util.parse_aus_date(on_raw),
      on_notice_to_raw:  on_raw,
      document_url:      abs_url(base_url, link["href"].to_s)
    }
  end
end

# Fetch the listing page. A failed request is logged and ends the run with
# exit status 1.
html =
  begin
    Http.get(URL)
  rescue StandardError => e
    Log.warn("kentish", "Failed to fetch #{URL}: #{e.class} #{e.message}")
    exit 1
  end

# The council site sits behind a Cloudflare JavaScript challenge. A blocked
# response is an interstitial page titled "Just a moment..." that carries no
# DA data. Kentish DAs are also published on PlanBuild (council code KEN), so
# planbuild.rb covers this council independently; a block here therefore ends
# the run with exit status 0 rather than an error.
challenge_markers = ["Just a moment", "Enable JavaScript and cookies"]
if challenge_markers.any? { |marker| html.include?(marker) }
  Log.warn(
    "kentish",
    "Site is returning a Cloudflare challenge page — cannot scrape without browser-level JS execution. DAs for this council are available via planbuild.rb (council code KEN)."
  )
  puts "Done #{TABLE}. Saved 0 item(s) — site blocked by Cloudflare."
  exit 0
end

# Parse the listing, then upsert and enrich every application found. Errors
# are logged per item so one failing row does not stop the remaining ones.
page  = Nokogiri::HTML(html)
items = parse_items(page, URL)

puts "Found #{items.length} item(s) for #{TABLE}"

# The block yields true only for items that were stored, enriched and logged
# without raising, so the count is the number of successful saves.
saved = items.count do |item|
  ref     = item[:council_reference]
  address = item[:address]

  begin
    DB.upsert(TABLE, {
      description:       item[:description],
      on_notice_to:      item[:on_notice_to],
      on_notice_to_raw:  item[:on_notice_to_raw],
      address:           address,
      council_reference: ref,
      document_url:      item[:document_url],
      applicant:         "",
      owner:             ""
    })

    enrich_after_upsert!(table: TABLE, council_reference: ref, address: address)

    Log.info("kentish", "Upserted #{ref} -> #{address}")
    true
  rescue StandardError => e
    Log.warn("kentish", "DB error for #{ref}: #{e.class} #{e.message}")
    false
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."