benjamin.harris
/
tas_councils


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
							# Northern Midlands Council — Advertised / Planning Applications (site page)

require "nokogiri"
require "uri"
require "cgi"

require_relative "../lib/scraper_helpers"
require_relative "../lib/log"
TABLE = ENV.fetch("TABLE_NAME")                    # run_all.sh -> da_northernmidlands
URL   = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"

DB.ensure_table!(TABLE)

# “DA 2025/00123”, “DA2025/00123”, “Application No. DA 2025/123”
REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i

def extract_ref(str)
  s = CGI.unescape(str.to_s)
  if (m = s.match(REF_RX1))
    return "DA #{m[1]} / #{m[2]}"
  end
  if (m = s.match(REF_RX2))
    return "DA #{m[1]} / #{m[2]}"
  end
  nil
end

DATE_RX = /
  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
   \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
   \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
/x

def extract_on_notice_raw(text)
  s = text.to_s.gsub(/\s+/, " ")
  if (m = s.match(/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
    if (d = m[2].match(DATE_RX))
      return d[1]
    end
  end
  if (m = s.match(/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
    if (d = m[2].match(DATE_RX))
      return d[1]
    end
  end
  if (d = s.match(DATE_RX))
    return d[1]
  end
  ""
end


def nearest_context_text(a)
  host = a.ancestors("li, p, div, tr, article").first || a.parent
  host ? host.text.to_s.strip.gsub(/\s+/, " ") : ""
end

def parse_items(doc, base_url)
  rows = []

  # 1) Obvious list items or rows with PDFs or application keywords
  anchors = doc.css("a").select { |a|
    href = a["href"].to_s
    a.text.to_s.strip.match?(/application|permit|planning|advertis/i) || href.downcase.end_with?(".pdf")
  }

  anchors.each do |a|
    href = a["href"].to_s
    link_text = a.text.to_s.strip
    document_url = abs_url(base_url, href)
    ctx = nearest_context_text(a)

    # Title to keep, if present
    title_reference = link_text.empty? ? ctx[0,200] : link_text

    text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")

    # Address: prefer the link text, else the surrounding text slice
    address = if link_text.length >= 6
      link_text
    else
      ctx[0, 140]
    end

    # Reference from text or file name
    ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url))

    # On-notice
    on_raw = extract_on_notice_raw(text_for_parse)
    on_dt  = Util.parse_aus_date(on_raw)

    # Description
    description = if text_for_parse =~ /proposal\s*[:\-]\s*([^—\-]+)\b/i
      $1.strip
    else
      "Development Application"
    end

    next if ref.nil? || address.to_s.strip.empty?

    rows << {
      council_reference: ref,
      address: address.to_s.strip,
      description: description,
      date_received: on_dt,
      date_received_raw: on_raw,
      document_url: document_url,
      title_reference: title_reference
    }
  end

  # 2) If the page uses a two-column details table, pick that up too
  doc.css("table").each do |t|
    heads = t.css("th").map { |th| th.text.strip.downcase }
    next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) }

    t.css("tr").each do |tr|
      cells = tr.css("td")
      next unless cells.length >= 2
      row_text = tr.text.to_s.strip.gsub(/\s+/, " ")
      ref = extract_ref(row_text)
      addr = row_text[/address[:\s]+(.+?)(?:\s{2,}|$)/i, 1] || row_text[0, 140]
      on_raw = extract_on_notice_raw(row_text)
      on_dt  = Util.parse_aus_date(on_raw)
      next if ref.nil? || addr.to_s.strip.empty?
      rows << {
        council_reference: ref,
        address: addr.to_s.strip,
        description: "Development Application",
        date_received: on_dt,
        date_received_raw: on_raw,
        document_url: "",
        title_reference: row_text[0,200]
      }
    end
  end

  rows.uniq { |r| [r[:council_reference], r[:address]] }
end

if URL.empty?
  Log.warn "scraper", "NORTHERN_MIDLANDS_URL is not set. Example:\n  ONLY=northernmidlands NORTHERN_MIDLANDS_URL='https://.../advertised-applications' docker compose run --rm scraper /app/run_all.sh"
  exit 0
end

begin
  html = if URL.include?("/eservice/")
    # Some councils use ePathway, which needs a cookie-warmed session
    Http.dorset_session_get(URL)
  else
    Http.get(URL)
  end
rescue StandardError => e
  Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
  exit 1
end

doc = Nokogiri::HTML(html)
items = parse_items(doc, URL)

puts "Found #{items.length} item(s) for #{TABLE}"

items.each do |r|
  upsert_and_enrich!(
    table: TABLE,
    row: {
      description: r[:description],
      date_received: r[:date_received],
      date_received_raw: r[:date_received_raw],
      address: r[:address],
      council_reference: r[:council_reference],
      applicant: "",
      owner: ""
    },
    extras: {
      document_url:     r[:document_url],
      on_notice_to:     r[:date_received],
      on_notice_to_raw: r[:date_received_raw],
      title_reference:  r[:title_reference]
    }
  )
end

puts "Done #{TABLE}."