# benjamin.harris / tas_councils
# Clarence City Council — Advertised Plans
# Source list: https://www.ccc.tas.gov.au/development/advertised-plans/

require "nokogiri"
require "cgi"
require "uri"
require "date"
require "fileutils"
require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/geocode"
require_relative "../lib/enrich"

# Destination table name. Required: ENV.fetch raises KeyError when TABLE_NAME
# is not set (run_all.sh -> da_clarence).
TABLE = ENV.fetch("TABLE_NAME")

# Listing page that is fetched and parsed for advertised applications.
URL = "https://www.ccc.tas.gov.au/development/advertised-plans/"

# Attachments are saved locally only when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"

# Root directory for saved attachments; "/app/downloads" when DOWNLOAD_DIR is unset.
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# Runs at load time, before any scraping.
# NOTE(review): presumably creates the table when missing — confirm in ../lib/db.
DB.ensure_table!(TABLE)

# Resolve +href+ against +base+ and return the absolute URL as a String.
#
# Whitespace around +href+ is trimmed first: scraped attributes often carry
# stray spaces or newlines, and URI.join rejects them.
#
# @param base [String] absolute URL the link was found on
# @param href [String, nil] raw href attribute value
# @return [String] the absolute URL; "" when +href+ is nil or blank; the
#   trimmed +href+ itself when the pair cannot be parsed or joined
def abs_url(base, href)
    link = href.to_s.strip
    return "" if link.empty?

    URI.join(base, link).to_s
rescue URI::Error
    # URI::Error covers InvalidURIError (unparseable text) and BadURIError
    # (both sides relative); in either case hand back what we were given.
    link
end

# Pull the value that follows "Application Number:" (case-insensitive) out of
# +text+. Returns "" when the label is absent; nil input is treated as "".
def extract_app_number(text)
    found = /Application\s*Number:\s*([A-Za-z0-9\/\-\._]+)/i.match(text.to_s)
    found ? found[1].strip : ""
end

# Return the text that follows "Closes:" (case-insensitive) up to the end of
# the line or the next "<", trimmed. Returns "" when the label is absent.
def extract_close_raw(text)
    hit = text.to_s.match(/Closes:\s*([^\n\r<]+)/i)
    return "" unless hit

    hit[1].strip
end

# Find the first date-looking token in +s+ and return it as written.
#
# Formats are tried in priority order (not by position in the string):
#   1. d/m/y with slashes       e.g. "12/05/2025"
#   2. day month-name year      e.g. "12 May 2025"
#   3. month-name day[,] year   e.g. "May 12, 2025"
# Returns "" when none of them is present.
def parse_date_token(s)
    source = s.to_s
    formats = [
        /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/,
        /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/,
        /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
    ]

    formats.each do |format|
        token = source[format, 1]
        return token if token
    end

    ""
end

    # Heuristic: does +s+ read like a street address?
    # It must contain a 1-4 digit number followed by whitespace and a word, and
    # a comma followed by at least two capital letters (e.g. ", HOWRAH").
    # Returns the match index of the comma part (truthy) or nil.
    def looks_like_address(s)
        street_number_at = s =~ /\d{1,4}\s+\S+/
        return street_number_at unless street_number_at

        s =~ /,\s*[A-Z][A-Z]+/
    end

    # Break a heading into its trimmed segments.
    # Segments are separated by a whitespace-surrounded en dash; when that
    # yields fewer than two segments, a whitespace-surrounded hyphen is used.
    def split_title(title)
        by_en_dash = title.split(/\s+–\s+/) # en dash
        segments   = by_en_dash.length >= 2 ? by_en_dash : title.split(/\s+-\s+/)
        segments.map(&:strip)
    end

    # Choose the address segment of a split heading, in order of preference:
    # the first segment that looks_like_address, else the first segment
    # containing a digit, else the second segment ("" when there is none).
    def pick_address_from_title(parts)
        address_like = parts.find { |segment| looks_like_address(segment) }
        return address_like if address_like

        with_digit = parts.find { |segment| segment =~ /\d/ }
        return with_digit if with_digit

        parts[1].to_s
    end

    # Choose the description segment of a split heading: the first segment
    # that is neither +code+ nor +address+ and is longer than three characters.
    # Returns "" when no segment qualifies.
    def pick_description_from_title(parts, code, address)
        candidate = parts.find do |segment|
            next false if segment == code || segment == address

            segment.length > 3
        end
        candidate.to_s
    end

    # Make +s+ safe to use as a single file-system path component.
    #
    # Every run of characters other than word characters, "-" and "." is
    # collapsed into one "_". A result made only of dots ("." or "..") would
    # be a directory reference rather than a name, letting scraped text step
    # out of the download folder, so in that case the dots are replaced too.
    #
    # @param s [#to_s] raw name (nil is treated as "")
    # @return [String] sanitised name
    def safe_name(s)
        name = s.to_s.gsub(/[^\w\-.]+/, "_")
        name.match?(/\A\.+\z/) ? name.tr(".", "_") : name
    end

        # Download the PDF (if enabled) and return a web path like:
        # /downloads/clarence/<council_reference>/<filename.pdf>
        #
        # The file is written to DOWNLOAD_DIR/clarence/<safe reference>/<safe file name>,
        # with ".pdf" appended when the URL's file name does not already end in it.
        #
        # @param url [String, nil] document URL
        # @param council_reference [String] application number, used as the folder name
        # @return [String, nil] the web path on success; nil when downloads are
        #   disabled, +url+ is blank, the response body is empty, or any
        #   StandardError occurs (failures are logged via Log.warn, never raised)
        def download_pdf(url, council_reference)
            return nil unless DOWNLOAD_ATTACHMENTS && !url.to_s.strip.empty?

            ref_dir = safe_name(council_reference)
            folder  = File.join(DOWNLOAD_DIR, "clarence", ref_dir)
            # Covered by the method-level rescue: a folder that cannot be created
            # is a failed download, not a reason to abort the whole scrape.
            FileUtils.mkdir_p(folder)

            # Prefer Http.get_response; fall back to Http.get on any StandardError.
            res = begin
                Http.get_response(url)
            rescue StandardError
                Http.get(url)
            end
            body = res.respond_to?(:body) ? res.body : res.to_s

            # Do not save (or advertise) a zero-byte document.
            if body.to_s.empty?
                Log.warn "scraper", "PDF download failed for #{url}: empty response body"
                return nil
            end

            fname = safe_name(File.basename(URI.parse(url).path))
            fname += ".pdf" unless fname.downcase.end_with?(".pdf")
            path = File.join(folder, fname)

            File.binwrite(path, body)
            puts "Saved PDF #{path}"

            # Web-accessible path (served by your web container)
            "/downloads/clarence/#{ref_dir}/#{fname}"
        rescue StandardError => e
            Log.warn "scraper", "PDF download failed for #{url}: #{e.class} #{e.message}"
            nil
        end

        # --- Scrape: fetch the listing page and collect one hash per heading ---
        list_html = Http.get(URL)
        doc       = Nokogiri::HTML(list_html)

        items = []

        # Headings tend to be h2/h3, followed by blocks that contain
        # “Closes:” and “Application Number:” and a PDF link.
        doc.css("h2, h3").each do |h|
            title = h.text.to_s.strip
            next if title.empty?

            texts   = []
            pdf_url = ""
            node    = h
            # Walk at most 12 following sibling elements, stopping early at the
            # next h2/h3 (the start of the next entry) or at the end of the parent.
            12.times do
                node = node.next_element
                break if node.nil? || node.name.match?(/\Ah[23]\z/i)
                texts << node.text.to_s.strip

                # Look at every link inside the sibling, not only the first one,
                # so a PDF that follows some other link is still picked up.
                # When several siblings hold a document link, the last one wins.
                doc_link = node.css("a[href]").find do |a|
                    href = a["href"].to_s
                    href.match?(/\.pdf($|\?)/i) || href.include?("assets.ccc.tas.gov.au")
                end
                pdf_url = abs_url(URL, doc_link["href"].to_s) if doc_link
            end
            detail_text = texts.join("\n")

            app_no_raw = extract_app_number(detail_text)
            closes_raw = extract_close_raw(detail_text)
            closes_tok = parse_date_token(closes_raw)
            on_notice  = Util.parse_aus_date(closes_tok)

            parts   = split_title(title)
            code    = parts.first.to_s
            address = pick_address_from_title(parts).to_s
            desc    = pick_description_from_title(parts, code, address)
            desc    = "Development Application" if desc.strip.empty?

            # Prefer the number found in the detail text; fall back to the
            # first heading segment.
            council_reference = app_no_raw.empty? ? code : app_no_raw
            next if council_reference.strip.empty? || address.strip.empty?

            items << {
                council_reference: council_reference,
                address: address,
                description: desc,
                on_notice_raw: closes_tok,
                on_notice: on_notice,
                pdf: pdf_url,
                title_reference: title
            }
        end

        # Keep the first occurrence of each (reference, address) pair.
        items.uniq! { |r| [r[:council_reference], r[:address]] }

        puts "Found #{items.length} item(s) for #{TABLE}"

        # --- Persist: upsert every plausible item, enrich it, attach its PDF ---
        # Every row written in this run is stamped with today's date.
        date_received = Date.today

        items.each do |r|
            cr   = r[:council_reference].to_s
            addr = r[:address].to_s

            # Skip site promo / competitions that occasionally appear as a “heading”
            next if cr =~ /turn your two cents/i || r[:title_reference].to_s =~ /two cents/i

            # Skip if we didn’t get a sensible address
            next if addr.strip.empty? || addr == cr

            # Clarence app numbers look like PDPLANPMTD-2025/054004 etc
            next unless cr =~ /\APDPLAN[A-Z]*-\d{4}\/\d+\z/

            DB.upsert(TABLE, {
                description: r[:description],
                date_received: date_received,
                on_notice_to: r[:on_notice],
                on_notice_to_raw: r[:on_notice_raw],
                address: addr,
                council_reference: cr,
                applicant: "",
                owner: ""
            })

            enrich_after_upsert!(
                table: TABLE,
                council_reference: cr,
                address: addr
            )

            # Try to download and set local_document_url
            local_doc_url = download_pdf(r[:pdf], cr)

            # Best-effort update of the extra columns: any failure is logged
            # and the loop moves on to the next item.
            upd = nil
            begin
                upd = DB.client.prepare(
                    "UPDATE `#{DB.client.escape(TABLE)}` " \
                        "SET document_url = ?, " \
                        "    local_document_url = COALESCE(?, local_document_url), " \
                        "    on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? " \
                        "WHERE council_reference = ? AND address = ?"
                )
                upd.execute(r[:pdf], local_doc_url, r[:on_notice], r[:on_notice_raw], r[:title_reference], cr, addr)
            rescue StandardError => e
                Log.warn "scraper", "Extras update skipped for #{cr}: #{e.class} #{e.message}"
            ensure
                # One statement is prepared per item; release it now instead of
                # leaving it for the garbage collector.
                # NOTE(review): assumes the driver's statement exposes #close
                # (mysql2 does) — guarded with respond_to? for other drivers.
                upd.close if upd.respond_to?(:close)
            end

            puts "Upserted #{cr} -> #{addr}  saved: #{local_doc_url ? 1 : 0}"
        end

        puts "Done #{TABLE}."