# Flinders Council — Current Advertising scraper (site page, not PlanBuild)
#
# Fetches the council's "current advertising" page, collects every link to an
# advertised PDF, derives a council reference from the link text or the file
# name, upserts one row per PDF into TABLE and stores the PDF link in the
# optional document_url column.
require "nokogiri"
require "cgi"
require "uri" # URI.join is used in abs_url; do not rely on a transitive require
require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/enrich"

TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets this from filename: da_flinders
URL   = "https://www.flinders.tas.gov.au/current-advertising"

DB.ensure_table!(TABLE)

# Optional column to keep the PDF link.
# Any failure here is only warned about, so the scrape continues without the
# column (the per-row UPDATE below is rescued as well).
begin
  DB.client.query(
    "ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url VARCHAR(1024) NULL"
  )
rescue => e
  warn "document_url add skipped: #{e.class} #{e.message}"
end

# Resolves +href+ against +base+ and returns the absolute URL as a String.
#
# @param base [String] URL of the page the link was found on
# @param href [String, nil] raw href attribute value
# @return [String] "" when href is blank; the joined URL otherwise; the raw
#   href (as a String) when joining raises
def abs_url(base, href)
  return "" if href.to_s.strip.empty?

  URI.join(base, href).to_s
rescue StandardError
  # Best effort: a malformed href is kept verbatim rather than aborting the run.
  href.to_s
end

# Extracts a council reference from free text (link text or a file name) and
# returns it in the canonical form "DA YYYY / NNNNN".
#
# The text is CGI-unescaped first, so URL-encoded file names are handled.
#
# NOTE(review): spaced-slash input is returned exactly as before, but inputs
# such as "DA 2025/00017" or "da2025/00017" are now normalised to
# "DA 2025 / 00017" (the previous `.sub(" / ", " / ")` was a no-op). Rows
# already stored under an un-normalised reference may need a one-off cleanup.
#
# @param text [String, nil]
# @return [String, nil] normalised reference, or nil when none is found
def extract_ref_from(text)
  s = CGI.unescape(text.to_s)

  # Pattern 1: "DA 2025 / 00017" or "DA 2025/00017"
  if (m = s.match(%r{DA\s*(20\d{2})\s*/\s*(\d{3,})}i))
    return "DA #{m[1]} / #{m[2]}"
  end

  # Pattern 2: "DA202500017" -> "DA 2025 / 00017"
  if (m = s.match(/DA(20\d{2})(\d{5})/i))
    return "DA #{m[1]} / #{m[2]}"
  end

  nil
end

html = Http.get(URL)
doc  = Nokogiri::HTML(html)

# Pick all advertised PDFs listed on the page
links = doc.css("a").select do |a|
  href = a["href"].to_s
  href.match?(%r{/client-assets/images/Development/Downloads/Advertising/}i) &&
    href.downcase.end_with?(".pdf")
end

puts "Found #{links.length} items for #{TABLE}"

saved = 0

links.each do |a|
  text = a.text.strip.gsub(/\s+/, " ")
  pdf  = abs_url(URL, a["href"])

  # Use link text first, fall back to file name as address
  address = text.empty? ? File.basename(pdf).sub(/\.pdf\z/i, "") : text

  # Council reference from link text or filename
  ref = extract_ref_from(text) || extract_ref_from(File.basename(pdf))

  # Description and dates are usually inside the PDF, so keep minimal fields here
  description       = "Development Application"
  date_received_raw = ""
  date_received     = nil

  # A row needs both an address and a reference; skip links that yield neither.
  next if address.empty? || ref.nil?

  DB.upsert(TABLE, {
    description: description,
    date_received: date_received,
    date_received_raw: date_received_raw,
    address: address,
    council_reference: ref,
    applicant: "",
    owner: ""
  })

  # Fixed: this previously passed an undefined local `council_reference`,
  # which raised NameError on the first accepted link.
  enrich_after_upsert!(
    table: TABLE,
    council_reference: ref,
    address: address
  )

  # Record the PDF link. Best effort: a Mysql2 error (e.g. the optional column
  # is missing) is only warned about and the row still counts as saved.
  begin
    upd = DB.client.prepare(
      "UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?"
    )
    upd.execute(pdf, ref, address)
  rescue Mysql2::Error => e
    warn "[flinders] db update skipped for #{ref}: #{e.message}"
  end

  puts "Upserted #{ref} -> #{address}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."