# Break O'Day Council — Advertised Development Applications
# List page: https://www.bodc.tas.gov.au/council/advertised-development-applications/
#
# Fetches the list page, locates the applications table, and upserts one row
# per application into TABLE. Optionally downloads each linked PDF when
# DOWNLOAD_ATTACHMENTS=1.
#
# NOTE(review): DB, Http, Log, Util and enrich_after_upsert! are not defined in
# this file; presumably they come from the ../lib files required below — confirm.

require "nokogiri"
require "cgi"
require "uri"
require "fileutils" # download_pdf calls FileUtils.mkdir_p
require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"

TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_break_oday
URL = "https://www.bodc.tas.gov.au/council/advertised-development-applications/"
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"

DB.ensure_table!(TABLE)

# Resolves +href+ against +base+.
#
# @param base [String] absolute URL of the page the link was found on
# @param href [String, nil] raw href attribute value
# @return [String] absolute URL; "" when href is blank; the raw href (as a
#   string) when URI.join cannot resolve it
def abs_url(base, href)
  return "" if href.to_s.strip.empty?

  URI.join(base, href).to_s
rescue StandardError
  href.to_s
end

# Accepts "DA 2025/123", "DA2025/0123", "DA 054-2026", etc.
# Every pattern exposes the same named captures (+year+, +num+), so the
# normalisation in extract_ref does not depend on capture order or on the
# textual form of the pattern.
REF_RXES = [
  %r{\bDA\s*(?<year>20\d{2})\s*/\s*(?<num>[A-Za-z0-9\-_.]+)}i, # DA 2025/123
  %r{\bDA(?<year>20\d{2})\s*[-/]?\s*(?<num>[0-9]{3,})\b}i,     # DA2025-0123
  %r{\bDA\s*(?<num>[0-9]{1,4})\s*-\s*(?<year>20\d{2})\b}i      # DA 054-2026
].freeze

# Header keywords that identify the applications table.
LIST_HEADER_HINTS = %w[closing pdf name].freeze

# Finds a DA reference in +str+ and normalises it to "DA YYYY / NNN".
# The input is CGI-unescaped first so that URL-encoded file names can match.
# Patterns are tried in REF_RXES order; the first match wins.
#
# @param str [String, nil] free text, link text, or a (possibly encoded) file name
# @return [String, nil] normalised reference, or nil when no pattern matches
def extract_ref(str)
  text = CGI.unescape(str.to_s)
  REF_RXES.each do |rx|
    m = text.match(rx)
    return "DA #{m[:year]} / #{m[:num]}" if m
  end
  nil
end

# Returns the first <table> whose <thead> has a header cell mentioning
# "closing", "pdf" or "name" (case-insensitive), or nil when none does.
#
# @param doc [Nokogiri::HTML::Document]
def find_list_table(doc)
  doc.css("table").find do |t|
    heads = t.css("thead th").map { |th| th.text.strip.downcase }
    heads.any? { |h| LIST_HEADER_HINTS.any? { |hint| h.include?(hint) } }
  end
end

# Replaces every run of characters other than word characters, "-" and "."
# with a single underscore, for use as a file or directory name.
def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")

# Downloads the PDF at +url+ into DOWNLOAD_DIR/breakoday/<reference>/.
#
# Best-effort: any StandardError (directory creation, HTTP, URL parsing, file
# write) is logged as a warning and nil is returned, so a failed attachment
# never aborts the scrape.
#
# @param url [String] absolute document URL
# @param council_reference [String] reference used for the sub-folder name
# @return [String, nil] "/files/breakoday/<reference>/<file>" on success; nil
#   when downloads are disabled, the URL is blank, or the download failed
def download_pdf(url, council_reference)
  return nil unless DOWNLOAD_ATTACHMENTS && !url.to_s.strip.empty?

  ref_dir = safe_name(council_reference)
  folder = File.join(DOWNLOAD_DIR, 'breakoday', ref_dir)
  FileUtils.mkdir_p(folder)

  # Prefer Http.get_response; fall back to Http.get if that call raises.
  res = begin
    Http.get_response(url)
  rescue StandardError
    Http.get(url)
  end
  # If Http.get already gives us the body, use it directly
  body = res.respond_to?(:body) ? res.body : res.to_s

  fname = safe_name(File.basename(URI.parse(url).path))
  fname += ".pdf" unless fname.downcase.end_with?(".pdf")
  path = File.join(folder, fname)
  File.binwrite(path, body)
  puts "Saved PDF #{path}"

  # return web-accessible relative path if needed
  "/files/breakoday/#{ref_dir}/#{fname}"
rescue StandardError => e
  Log.warn "scraper", "PDF download failed for #{url}: #{e.class} #{e.message}"
  nil
end

html = Http.get(URL)
doc = Nokogiri::HTML(html)
table = find_list_table(doc) || doc.at_css("table")
unless table
  puts "No table found on #{URL}"
  exit 0
end

# Work out the column indexes by header text if possible; otherwise fall back
# to the positional defaults name, address, closing date, pdf.
headers = table.css("thead th").map { |th| th.text.strip.downcase }
idx_name  = headers.index { |h| h.include?("name") } || 0
idx_addr  = headers.index { |h| h.include?("address") } || 1
idx_close = headers.index { |h| h.include?("closing") || h.include?("notice") } || 2
idx_pdf   = headers.index { |h| h.include?("pdf") } || 3

rows = table.css("tbody tr")
puts "Found #{rows.length} row(s) for #{TABLE}"

saved = 0
rows.each do |tr|
  tds = tr.css("td")
  next if tds.empty?

  name_text = tds[idx_name]&.text&.strip.to_s
  address   = tds[idx_addr]&.text&.strip.to_s
  close_raw = tds[idx_close]&.text&.strip.to_s
  pdf_cell  = tds[idx_pdf]
  pdf_a     = pdf_cell&.at_css("a[href]")
  document_url = pdf_a ? abs_url(URL, pdf_a["href"].to_s) : ""

  # Look for the reference in the PDF cell text, then the document file name,
  # then anywhere in the row.
  row_text = tr.text.to_s.gsub(/\s+/, " ")
  raw_ref = extract_ref(pdf_cell&.text) ||
            extract_ref(File.basename(document_url)) ||
            extract_ref(row_text)
  # "DA 2025 / 123" -> "DA_2025_123"
  council_reference = raw_ref&.gsub(/\s*\/\s*/, "_")&.gsub(/\s+/, "_")

  # Rows without an address or a recognisable reference are skipped.
  next if address.empty? || council_reference.nil?

  on_notice = Util.parse_aus_date(close_raw)
  description = name_text.empty? ? "Development Application" : name_text
  local_doc_url = download_pdf(document_url, council_reference)

  DB.upsert(TABLE, {
    description: description,
    address: address,
    council_reference: council_reference,
    applicant: "",
    owner: "",
    document_url: document_url,
    local_document_url: local_doc_url,
    on_notice_to: on_notice,
    on_notice_to_raw: close_raw,
  })

  enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)

  # Diagnostic read-back of the enrichment columns; a failure here is only logged.
  tn = DB.client.escape(TABLE)
  sql = %Q{ SELECT address_std, lat, lng FROM `#{tn}` WHERE council_reference = ? AND address = ? LIMIT 1 }
  begin
    row = DB.client.prepare(sql).execute(council_reference, address).first
    puts " enriched -> #{row ? row.inspect : 'nil'}"
  rescue StandardError => e
    Log.warn "scraper", " enriched probe failed: #{e.class} #{e.message}"
  end

  puts "Upserted #{council_reference} -> #{address}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."