# Central Highlands Council — Development Applications (site page)
#
# Fetches the council's development-applications page, finds the links to
# advertised application documents, pulls the proposal / location / date text
# from around each link and upserts one row per application into TABLE.
require "nokogiri"
require "cgi"
require_relative "../lib/http"
require_relative "../lib/util"
require_relative "../lib/scraper_helpers"

TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_centralhighlands
URL = "https://centralhighlands.tas.gov.au/development-applications/"

DB.ensure_table!(TABLE)

# Slash-separated reference: DA 2025/42, DA2025/00042, etc.
REF_RX = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
# Compact reference with an optional "-" or "/" separator and at least three
# digits: DA2025-00042, DA2025042, etc.
REF_COMPACT_RX = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i

# Finds a DA reference in +str+ and returns it in the normalised form
# "DA <year> / <number>".
#
# The input is URL-decoded first, so percent-encoded file names can be passed
# in directly. The slash-separated form (REF_RX) is tried before the compact
# form (REF_COMPACT_RX).
#
# @param str [#to_s] free text or a file name; nil is treated as ""
# @return [String, nil] the normalised reference, or nil when none is found
def extract_ref(str)
  decoded = CGI.unescape(str.to_s)
  m = decoded.match(REF_RX) || decoded.match(REF_COMPACT_RX)
  m && "DA #{m[1]} / #{m[2]}"
end

# Returns the value following a "Location:" label.
#
# The value ends at the first "/", at a run of two or more whitespace
# characters, or at the end of the line, whichever comes first. The whitespace
# and end-of-line terminators only work if +text+ has NOT had its whitespace
# collapsed beforehand.
#
# @param text [String] text surrounding the document link
# @return [String] the stripped location, or "" when there is no label
def extract_addr(text)
  m = text.match(/Location:\s*(.+?)(?:\s*\/|\s{2,}|$)/i)
  m ? m[1].strip : ""
end

# Returns the value following a "Proposal:" label.
#
# The value ends at a run of two or more whitespace characters or at the end
# of the line. As with extract_addr, pass text whose whitespace has not been
# collapsed so that these terminators can match.
#
# @param text [String] text surrounding the document link
# @return [String] the stripped proposal, or "" when there is no label
def extract_proposal(text)
  m = text.match(/Proposal:\s*(.+?)(?:\s{2,}|$)/i)
  m ? m[1].strip : ""
end

# Returns the first date-looking token in +text+, unparsed.
#
# Whitespace is collapsed first so a date wrapped over several lines still
# matches. Patterns are tried in priority order:
#   1. "… until 20 August 2025"  (the date after the word "until")
#   2. a numeric d/m/y token
#   3. any "20 August 2025"-style token
#
# @param text [String] text surrounding the document link
# @return [String] the matched date text, or "" when nothing matches
def extract_close_raw(text)
  s = text.gsub(/\s+/, " ")

  s[/\buntil\s+([0-9]{1,2}\s+[A-Za-z]{3,}\s+[0-9]{4})\b/i, 1] ||
    s[/(\b[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4}\b)/, 1] ||
    s[/(\b[0-9]{1,2}\s+[A-Za-z]{3,}\s+[0-9]{4}\b)/, 1] ||
    ""
end

# Central Highlands Council's site has been unreachable (connection timeout).
# DAs for this council are also published on PlanBuild (council code CEH),
# so planbuild.rb covers this council independently.
# A fetch failure is therefore logged and treated as a clean zero-item run.
html = begin
  Http.get(URL)
rescue StandardError => e
  Log.warn "centralhighlands", "Failed to fetch #{URL}: #{e.class} #{e.message}. DAs are available via planbuild.rb (council code CEH)."
  puts "Done #{TABLE}. Saved 0 item(s) — site unreachable."
  exit 0
end

# A Cloudflare interstitial contains no application data; stop early.
if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
  Log.warn "centralhighlands", "Site is returning a Cloudflare challenge page — cannot scrape without browser-level JS execution. DAs for this council are available via planbuild.rb (council code CEH)."
  puts "Done #{TABLE}. Saved 0 item(s) — site blocked by Cloudflare."
  exit 0
end

doc = Nokogiri::HTML(html)
container = doc.at_css("main, .entry-content, article") || doc

# Grab anchors that look like the advertised docs
links = container.css("a").select do |a|
  a.text =~ /click here to view application/i || a["href"].to_s.downcase.end_with?(".pdf")
end
puts "Found #{links.length} candidate link(s) for #{TABLE}"

saved = 0
links.each do |a|
  pdf = abs_url(URL, a["href"])

  # Walk up to a nearby block and use its text to find fields
  host = a.ancestors("p, li, div").first || a.parent
  raw = host ? host.text.strip : ""
  text = raw.gsub(/\s+/, " ")

  # The labelled fields are read from the un-collapsed text: their patterns
  # stop at double whitespace or end of line, and neither exists once every
  # whitespace run has been squeezed to a single space. Only the captured
  # values are normalised afterwards.
  address = extract_addr(raw).gsub(/\s+/, " ")
  description = extract_proposal(raw).gsub(/\s+/, " ")
  on_notice_raw = extract_close_raw(text)
  on_notice = Util.parse_aus_date(on_notice_raw)

  # Reference: proposal first, then file name, then surrounding text.
  # pdf.to_s keeps File.basename from raising if no URL could be built.
  ref = extract_ref(description) ||
        extract_ref(File.basename(pdf.to_s)) ||
        extract_ref(text)

  # If we still have no address, fall back to a slice of the text
  address = text[0, 140] if address.empty?
  next if address.empty? || ref.nil?

  upsert_and_enrich!(
    table: TABLE,
    row: {
      description: description.empty? ? "Development Application" : description,
      date_received: on_notice,
      date_received_raw: on_notice_raw,
      address: address,
      council_reference: ref,
      applicant: "",
      owner: ""
    },
    extras: { document_url: pdf }
  )
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."