# Waratah–Wynyard Council — Advertised / Planning Applications
#
# Fetches the advertised-permits listing page, builds one row per application
# from (a) PDF links on the page, (b) same-host pages linked from it and
# (c) a generic pass over the listing's anchors, then upserts the rows into
# TABLE. The on-notice / closing date is carried in the :date_received and
# :date_received_raw keys and is also written to on_notice_to columns.
require "nokogiri"
require "uri"
require "cgi"
require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"

TABLE = ENV.fetch("TABLE_NAME") # da_waratah_wynyard
URL = "https://www.warwyn.tas.gov.au/planning-and-development/advertised-permits/"

DB.ensure_table!(TABLE)

# Percent-encodes characters that URI.join rejects when they appear literally
# (whitespace, control and non-ASCII characters, and a few ASCII punctuation
# marks). "%" and "#" are left alone so existing escapes and fragments survive.
#
# @param str [String]
# @return [String]
def escape_unsafe_uri_chars(str)
  str.gsub(/[^\x21-\x7E]|["<>\[\]\\^`{|}]/) do |ch|
    ch.bytes.map { |b| format("%%%02X", b) }.join
  end
end

# Resolves +href+ against +base+.
#
# @param base [String] absolute URL of the page the link was found on
# @param href [String, nil] raw href attribute value
# @return [String] "" for a blank href; the absolute URL when it can be
#   resolved; otherwise the raw href unchanged
def abs_url(base, href)
  raw = href.to_s
  return "" if raw.strip.empty?

  begin
    URI.join(base, raw).to_s
  rescue URI::InvalidURIError
    # Hrefs such as "docs/DA 2025 12.pdf" are not valid URIs as written;
    # encode the offending characters and try once more before giving up.
    URI.join(base, escape_unsafe_uri_chars(raw.strip)).to_s
  end
rescue URI::InvalidURIError
  raw
end

# Host component of +url+, or nil when +url+ cannot be parsed.
#
# @param url [String]
# @return [String, nil]
def uri_host(url)
  URI.parse(url).host
rescue URI::InvalidURIError
  nil
end

# DA 2025/0123, DA2025-0123, DA 114-2025 etc.
REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i
REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i
REF_RX3 = %r{\bDA\s*([0-9]{1,4})\s*-\s*(20\d{2})\b}i

# Finds a DA reference in +str+ (URL-decoded first) and normalises it to
# "DA <year> / <number>".
#
# @param str [String, nil]
# @return [String, nil] nil when none of the REF_RX patterns match
def extract_ref(str)
  s = CGI.unescape(str.to_s)
  if (m = s.match(REF_RX1)) || (m = s.match(REF_RX2))
    "DA #{m[1]} / #{m[2]}"
  elsif (m = s.match(REF_RX3))
    # This form is written number-first, so the groups are swapped.
    "DA #{m[2]} / #{m[1]}"
  end
end

# Returns the first date-looking substring of +str+: d/m/y, then
# "12 March 2025", then "March 12, 2025".
#
# @param str [String, nil]
# @return [String] the matched text, or "" when nothing matches
def extract_date_like(str)
  s = str.to_s
  s[/(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/, 1] ||
    s[/(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/, 1] ||
    s[/(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/, 1] ||
    ""
end

# Phrases that introduce a closing date, paired with the capture group that
# holds the text following the phrase. Tried in order.
ON_NOTICE_PATTERNS = [
  [/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i, 2],
  [/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i, 2],
  [/submissions?\s*close\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i, 1]
].freeze

# Extracts the raw (unparsed) on-notice / closing date from free text.
# Only the first occurrence of each phrase is examined; when no phrase yields
# a date, the first date-looking substring anywhere in the text is used.
#
# @param text [String, nil]
# @return [String] date text, or "" when no date is found
def extract_on_notice_raw(text)
  s = text.to_s.gsub(/\s+/, " ")
  ON_NOTICE_PATTERNS.each do |rx, group|
    m = s.match(rx)
    next unless m

    d = extract_date_like(m[group])
    return d unless d.empty?
  end
  extract_date_like(s)
end

# Whitespace-collapsed text of the nearest li/p/div/tr/article ancestor of
# anchor +a+, falling back to its parent node.
#
# @param a [Nokogiri::XML::Node]
# @return [String] "" when the anchor has no such container
def nearest_context_text(a)
  host = a.ancestors("li, p, div, tr, article").first || a.parent
  host ? host.text.to_s.strip.gsub(/\s+/, " ") : ""
end

# Builds rows from anchors on a listing page. An anchor is considered when its
# text mentions application/permit/planning/advertis* or its href ends in .pdf.
# Anchors with no recognisable DA reference (in the text or in the linked
# file's name) are dropped.
#
# @param doc [Nokogiri::HTML::Document]
# @param base_url [String] URL the document was fetched from
# @return [Array<Hash>] rows, unique by [council_reference, address]
def parse_list_items(doc, base_url)
  rows = []
  candidates = doc.css("a").select do |a|
    a.text.to_s.strip.match?(/application|permit|planning|advertis/i) ||
      a["href"].to_s.downcase.end_with?(".pdf")
  end

  candidates.each do |a|
    href = a["href"].to_s
    link_text = a.text.to_s.strip
    document_url = abs_url(base_url, href)
    ctx = nearest_context_text(a)
    title_reference = link_text.empty? ? ctx[0, 200] : link_text
    text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")

    # Address guess: the link text when it is long enough, else the start of
    # the surrounding text.
    address = link_text.length >= 6 ? link_text : ctx[0, 140]

    ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url))
    on_raw = extract_on_notice_raw(text_for_parse)
    on_dt = Util.parse_aus_date(on_raw)

    proposal = text_for_parse.match(/proposal\s*[:\-]\s*([^—\-]+)\b/i)
    description = proposal ? proposal[1].strip : "Development Application"

    next if ref.nil? || address.to_s.strip.empty?

    rows << {
      council_reference: ref,
      address: address.to_s.strip,
      description: description,
      date_received: on_dt,
      date_received_raw: on_raw,
      document_url: document_url,
      title_reference: title_reference
    }
  end

  rows.uniq { |r| [r[:council_reference], r[:address]] }
end

# Fetches +url+ and tries to read a single application from it: first from
# two-column table rows (label / value), then from the page text and heading.
#
# @param url [String]
# @return [Hash, nil] a row, or nil when no reference or address is found
# @raise whatever Http.get raises; the caller rescues StandardError
def parse_detail_page(url)
  html = Http.get(url)
  doc = Nokogiri::HTML(html)

  pdf_href = doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")&.[]("href")
  document_url = pdf_href ? abs_url(url, pdf_href) : ""
  heading = doc.at_css("h1, .entry-title")&.text&.strip.to_s

  # Try simple two-column tables first
  kv = {}
  doc.css("table tr").each do |tr|
    cells = tr.css("th, td")
    next unless cells.length >= 2

    key = cells[0].text.strip
    val = cells[1].text.strip
    kv[key] = val unless key.empty?
  end

  if kv.any?
    # Value of the first row whose label matches rx, or "" when none does.
    find = ->(rx) { kv.find { |k, _| k =~ rx }&.last.to_s.strip }

    council_reference = find.call(/(Application\s*(No|Number|ID)|Reference)/i)
    address = find.call(/(Address|Location|Property)/i)
    description = find.call(/(Proposal|Description)/i)
    on_notice_raw = find.call(/(On\s*Notice\s*(until|to)|Closing\s*Date|Closes|Submissions)/i)

    unless council_reference.empty? || address.empty?
      return {
        council_reference: council_reference,
        address: address,
        description: description.empty? ? "Development Application" : description,
        date_received_raw: on_notice_raw,
        date_received: Util.parse_aus_date(on_notice_raw),
        document_url: document_url,
        title_reference: heading
      }
    end
  end

  # Fallback: parse from page text
  page_text = doc.text.to_s.strip.gsub(/\s+/, " ")
  ref = extract_ref(page_text)
  on_raw = extract_on_notice_raw(page_text)
  on_dt = Util.parse_aus_date(on_raw)
  address = heading.empty? ? page_text[0, 140] : heading

  return nil if ref.nil? || address.empty?

  {
    council_reference: ref,
    address: address,
    description: "Development Application",
    date_received_raw: on_raw,
    date_received: on_dt,
    document_url: document_url,
    title_reference: heading
  }
end

begin
  html = URL.include?("/eservice/") ? Http.dorset_session_get(URL) : Http.get(URL)
rescue StandardError => e
  Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
  exit 1
end

doc = Nokogiri::HTML(html)
host = uri_host(URL)

# Collect the links worth following: PDFs anywhere, plus pages on the
# council's own host. The first anchor seen for each raw href is remembered so
# the PDF branch below can find it without building a CSS selector from the
# href (an href containing a quote would make that selector unparsable).
anchor_by_href = {}
link_urls = []
doc.css("a").each do |a|
  href = a["href"].to_s
  anchor_by_href[href] ||= a

  # Fragment-only links must be rejected before resolution: once joined to URL
  # they look like a same-host page and would re-fetch this listing.
  trimmed = href.strip
  next if trimmed.empty? || trimmed.start_with?("#")

  u = abs_url(URL, href)
  next if u.empty?

  link_urls << u if u.downcase.end_with?(".pdf") || (host && uri_host(u) == host)
end
link_urls.uniq!

rows = []
link_urls.each do |u|
  if u.downcase.end_with?(".pdf")
    # Only anchors whose raw href is exactly this absolute URL are handled
    # here; relative PDF links are picked up by parse_list_items below.
    a = anchor_by_href[u]
    next unless a

    ctx_text = nearest_context_text(a)
    title = a.text.to_s.strip
    combined = [title, ctx_text].join(" — ")
    ref = extract_ref(combined)
    addr = title.length >= 6 ? title : ctx_text[0, 140]
    on_raw = extract_on_notice_raw(combined)
    on_dt = Util.parse_aus_date(on_raw)
    next if ref.nil? || addr.to_s.strip.empty?

    rows << {
      council_reference: ref,
      address: addr,
      description: "Development Application",
      date_received: on_dt,
      date_received_raw: on_raw,
      document_url: u,
      title_reference: title.empty? ? ctx_text[0, 200] : title
    }
  else
    begin
      item = parse_detail_page(u)
      rows << item if item
    rescue StandardError => e
      # Best effort: one unreachable or malformed page must not stop the run.
      Log.warn "scraper", "Skip detail #{u}: #{e.class} #{e.message}"
    end
  end
end

# Safety net: scrape items from the main page content too
rows += parse_list_items(doc, URL)
rows.uniq! { |r| [r[:council_reference], r[:address]] }

puts "Found #{rows.length} item(s) for #{TABLE}"

rows.each do |r|
  cr = r[:council_reference].to_s
  addr = r[:address].to_s

  # Drop rows whose "address" is empty, is generic link/heading text, or is
  # just the reference repeated.
  next if addr.strip.empty?
  next if addr =~ /\A(?:download|advertised planning applications)\z/i
  next if cr.strip.empty?
  next if addr == cr

  DB.upsert(TABLE, {
    description: r[:description],
    date_received: r[:date_received],
    date_received_raw: r[:date_received_raw],
    address: addr,
    council_reference: cr,
    applicant: "",
    owner: ""
  })

  # Defined outside this file (presumably ../lib/enrich).
  enrich_after_upsert!(
    table: TABLE,
    council_reference: cr,
    address: addr
  )

  # Extra columns are written separately and are best effort: a failure here
  # is logged and the row from the upsert above is kept.
  begin
    upd = DB.client.prepare(
      "UPDATE `#{DB.client.escape(TABLE)}` " \
      "SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? " \
      "WHERE council_reference = ? AND address = ?"
    )
    upd.execute(r[:document_url], r[:date_received], r[:date_received_raw], r[:title_reference], cr, addr)
  rescue StandardError => e
    Log.warn "scraper", "Extras update skipped for #{cr}: #{e.class} #{e.message}"
  end

  puts "Upserted #{cr} -> #{addr}"
end

puts "Done #{TABLE}."