# Kentish Council — Advertised / Planning Applications (site page, not PlanBuild)
- require "nokogiri"
- require "uri"
- require "cgi"
- require_relative "../lib/enrich"
- require_relative "../lib/log"
# Destination table name, taken from the TABLE_NAME environment variable
# (run_all.sh -> da_kentish). ENV.fetch raises KeyError when it is unset.
TABLE = ENV.fetch("TABLE_NAME")

# Kentish Council page that lists the advertised planning applications.
URL = "https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp"

# NOTE(review): presumably creates the table when it does not exist yet —
# confirm against the DB helper, which is defined outside this file.
DB.ensure_table!(TABLE)
# Resolve +href+ against +base+ and return the absolute URL as a String.
#
# @param base [String] URL of the page the link was found on
# @param href [String, nil] raw value of the link's href attribute
# @return [String] the absolute URL; "" when +href+ is nil/blank; the
#   stripped +href+ itself when it cannot be resolved as a URI
def abs_url(base, href)
  link = href.to_s.strip
  return "" if link.empty?

  # File links frequently contain raw whitespace ("docs/DA 1.pdf"), which
  # URI.join rejects; percent-encode it so the link still resolves.
  URI.join(base, link.gsub(/\s/, "%20")).to_s
rescue URI::Error, ArgumentError
  # Best effort: keep what the page gave us instead of dropping the link.
  link
end
# Council reference formats recognised in link/context text, e.g.
#   DA 2025/00123
#   DA2025/00123 or DA2025-0123
#   Application No. DA 2025/123
#   DA 114-2025 (number first, year last)
REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i # DA 2025/0123
REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i # DA2025-0123 or DA2025/0123
REF_RX3 = %r{\bDA\s*([0-9]{1,4})\s*-\s*(20\d{2})\b}i # DA 114-2025

# Find a council reference in +str+ and normalise it to "DA <year> / <id>".
#
# The input is URL-decoded first, so percent-encoded text such as
# "DA%202025%2F45" is recognised as well.
#
# @param str [String, nil] free text that may contain a reference
# @return [String, nil] the normalised reference, or nil when none is found
def extract_ref(str)
  # Decoding stray percent sequences can yield invalid UTF-8, and matching a
  # regexp against a broken string raises; scrub replaces the bad bytes.
  s = CGI.unescape(str.to_s).scrub

  if (m = s.match(REF_RX1))
    # REF_RX1 allows ".", "-" and "_" inside the id, so sentence punctuation
    # ("... DA 2025/123.") would otherwise end up in the reference.
    id = m[2].gsub(/\A[.\-_]+|[.\-_]+\z/, "")
    return "DA #{m[1]} / #{id}" unless id.empty?
  end

  if (m = s.match(REF_RX2))
    return "DA #{m[1]} / #{m[2]}"
  end

  # REF_RX3 captures the number first and the year second; swap them so the
  # output has the same shape as the other formats.
  if (m = s.match(REF_RX3))
    return "DA #{m[2]} / #{m[1]}"
  end

  nil
end
# Month names, full or abbreviated (Jan, Sept, December ...), any letter case.
MONTH_NAME_RX = /jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?/i

# A date written as 14/03/2025, 14 March 2025 or March 14, 2025; capture
# group 1 holds the whole date. The word in the textual forms must be a month
# name, so "12 Sheffield 7306" (number, suburb, postcode) is not a date.
DATE_RX = %r{
  (\b\d{1,2}/\d{1,2}/\d{2,4}\b|
  \b\d{1,2}\s+#{MONTH_NAME_RX}\s+\d{4}\b|
  \b#{MONTH_NAME_RX}\s+\d{1,2},?\s+\d{4}\b)
}x

# Labels that introduce the closing date; capture group 1 is the text that
# follows the label. "es" is listed before "e" so that "Closes:" is consumed
# as a whole word instead of stopping at "Close".
NOTICE_LABEL_RXS = [
  /\bon\s*notice\s*(?:until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i,
  /\bclos(?:es|ing|e)\s*(?:on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
].freeze

# Pull the raw "on notice until" / closing date string out of +text+.
#
# Lookup order: a date following "on notice until/to", then a date following
# "close/closes/closing", then the first date found anywhere in the text.
#
# @param text [String, nil] free text from the link and its context
# @return [String] the date exactly as written, or "" when none is found
def extract_on_notice_raw(text)
  s = text.to_s.gsub(/\s+/, " ")

  NOTICE_LABEL_RXS.each do |label_rx|
    tail = s[label_rx, 1]
    date = tail && tail[DATE_RX, 1]
    return date if date
  end

  # No labelled date: fall back to the first date anywhere in the text.
  s[DATE_RX, 1].to_s
end
# Text of +node+ as one trimmed line with every whitespace run collapsed to
# a single space. Returns "" when +node+ is nil or false.
def first_meaningful_text(node)
  node ? node.text.to_s.strip.gsub(/\s+/, " ") : ""
end

# Context text for anchor +a+: the text of the first node returned by
# a.ancestors("li, p, div, tr"), or of a.parent when that list is empty.
def nearest_context_text(a)
  container = a.ancestors("li, p, div, tr").first
  container ||= a.parent
  first_meaningful_text(container)
end
# Turn the links of a listing page into application rows.
#
# A link is considered when its href ends in ".pdf" (any letter case) or its
# text mentions application / permit / advertis*. For each such link the
# reference, on-notice date and proposal are parsed out of the link text
# joined with its surrounding context text. Links without a reference or
# without an address are dropped.
#
# @param doc [#css] parsed page; doc.css("a") must yield nodes that answer
#   to ["href"] and text
# @param base_url [String] page URL, passed to abs_url to resolve hrefs
# @return [Array<Hash>] rows with the keys :council_reference, :address,
#   :description, :date_received, :date_received_raw and :document_url
def parse_document_list(doc, base_url)
  keyword_rx = /application|permit|advertis/i
  candidates = doc.css("a").select do |link|
    link["href"].to_s.downcase.end_with?(".pdf") || link.text.to_s.strip.match?(keyword_rx)
  end

  candidates.each_with_object([]) do |link, rows|
    document_url = abs_url(base_url, link["href"].to_s)
    context = nearest_context_text(link)
    label = link.text.to_s.strip
    haystack = [label, context].uniq.join(" — ")

    reference = extract_ref(haystack)
    # The link text doubles as the address unless it is too short to be one
    # (e.g. "View"); then the first 140 characters of the context are used.
    address = label.length > 6 ? label : context[0, 140]
    notice_raw = extract_on_notice_raw(haystack)
    notice_date = Util.parse_aus_date(notice_raw)
    proposal = haystack[/proposal\s*[:\-]\s*([^—\-]+)\b/i, 1]
    description = proposal ? proposal.strip : "Development Application"

    next if reference.nil? || address.to_s.strip.empty?

    # The :date_received keys carry the on-notice date parsed above.
    rows << {
      council_reference: reference,
      address: address.to_s.strip,
      description: description,
      date_received: notice_date,
      date_received_raw: notice_raw,
      document_url: document_url
    }
  end
end
# --- Script body: fetch the listing page, parse it, store every row ---

# A failed download is logged and ends the run with exit status 1.
html =
  begin
    Http.get(URL)
  rescue StandardError => e
    Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
    exit 1
  end

doc = Nokogiri::HTML(html)
items = parse_document_list(doc, URL)
puts "Found #{items.length} item(s) for #{TABLE}"

items.each do |r|
  reference = r[:council_reference]
  address = r[:address]

  # The on-notice date fills both the date_received* and on_notice_to*
  # columns; the page parser supplies only that one date per item.
  record = {
    description: r[:description],
    date_received: r[:date_received],
    date_received_raw: r[:date_received_raw],
    on_notice_to: r[:date_received],
    on_notice_to_raw: r[:date_received_raw],
    address: address,
    council_reference: reference,
    document_url: r[:document_url],
    applicant: "",
    owner: ""
  }
  DB.upsert(TABLE, record)

  # NOTE(review): enrichment helper is defined outside this file; it is
  # called once per stored row with the table, reference and address.
  enrich_after_upsert!(table: TABLE, council_reference: reference, address: address)

  puts "Upserted #{reference} -> #{address}"
end

puts "Done #{TABLE}."
|