| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- # Kentish Council — Advertised / Planning Applications (site page, not PlanBuild)
- require "nokogiri"
- require "uri"
- require "cgi"
- require_relative "../lib/enrich"
- require_relative "../lib/log"
- require_relative "../lib/util"
# Destination table name, read from the TABLE_NAME environment variable
# (run_all.sh -> da_kentish). ENV.fetch raises KeyError when it is unset, so a
# misconfigured run stops here instead of writing to an unintended table.
TABLE = ENV.fetch("TABLE_NAME")

# Kentish Council site page that lists the advertised planning applications.
# Frozen so the shared constant cannot be mutated by accident.
URL = "https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp".freeze

# NOTE(review): presumably creates the table when it does not exist yet;
# DB is defined outside this file - confirm against its implementation.
DB.ensure_table!(TABLE)
# Resolves +href+ against +base+ and returns an absolute URL string.
#
# base - String URL of the page the link was found on.
# href - raw href attribute value (may be nil, blank, relative or absolute).
#
# Returns nil for a blank href, the href itself when it is already an
# http(s) URL, and otherwise the joined absolute URL. Hrefs that URI.join
# rejects (typically file names containing spaces or non-ASCII characters)
# are percent-encoded and joined again; only when that also fails is the
# stripped href returned unchanged.
def abs_url(base, href)
  h = href.to_s.strip
  return nil if h.empty?
  return h if h.start_with?("http://", "https://")

  begin
    URI.join(base, h).to_s
  rescue URI::InvalidURIError
    # Percent-encode every byte that is not a plain URI character. "%" is
    # left alone so an already-encoded href is not encoded twice.
    escaped = h.gsub(%r{[^A-Za-z0-9\-._~:/?@!$&'()*+,;=%#]}) do |ch|
      ch.bytes.map { |b| format("%%%02X", b) }.join
    end
    begin
      URI.join(base, escaped).to_s
    rescue URI::InvalidURIError
      # Still unusable (for example the base itself is invalid): keep the
      # original best-effort behaviour and hand back the raw href.
      h
    end
  end
end
# Kentish uses K-DA{number}/{year} format, e.g. K-DA016/2026
REF_RX = %r{\bK-DA\d+/20\d{2}\b}i

# Splits the text of one application link into its fields.
#
# Expected shape of the text:
#   "K-DA016/2026 41 George Road, Nook - proposed 2 Lot Subdivision (submissions by 21/04/2026)"
#
# text - link text as extracted from the page.
#
# Returns a Hash with :council_reference, :address, :description,
# :on_notice_to and :on_notice_to_raw, or nil when the text carries no
# council reference or no address.
def parse_item_text(text)
  # [[:space:]] also covers non-breaking spaces (&nbsp;), which \s does not
  # match; without this the "address - description" split can be missed.
  clean = text.to_s.gsub(/\(PDF File[^)]*\)/i, "").gsub(/[[:space:]]+/, " ").strip

  ref = clean[REF_RX]
  return nil unless ref

  rest = clean.sub(ref, "").strip

  # On-notice date comes from the trailing "(submissions by 21/04/2026)" clause.
  on_raw = rest[/\(submissions\s+by\s+([^)]+)\)/i, 1].to_s.strip
  # NOTE(review): Util.parse_aus_date is defined elsewhere; presumably it
  # parses dd/mm/yyyy and tolerates an empty string - confirm.
  on_dt = Util.parse_aus_date(on_raw)

  # Drop the on-notice clause, then split "address - description" on the
  # first spaced dash. Hyphen, en dash and em dash are all accepted.
  body = rest.sub(/\s*\(submissions\s+by\s+[^)]+\)/i, "").strip
  parts = body.match(/\A(.+?)\s+[-\u2013\u2014]\s+(.+)\z/)
  if parts
    address = parts[1].strip
    description = parts[2].strip
  else
    address = body
    description = "Development Application"
  end
  return nil if address.empty?

  {
    council_reference: ref,
    address: address[0, 255], # capped at 255 characters
    description: description,
    on_notice_to: on_dt,
    on_notice_to_raw: on_raw
  }
end

# Extracts one row per advertised application from the parsed page.
#
# doc      - parsed HTML document (responds to #css).
# base_url - URL of the page, used to resolve relative document links.
#
# Each DA is a <li class="generic-list__item"> with a PDF link in the title.
# List items without a link, without a council reference or without an
# address are skipped.
#
# Returns an Array of Hashes with :council_reference, :address, :description,
# :on_notice_to, :on_notice_to_raw and :document_url.
def parse_items(doc, base_url)
  doc.css("li.generic-list__item").each_with_object([]) do |li, rows|
    link = li.at_css("h3.generic-list__title a, a[href$='.pdf']")
    next unless link

    fields = parse_item_text(link.text)
    next unless fields

    rows << fields.merge(document_url: abs_url(base_url, link["href"].to_s))
  end
end
# Fetch the listing page. Without the HTML there is nothing to parse, so a
# fetch failure is logged and the run ends with a non-zero exit status.
begin
  html = Http.get(URL)
rescue StandardError => e
  Log.warn "kentish", "Failed to fetch #{URL}: #{e.class} #{e.message}"
  exit 1
end

# Kentish Council's site is protected by a Cloudflare JS challenge.
# When blocked, the page title is "Just a moment..." and contains no DA data.
# Note: Kentish DAs are also published on PlanBuild (council code KEN),
# so planbuild.rb covers this council independently. That is why a blocked
# run exits 0 rather than being reported as a failure.
if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
  Log.warn "kentish", "Site is returning a Cloudflare challenge page — cannot scrape without browser-level JS execution. DAs for this council are available via planbuild.rb (council code KEN)."
  puts "Done #{TABLE}. Saved 0 item(s) — site blocked by Cloudflare."
  exit 0
end

doc = Nokogiri::HTML(html)
items = parse_items(doc, URL)
puts "Found #{items.length} item(s) for #{TABLE}"

saved = 0
items.each do |r|
  # Store the row first. A failure here means nothing was saved for this
  # application, so move on to the next one.
  begin
    DB.upsert(TABLE, {
      description: r[:description],
      on_notice_to: r[:on_notice_to],
      on_notice_to_raw: r[:on_notice_to_raw],
      address: r[:address],
      council_reference: r[:council_reference],
      document_url: r[:document_url],
      applicant: "",
      owner: ""
    })
  rescue StandardError => e
    Log.warn "kentish", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
    next
  end

  # Enrichment runs after the row is already stored, so its failure is
  # reported under its own label and does not remove the row from the
  # saved count.
  begin
    enrich_after_upsert!(
      table: TABLE,
      council_reference: r[:council_reference],
      address: r[:address]
    )
  rescue StandardError => e
    Log.warn "kentish", "Enrichment error for #{r[:council_reference]}: #{e.class} #{e.message}"
  end

  Log.info "kentish", "Upserted #{r[:council_reference]} -> #{r[:address]}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|