# kentish.rb
#
# Kentish Council — Advertised / Planning Applications (site page, not PlanBuild)
  2. require "nokogiri"
  3. require "uri"
  4. require "cgi"
  5. require_relative "../lib/enrich"
  6. require_relative "../lib/log"
  7. require_relative "../lib/util"
  8. TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_kentish
  9. # Set this to the exact page you use for Kentish (from your original file)
  10. URL = "https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp"
  11. DB.ensure_table!(TABLE)
  12. def abs_url(base, href)
  13. h = href.to_s.strip
  14. return nil if h.empty?
  15. return h if h.start_with?("http://", "https://")
  16. URI.join(base, h).to_s
  17. rescue URI::InvalidURIError
  18. h
  19. end
  20. # Kentish uses K-DA{number}/{year} format, e.g. K-DA016/2026
  21. REF_RX = /\bK-DA\d+\/20\d{2}\b/i
  22. def parse_items(doc, base_url)
  23. rows = []
  24. # Each DA is a <li class="generic-list__item"> with a PDF link in the title
  25. # Link text: "K-DA016/2026 41 George Road, Nook - proposed 2 Lot Subdivision (submissions by 21/04/2026)"
  26. doc.css("li.generic-list__item").each do |li|
  27. link = li.at_css("h3.generic-list__title a, a[href$='.pdf']")
  28. next unless link
  29. raw_text = link.text.gsub(/\(PDF File[^)]*\)/i, "").gsub(/\s+/, " ").strip
  30. pdf_href = link["href"].to_s
  31. ref_match = raw_text.match(REF_RX)
  32. next unless ref_match
  33. ref = ref_match[0]
  34. rest = raw_text.sub(ref, "").strip
  35. # Extract on-notice date: "(submissions by 21/04/2026)"
  36. on_raw = rest[/\(submissions\s+by\s+([^)]+)\)/i, 1]&.strip || ""
  37. on_dt = Util.parse_aus_date(on_raw)
  38. # Strip the on-notice clause and split "address - description"
  39. body = rest.sub(/\s*\(submissions\s+by\s+[^)]+\)/i, "").strip
  40. if (m = body.match(/\A(.+?)\s+-\s+(.+)\z/))
  41. address = m[1].strip
  42. description = m[2].strip
  43. else
  44. address = body
  45. description = "Development Application"
  46. end
  47. next if address.empty?
  48. rows << {
  49. council_reference: ref,
  50. address: address[0, 255],
  51. description: description,
  52. on_notice_to: on_dt,
  53. on_notice_to_raw: on_raw,
  54. document_url: abs_url(base_url, pdf_href)
  55. }
  56. end
  57. rows
  58. end
  59. begin
  60. html = Http.get(URL)
  61. rescue StandardError => e
  62. Log.warn "kentish", "Failed to fetch #{URL}: #{e.class} #{e.message}"
  63. exit 1
  64. end
  65. # Kentish Council's site is protected by Cloudflare JS challenge.
  66. # When blocked, the page title is "Just a moment..." and contains no DA data.
  67. # Note: Kentish DAs are also published on PlanBuild (council code KEN),
  68. # so planbuild.rb covers this council independently.
  69. if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
  70. Log.warn "kentish", "Site is returning a Cloudflare challenge page — cannot scrape without browser-level JS execution. DAs for this council are available via planbuild.rb (council code KEN)."
  71. puts "Done #{TABLE}. Saved 0 item(s) — site blocked by Cloudflare."
  72. exit 0
  73. end
  74. doc = Nokogiri::HTML(html)
  75. items = parse_items(doc, URL)
  76. puts "Found #{items.length} item(s) for #{TABLE}"
  77. saved = 0
  78. items.each do |r|
  79. begin
  80. DB.upsert(TABLE, {
  81. description: r[:description],
  82. on_notice_to: r[:on_notice_to],
  83. on_notice_to_raw: r[:on_notice_to_raw],
  84. address: r[:address],
  85. council_reference: r[:council_reference],
  86. document_url: r[:document_url],
  87. applicant: "",
  88. owner: ""
  89. })
  90. enrich_after_upsert!(
  91. table: TABLE,
  92. council_reference: r[:council_reference],
  93. address: r[:address]
  94. )
  95. Log.info "kentish", "Upserted #{r[:council_reference]} -> #{r[:address]}"
  96. saved += 1
  97. rescue StandardError => e
  98. Log.warn "kentish", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
  99. end
  100. end
  101. puts "Done #{TABLE}. Saved #{saved} item(s)."