kentish.rb 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. # Kentish Council — Advertised / Planning Applications (site page, not PlanBuild)
  2. require "nokogiri"
  3. require "uri"
  4. require "cgi"
  5. require_relative "../lib/http"
  6. require_relative "../lib/db"
  7. require_relative "../lib/util"
  8. require_relative "../lib/enrich"
  # Destination table name, taken from the TABLE_NAME environment variable
  # (run_all.sh sets it to da_kentish). ENV.fetch raises KeyError when the
  # variable is not set, so a misconfigured run stops here.
  TABLE = ENV.fetch("TABLE_NAME")

  # Kentish Council page that is scraped for advertised planning applications.
  # Keep this set to the exact listing page used for Kentish.
  URL = "https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp"

  # Prepare the destination table before any rows are written
  # (presumably creates it when missing; see lib/db to confirm).
  DB.ensure_table!(TABLE)
  # Resolve +href+ against +base+ and return an absolute URL string.
  #
  # Link targets scraped from HTML are often padded with whitespace or contain
  # literal spaces (e.g. "files/DA 2025 1.pdf"). URI.join rejects those, so the
  # href is stripped and its spaces are percent-encoded before joining.
  #
  # @param base [String] URL of the page the link was found on
  # @param href [String, nil] raw value of the link's href attribute
  # @return [String] the absolute URL; "" when +href+ is blank; the cleaned
  #   href itself when it cannot be resolved against +base+
  def abs_url(base, href)
    cleaned = href.to_s.strip
    return "" if cleaned.empty?

    cleaned = cleaned.gsub(" ", "%20")
    URI.join(base, cleaned).to_s
  rescue URI::Error, ArgumentError
    # Best effort: an unparsable link is still better than no link at all.
    cleaned
  end
  # Reference formats recognised on the Kentish page, for example:
  #   DA 2025/00123
  #   DA2025/00123
  #   Application No. DA 2025/123
  #   DA2025-0123
  #   DA 114-2025
  REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i # DA 2025/0123
  REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i # DA2025-0123 or DA2025/0123
  REF_RX3 = %r{\bDA\s*([0-9]{1,4})\s*-\s*(20\d{2})\b}i # DA 114-2025

  # Pull a development-application reference out of free text and normalise it
  # to the form "DA <year> / <number>".
  #
  # The text is URL-decoded first, so percent-encoded input also works. The
  # patterns are tried in order: REF_RX1, REF_RX2, then REF_RX3 (which lists
  # the number before the year).
  #
  # @param str [String, nil] link text, surrounding text or an href
  # @return [String, nil] the normalised reference, or nil when none is found
  def extract_ref(str)
    s = CGI.unescape(str.to_s)

    if (m = s.match(REF_RX1))
      # REF_RX1 allows ".", "-" and "_" inside the suffix, so a reference that
      # ends a sentence ("DA 2025/123.") would otherwise keep the punctuation.
      number = m[2].sub(/[.\-_]+\z/, "")
      return "DA #{m[1]} / #{number}" unless number.empty?
    end

    if (m = s.match(REF_RX2))
      return "DA #{m[1]} / #{m[2]}"
    end

    if (m = s.match(REF_RX3))
      return "DA #{m[2]} / #{m[1]}"
    end

    nil
  end
  # One date in any of three layouts: numeric d/m/y ("12/05/2025"),
  # day month year ("12 May 2025") or month day, year ("May 12, 2025").
  # Capture group 1 holds the whole date.
  DATE_RX = /
    (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
    \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
    \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
  /x

  # Find the raw "on notice until" date string in a piece of text.
  #
  # Whitespace runs are collapsed to single spaces, then three places are
  # searched in priority order: the words following "on notice until/to",
  # the words following "close/closing/closes [on]", and finally the whole
  # text. The first date found wins.
  #
  # @param text [String, nil] text surrounding an application link
  # @return [String] the matched date exactly as written, or "" when none
  def extract_on_notice_raw(text)
    flat = text.to_s.gsub(/\s+/, " ")

    search_areas = [
      flat[/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i, 2],
      flat[/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i, 2],
      flat
    ]

    dates = search_areas.compact.map { |area| area[DATE_RX, 1] }
    dates.compact.first || ""
  end
  # Return a node's text with the ends trimmed and every run of whitespace
  # collapsed to a single space.
  #
  # @param node [#text, nil] an object responding to +text+ (e.g. a parsed HTML node)
  # @return [String] the tidied text, or "" when +node+ is nil/false
  def first_meaningful_text(node)
    node ? node.text.to_s.strip.gsub(/\s+/, " ") : ""
  end
  # Text of the element that most closely surrounds a link.
  #
  # Takes the first entry returned by +ancestors+ for li / p / div / tr and
  # falls back to the link's parent when there is none.
  #
  # @param a [#ancestors, #parent] the link node
  # @return [String] the surrounding text as tidied by first_meaningful_text
  def nearest_context_text(a)
    enclosing = a.ancestors("li, p, div, tr").first
    first_meaningful_text(enclosing || a.parent)
  end
  # Build application rows from the links on the listing page.
  #
  # A link is a candidate when its text mentions application / permit /
  # advertis(e), or when its href ends in ".pdf". For each candidate the link
  # text and its surrounding text are combined and mined for a reference, an
  # on-notice date and a "Proposal: ..." description. Candidates without a
  # reference or without an address are dropped.
  #
  # @param doc [#css] parsed HTML document
  # @param base_url [String] URL the document was fetched from, used to
  #   resolve relative links
  # @return [Array<Hash>] rows with the keys :council_reference, :address,
  #   :description, :date_received, :date_received_raw and :document_url
  def parse_document_list(doc, base_url)
    candidates = doc.css("a").select do |link|
      target = link["href"].to_s
      link.text.to_s.strip.match?(/application|permit|advertis/i) || target.downcase.end_with?(".pdf")
    end

    candidates.each_with_object([]) do |link, found|
      document_url = abs_url(base_url, link["href"].to_s)
      context = nearest_context_text(link)
      label = link.text.to_s.strip
      # uniq avoids repeating the text when the link is the whole context.
      haystack = [label, context].uniq.join(" — ")

      reference = extract_ref(haystack)
      # Very short link text ("View", "PDF") is not useful as an address,
      # so fall back to the start of the surrounding text.
      address = label.length > 6 ? label : context[0, 140]
      notice_raw = extract_on_notice_raw(haystack)
      notice_date = Util.parse_aus_date(notice_raw)
      proposal = haystack.match(/proposal\s*[:\-]\s*([^—\-]+)\b/i)
      description = proposal ? proposal[1].strip : "Development Application"

      next if reference.nil? || address.to_s.strip.empty?

      found << {
        council_reference: reference,
        address: address.to_s.strip,
        description: description,
        date_received: notice_date,
        date_received_raw: notice_raw,
        document_url: document_url
      }
    end
  end
  # --- Script body: fetch the page, parse it, store every row ---------------

  # Fetch the listing page. Without it there is nothing to scrape, so report
  # the failure and exit with a non-zero status.
  begin
    html = Http.get(URL)
  rescue StandardError => e
    warn "Failed to fetch #{URL}: #{e.class} #{e.message}"
    exit 1
  end

  doc = Nokogiri::HTML(html)
  items = parse_document_list(doc, URL)
  puts "Found #{items.length} item(s) for #{TABLE}"

  items.each do |r|
    # Core record. The page does not expose applicant or owner, so those are
    # stored as empty strings.
    DB.upsert(TABLE, {
      description: r[:description],
      date_received: r[:date_received],
      date_received_raw: r[:date_received_raw],
      address: r[:address],
      council_reference: r[:council_reference],
      applicant: "",
      owner: ""
    })

    # The identifying values must come from the current row; bare
    # `council_reference` / `address` are not defined in this scope and
    # raised NameError on the first item.
    enrich_after_upsert!(
      table: TABLE,
      council_reference: r[:council_reference],
      address: r[:address]
    )

    # Extra columns are best effort: a failure here (for example a table
    # without these columns) is reported and the run continues.
    begin
      upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ? WHERE council_reference = ? AND address = ?")
      upd.execute(r[:document_url], r[:date_received], r[:date_received_raw], r[:council_reference], r[:address])
    rescue StandardError => e
      warn "Extras update skipped for #{r[:council_reference]}: #{e.class} #{e.message}"
    end

    puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
  end

  puts "Done #{TABLE}."