kingisland.rb 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. # King Island Council — Advertised Development Applications
  2. #
  3. # Source: https://kingisland.tas.gov.au/develop/planning/
  4. #
  5. # The site returns HTTP 403 on direct requests but succeeds after a homepage
  6. # warmup using browser-like headers (same technique as burnie.rb).
  7. # Accept-Encoding: identity is used to avoid gzip decompression complexity.
  8. #
  9. # Page structure (WordPress accordion, id="accordion-1-c4"):
  10. # <h2>Advertised development applications</h2>
  11. # <p class="entry-title">...(preamble)...</p>
  12. # <p>Notice of Planning Application – DA 2025/28 15 Kurrajong Street,
  13. # Grassy, TAS 7256 – Visitor/workers' Accommodation.</p>
  14. # <p>...representations no later than 2 April 2026...</p>
  15. # <p><a href="https://kingisland.tas.gov.au/wp-content/uploads/DA-2025-28-...pdf">here</a></p>
  16. require "date"
  17. require "nokogiri"
  18. require "net/http"
  19. require "uri"
  20. require_relative "../lib/db"
  21. require_relative "../lib/enrich"
  22. require_relative "../lib/log"
  23. require_relative "../lib/util"
  24. TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_kingisland
  25. BASE_URL = "https://kingisland.tas.gov.au"
  26. URL = "#{BASE_URL}/develop/planning/"
  27. DB.ensure_table!(TABLE)
  28. # ----- Browser-like headers (WAF warmup technique from burnie.rb) -----
  29. UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " \
  30. "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
  31. BASE_HEADERS = {
  32. "User-Agent" => UA,
  33. "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  34. "Accept-Language" => "en-AU,en;q=0.8",
  35. "Accept-Encoding" => "identity",
  36. "Upgrade-Insecure-Requests" => "1",
  37. "Sec-Fetch-Dest" => "document",
  38. "Sec-Fetch-Mode" => "navigate",
  39. "Sec-Fetch-Site" => "none",
  40. "Sec-Fetch-User" => "?1",
  41. "sec-ch-ua" => '"Chromium";v="124", "Not.A/Brand";v="24", "Google Chrome";v="124"',
  42. "sec-ch-ua-platform" => '"Windows"',
  43. "sec-ch-ua-mobile" => "?0",
  44. "Connection" => "close",
  45. }.freeze
  46. class CookieJar
  47. def initialize; @h = {}; end
  48. def for(host)
  49. @h[host] || ""
  50. end
  51. def merge_from(resp, host)
  52. cookies = resp.get_fields("Set-Cookie") || []
  53. return if cookies.empty?
  54. existing = parse_header(@h[host])
  55. cookies.each do |sc|
  56. kv = sc.split(";", 2).first
  57. k, v = kv.split("=", 2)
  58. existing[k.to_s.strip] = v.to_s unless k.to_s.strip.empty?
  59. end
  60. @h[host] = existing.map { |k, v| "#{k}=#{v}" }.join("; ")
  61. end
  62. private
  63. def parse_header(s)
  64. s.to_s.split(";").map(&:strip).filter_map { |kv|
  65. k, v = kv.split("=", 2)
  66. [k, v] unless k.to_s.empty?
  67. }.to_h
  68. end
  69. end
  70. def http_get(url, jar:, referer: nil, fetch_site: "none")
  71. uri = URI(url)
  72. hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
  73. hdrs["Referer"] = referer if referer
  74. cookie = jar.for(uri.host)
  75. hdrs["Cookie"] = cookie unless cookie.empty?
  76. limit = 5
  77. code = 0
  78. body = ""
  79. while limit > 0
  80. limit -= 1
  81. redirect_to = nil
  82. req = Net::HTTP::Get.new(uri, hdrs)
  83. Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
  84. resp = http.request(req)
  85. jar.merge_from(resp, uri.host)
  86. code = resp.code.to_i
  87. if [301, 302, 303, 307, 308].include?(code) && resp["location"]
  88. redirect_to = URI.join(uri, resp["location"])
  89. else
  90. body = resp.body.to_s
  91. end
  92. end
  93. if redirect_to
  94. uri = redirect_to
  95. next
  96. end
  97. break
  98. end
  99. [code, body]
  100. rescue StandardError => e
  101. Log.warn "kingisland", "HTTP error for #{url}: #{e.class} #{e.message}"
  102. [0, ""]
  103. end
  104. # ----- Warmup: hit homepage first to get cookies, then fetch planning page -----
  105. jar = CookieJar.new
  106. Log.info "kingisland", "Warming up via homepage..."
  107. code0, _body0 = http_get("#{BASE_URL}/", jar: jar)
  108. Log.info "kingisland", "Homepage: #{code0}"
  109. sleep(0.5)
  110. Log.info "kingisland", "Fetching planning page..."
  111. code1, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
  112. Log.info "kingisland", "Planning page: #{code1} (#{html.bytesize} bytes)"
  113. if code1 != 200 || html.bytesize < 5_000
  114. Log.warn "kingisland", "Could not fetch planning page (status #{code1}). " \
  115. "King Island DAs are also available via planbuild.rb (council code KIS -> da_kingisland)."
  116. exit 0
  117. end
  118. if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
  119. Log.warn "kingisland", "Cloudflare challenge returned. " \
  120. "King Island DAs are also available via planbuild.rb (council code KIS -> da_kingisland)."
  121. exit 0
  122. end
  123. # ----- Parse -----
  124. # Ref format: DA 2025/28 (year/sequential)
  125. REF_RX = /\bDA\s*\d{4}\/\d{1,4}\b/i
  126. doc = Nokogiri::HTML(html)
  127. # The advertised applications are inside div#accordion-1-c4.
  128. # If the div id ever changes, fall back to finding the h2 by text.
  129. section = doc.at_css("div#accordion-1-c4") ||
  130. doc.xpath('//h2[contains(translate(., "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "advertised development")]')&.parent
  131. unless section
  132. Log.warn "kingisland", "Could not find 'Advertised development applications' section on page."
  133. puts "Done #{TABLE}. Saved 0 item(s)."
  134. exit 0
  135. end
  136. paragraphs = section.css("p").to_a
  137. saved = 0
  138. paragraphs.each_with_index do |para, idx|
  139. text = para.text.gsub(/[[:space:]]+/, " ").strip
  140. next unless (m = text.match(REF_RX))
  141. ref = m[0].strip.gsub(/\s+/, " ")
  142. # Strip any "Notice of Planning Application" prefix and the ref itself,
  143. # leaving "ADDRESS – DESCRIPTION."
  144. rest = text
  145. .sub(/Notice\s+of\s+Planning\s+Application\s*[-\u2013\u2014]?\s*/i, "")
  146. .sub(ref, "")
  147. .gsub(/\A[\s\-\u2013\u2014]+/, "")
  148. .gsub(/[.\s]+\z/, "")
  149. # Split at last " – " (en-dash) or " - " to separate address from description
  150. if (split_idx = rest.rindex(/\s[\-\u2013\u2014]\s/))
  151. address = rest[0, split_idx].strip
  152. description = rest[(split_idx + 1)..]&.gsub(/\A[\s\-\u2013\u2014]+/, "")&.strip
  153. else
  154. address = rest.strip
  155. description = "Development Application"
  156. end
  157. next if address.empty?
  158. # Scan forward up to 5 paragraphs for closing date and PDF link
  159. on_notice_to_raw = ""
  160. on_notice_to = nil
  161. doc_url = nil
  162. (1..5).each do |offset|
  163. break if idx + offset >= paragraphs.length
  164. fwd = paragraphs[idx + offset]
  165. fwd_text = fwd.text.gsub(/[[:space:]]+/, " ").strip
  166. if on_notice_to_raw.empty? && fwd_text =~ /no\s+later\s+than|representations|closing/i
  167. if (dm = fwd_text.match(/\b(\d{1,2})\s+([A-Za-z]{3,})\s+(\d{4})\b/))
  168. on_notice_to_raw = "#{dm[1]} #{dm[2]} #{dm[3]}"
  169. on_notice_to = Util.parse_aus_date(on_notice_to_raw)
  170. end
  171. end
  172. if doc_url.nil?
  173. a = fwd.at_css("a[href]")
  174. if a && a["href"].to_s =~ /\.pdf/i
  175. doc_url = a["href"].strip
  176. end
  177. end
  178. end
  179. begin
  180. DB.upsert(TABLE, {
  181. council_reference: ref,
  182. address: address[0, 255],
  183. description: description.to_s,
  184. date_received: nil,
  185. date_received_raw: "",
  186. on_notice_to: on_notice_to,
  187. on_notice_to_raw: on_notice_to_raw,
  188. document_url: doc_url,
  189. applicant: "",
  190. owner: ""
  191. })
  192. enrich_after_upsert!(
  193. table: TABLE,
  194. council_reference: ref,
  195. address: address
  196. )
  197. Log.info "kingisland", "Upserted #{ref} -> #{address}"
  198. saved += 1
  199. rescue StandardError => e
  200. Log.warn "kingisland", "DB error for #{ref}: #{e.class} #{e.message}"
  201. end
  202. end
  203. puts "Done #{TABLE}. Saved #{saved} item(s)."