  # derwentvalley.rb
  # Derwent Valley Council — Development Applications being advertised
  #
  # Source: https://www.derwentvalley.tas.gov.au/home/latest-news?f.News+category...=Public+Notice
  #
  # The site is Cloudflare-protected — requires homepage warmup with browser-like
  # headers (same technique as burnie.rb / kingisland.rb).
  #
  # The news listing links go through lgasa-search.lga.sa.gov.au → squiz.cloud.
  # Rather than following that full redirect chain, we extract the `index_url`
  # parameter from each lgasa href, which points at lgasa-web.squiz.cloud/?a=ID.
  # A single (non-following) GET to that URL returns a Location header with the
  # real derwentvalley.tas.gov.au detail page URL.
  #
  # Detail page structure:
  #   <table><tbody>
  #     <tr><td>APP No.</td><td>SITE</td><td>PROPOSAL</td></tr>
  #     <tr><td>DA 2026/023</td><td>160 Wyre Forest Road, Molesworth</td>
  #         <td>Dwelling and outbuilding</td></tr>
  #   </tbody></table>
  #   <p>...received no later than 5.00pm on 15 April 2026...</p>
  #   <div class="content-container"><a href="...DA-2026-023.pdf">plans</a></div>
  22. require "date"
  23. require "nokogiri"
  24. require "net/http"
  25. require "uri"
  26. require_relative "../lib/db"
  27. require_relative "../lib/enrich"
  28. require_relative "../lib/log"
  29. require_relative "../lib/util"
  # Destination table name. run_all.sh sets TABLE_NAME from the script filename
  # (da_derwentvalley); ENV.fetch raises KeyError when the variable is unset.
  TABLE = ENV.fetch("TABLE_NAME")

  # Council site root, and the news listing filtered to the "Public Notice" category.
  BASE_URL = "https://www.derwentvalley.tas.gov.au"
  NEWS_URL = BASE_URL + "/home/latest-news?f.News+category%7CnewsCategory=Public+Notice"

  # Table setup is delegated to the shared DB helper before any scraping starts
  # (presumably creates the table when it is missing — see lib/db).
  DB.ensure_table!(TABLE)
  # ----- Browser-like headers (WAF/Cloudflare warmup) -----
  # Sent with every http_get request. Key order is kept exactly as written
  # because a Hash preserves insertion order. The hash is frozen; callers that
  # need per-request changes must work on a copy (e.g. via Hash#merge).
  BASE_HEADERS = {
    # Identity of the emulated browser.
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " \
                    "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",

    # Content negotiation. "identity" asks for an unencoded response body.
    "Accept"          => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language" => "en-AU,en;q=0.9",
    "Accept-Encoding" => "identity",

    # Top-level navigation metadata.
    "Upgrade-Insecure-Requests" => "1",
    "Sec-Fetch-Dest"            => "document",
    "Sec-Fetch-Mode"            => "navigate",
    "Sec-Fetch-Site"            => "none",
    "Sec-Fetch-User"            => "?1",

    # Client hints consistent with the User-Agent above.
    "sec-ch-ua"          => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
    "sec-ch-ua-mobile"   => "?0",
    "sec-ch-ua-platform" => '"Windows"',

    "Connection" => "close",
  }.freeze
  # Minimal in-memory cookie store keyed by host name.
  #
  # Only the leading name=value pair of each Set-Cookie header is kept; the
  # attributes after the first ";" are discarded. Cookies for a host are held
  # as a ready-to-send Cookie header string.
  class CookieJar
    def initialize
      @h = {}
    end

    # @param host [String] host name the request is going to
    # @return [String] Cookie header value for +host+, or "" when none is stored
    def for(host)
      @h[host].to_s
    end

    # Fold the Set-Cookie headers of +resp+ into the cookies stored for +host+.
    # A cookie whose name is already stored has its value overwritten.
    #
    # @param resp [#get_fields] response exposing get_fields("Set-Cookie")
    # @param host [String] host the response came from
    # @return [String, nil] the updated header string, or nil when the response
    #   carried no Set-Cookie header
    def merge_from(resp, host)
      set_cookie_lines = resp.get_fields("Set-Cookie") || []
      return if set_cookie_lines.empty?

      stored = parse_header(@h[host])
      set_cookie_lines.each do |line|
        # Everything after the first ";" is attribute data and is ignored.
        name, value = line.split(";", 2).first.split("=", 2)
        name = name.to_s.strip
        next if name.empty?

        stored[name] = value.to_s
      end
      @h[host] = stored.map { |name, value| "#{name}=#{value}" }.join("; ")
    end

    private

    # Turn a "a=1; b=2" header string (or nil) back into a name => value Hash.
    def parse_header(s)
      s.to_s.split(";").each_with_object({}) do |chunk, pairs|
        k, v = chunk.strip.split("=", 2)
        pairs[k] = v unless k.to_s.empty?
      end
    end
  end
  # GET +url+ with the browser-like BASE_HEADERS, replaying cookies from +jar+
  # and recording any cookies each response sets.
  #
  # @param url [String] absolute URL to request
  # @param jar [CookieJar] cookie store read before and updated after every hop
  # @param referer [String, nil] Referer header for the first request
  # @param fetch_site [String] Sec-Fetch-Site header for the first request
  # @param follow [Boolean] when true, 301/302/303/307/308 responses that carry
  #   a Location header are followed; at most 8 requests are made in total
  # @return [Array(String, String, Integer)] [final_url_string, body, http_code].
  #   Any StandardError is logged and reported as [url, "", 0].
  def http_get(url, jar:, referer: nil, fetch_site: "none", follow: true)
    target = URI(url)
    headers = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
    headers["Referer"] = referer if referer
    first_cookie = jar.for(target.host)
    headers["Cookie"] = first_cookie unless first_cookie.empty?

    redirect_statuses = [301, 302, 303, 307, 308]
    status = 0
    payload = ""

    8.times do
      next_hop = nil
      request = Net::HTTP::Get.new(target, headers)
      Net::HTTP.start(target.host, target.port, use_ssl: target.scheme == "https",
                                                read_timeout: 30, open_timeout: 15) do |http|
        response = http.request(request)
        jar.merge_from(response, target.host)
        status = response.code.to_i
        if follow && redirect_statuses.include?(status) && response["location"]
          # Location may be relative; resolve it against the URL just requested.
          next_hop = URI.join(target, response["location"])
        else
          payload = response.body.to_s
        end
      end
      break unless next_hop

      # Update Referer, Sec-Fetch-Site and Cookie for the next hop.
      target = next_hop
      headers["Referer"] = target.to_s
      headers["Sec-Fetch-Site"] = "same-origin"
      hop_cookie = jar.for(target.host)
      headers["Cookie"] = hop_cookie.empty? ? nil : hop_cookie
    end

    [target.to_s, payload, status]
  rescue StandardError => e
    Log.warn "derwentvalley", "HTTP error for #{url}: #{e.class} #{e.message}"
    [url, "", 0]
  end
  # Resolve an lgasa redirect href to the real derwentvalley.tas.gov.au URL.
  #
  # The href's query string carries an `index_url` parameter pointing at
  # squiz.cloud/?a=ID. One non-following GET is sent to that URL and the
  # Location header of the response is used as the detail page URL.
  #
  # @param lgasa_href [String] href taken from the news listing
  # @return [String, nil] absolute detail page URL; nil when the href has no
  #   index_url, the response has no Location header, or the request fails
  #   (failures are logged, not raised)
  def resolve_detail_url(lgasa_href)
    index_url = URI.decode_www_form(URI(lgasa_href).query.to_s).to_h["index_url"]
    return nil if index_url.to_s.empty?

    uri = URI(index_url)
    req = Net::HTTP::Get.new(uri, "User-Agent" => BASE_HEADERS["User-Agent"])
    location = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https",
                                                   open_timeout: 10, read_timeout: 10) do |http|
      http.request(req)["location"].to_s
    end
    return nil if location.empty?

    # A Location header may be a relative reference. Resolve it against the
    # request URL (the same way http_get does) so the caller always receives a
    # fetchable URL. Absolute http(s) values are returned untouched.
    location.match?(%r{\Ahttps?://}i) ? location : URI.join(uri, location).to_s
  rescue StandardError => e
    # index_url is still nil when the failure happened while parsing the href
    # itself, so fall back to the href to keep the log line informative.
    Log.warn "derwentvalley", "Could not resolve squiz redirect #{index_url || lgasa_href}: #{e.class} #{e.message}"
    nil
  end
  # Parse a detail page for DA data.
  #
  # Every table row whose first cell looks like "DA 2026/023" becomes one
  # record. The closing date, start date and PDF link are read once from the
  # page and shared by all rows.
  #
  # @param html [String] detail page HTML
  # @param page_url [String] URL the page was fetched from; used as the base
  #   for resolving relative PDF links
  # @return [Array<Hash>] records with :council_reference, :address,
  #   :description, :date_received, :date_received_raw, :on_notice_to,
  #   :on_notice_to_raw and :document_url; empty when no DA row is found
  def parse_detail(html, page_url)
    doc = Nokogiri::HTML(html)

    # Table: header row "APP No. | SITE | PROPOSAL", then data rows.
    data_rows = []
    doc.css("table").each do |tbl|
      tbl.css("tr").each do |tr|
        # Collapse any run of NBSP and/or whitespace into ONE space, so an NBSP
        # sitting next to a normal space no longer leaves a double space.
        cells = tr.css("td").map { |td| td.text.gsub(/[\u00a0\s]+/, " ").strip }
        next if cells.empty?
        next if cells.join.match?(/\AAPP\s*No\.?/i) # skip header row
        next unless cells[0].match?(/\bDA\s*\d{4}\/\d+/i)

        data_rows << cells
      end
    end
    return [] if data_rows.empty?

    # First match of ".content-container, main" in document order.
    body_text = doc.css(".content-container, main").first&.text.to_s.gsub(/[\u00a0\s]+/, " ")

    # Closing date from "received no later than ... DATE".
    on_notice_to_raw = ""
    on_notice_to = nil
    if (m = body_text.match(/no\s+later\s+than\b.{0,60}?(\d{1,2}\s+[A-Za-z]{3,}\s+\d{4})/i))
      on_notice_to_raw = m[1].strip
      # Util.parse_aus_date lives in lib/util; presumably returns a Date or nil.
      on_notice_to = Util.parse_aus_date(on_notice_to_raw)
    end

    # Commencing / start date -> date_received.
    date_received_raw = ""
    date_received = nil
    m = body_text.match(/commencing\s+on\s+(\d{1,2}\s+[A-Za-z]{3,}\s+\d{4})/i) ||
        body_text.match(/Start\s+Date\s+(\d{1,2}\/\d{1,2}\/\d{2,4})/i)
    if m
      date_received_raw = m[1].strip
      date_received = Util.parse_aus_date(date_received_raw)
    end

    # PDF link within the content area only. Relative hrefs are resolved
    # against page_url before the host check, so site-relative links to the
    # council's own PDFs are recognised and stored as absolute URLs.
    doc_url = nil
    doc.css(".content-container a[href]").each do |a|
      href = a["href"].to_s
      next unless href.match?(/\.pdf/i)

      link = absolute_href(href, page_url)
      next unless link.include?("derwentvalley")

      doc_url = link
      break
    end

    data_rows.filter_map do |cells|
      ref = cells[0].to_s.strip
      address = cells[1].to_s.strip
      description = cells[2].to_s.strip
      description = "Development Application" if description.empty?
      next if ref.empty? || address.empty?

      {
        council_reference: ref,
        address: address,
        description: description,
        date_received: date_received,
        date_received_raw: date_received_raw,
        on_notice_to: on_notice_to,
        on_notice_to_raw: on_notice_to_raw,
        document_url: doc_url
      }
    end
  end

  # Return +href+ as an absolute URL, resolving it against +base_url+ when it
  # has no scheme of its own. Falls back to +href+ unchanged when either value
  # cannot be handled by URI.
  def absolute_href(href, base_url)
    return href if href.match?(/\A[a-z][a-z0-9+.-]*:/i)

    URI.join(base_url.to_s, href).to_s
  rescue URI::Error
    href
  end
  # ----- Warmup then fetch news listing -----
  # The homepage is requested first so that cookies it sets are already in the
  # jar when the listing is requested. Both failure paths below log a warning,
  # print the usual "Done" summary with zero items and exit with status 0.
  jar = CookieJar.new
  home_url = "#{BASE_URL}/"
  planbuild_hint = "DAs for this council are also available via planbuild.rb (council code DER)."

  give_up = lambda do |reason|
    Log.warn "derwentvalley", "#{reason} #{planbuild_hint}"
    puts "Done #{TABLE}. Saved 0 item(s)."
    exit 0
  end

  Log.info "derwentvalley", "Warming up via homepage..."
  code0 = http_get(home_url, jar: jar).last
  Log.info "derwentvalley", "Homepage: #{code0}"
  sleep(0.5)

  Log.info "derwentvalley", "Fetching news listing..."
  _listing_url, html1, code1 = http_get(NEWS_URL, jar: jar, referer: home_url, fetch_site: "same-origin")
  Log.info "derwentvalley", "News listing: #{code1} (#{html1.bytesize} bytes)"

  # A non-200 status or a body under 5,000 bytes is treated as a failed fetch.
  listing_ok = code1 == 200 && html1.bytesize >= 5_000
  give_up.call("Could not fetch news listing (status #{code1}).") unless listing_ok

  # A 200 response can still be a Cloudflare interstitial; look for its text.
  challenge_markers = ["Just a moment", "Enable JavaScript and cookies"]
  give_up.call("Cloudflare challenge page returned.") if challenge_markers.any? { |marker| html1.include?(marker) }
  # ----- Extract detail page URLs from news listing -----
  # Only anchors that go through the lgasa-search redirector are candidates.
  # Hrefs are de-duplicated BEFORE resolution because every resolution costs
  # one HTTP request; repeated hrefs would resolve to the same URL anyway.
  list_doc = Nokogiri::HTML(html1)
  lgasa_hrefs = list_doc.css("li.news-listing__item a[href]")
                        .map { |a| a["href"].to_s }
                        .select { |href| href.include?("lgasa-search") }
                        .uniq

  detail_urls = lgasa_hrefs.filter_map do |href|
    detail_url = resolve_detail_url(href)
    detail_url unless detail_url.to_s.empty?
  end
  # Different hrefs may still resolve to the same detail page.
  detail_urls.uniq!
  Log.info "derwentvalley", "Found #{detail_urls.length} detail page(s)"
  # Fetch each detail page, parse its DA rows and upsert them one by one.
  # A failure on one record is logged and does not stop the remaining records.
  saved = 0
  detail_urls.each do |detail_url|
    Log.info "derwentvalley", "Fetching #{detail_url}"
    sleep(0.4)
    _final_url, html2, code2 = http_get(detail_url, jar: jar, referer: NEWS_URL, fetch_site: "same-origin")

    # Same acceptance rule as the listing: status 200 and at least 5,000 bytes.
    unless code2 == 200 && html2.bytesize >= 5_000
      Log.warn "derwentvalley", "Detail page failed (#{code2}): #{detail_url}"
      next
    end

    records = parse_detail(html2, detail_url)
    if records.empty?
      Log.warn "derwentvalley", "No DA records parsed from #{detail_url}"
      next
    end

    records.each do |r|
      ref = r[:council_reference]
      full_address = r[:address]
      row = {
        council_reference: ref,
        address: full_address[0, 255], # stored address is capped at 255 characters
        description: r[:description],
        date_received: r[:date_received],
        date_received_raw: r[:date_received_raw],
        on_notice_to: r[:on_notice_to],
        on_notice_to_raw: r[:on_notice_to_raw],
        document_url: r[:document_url],
        applicant: "",
        owner: ""
      }
      DB.upsert(TABLE, row)
      # Enrichment receives the untruncated address.
      enrich_after_upsert!(table: TABLE, council_reference: ref, address: full_address)
      Log.info "derwentvalley", "Upserted #{ref} -> #{full_address}"
      saved += 1
    rescue StandardError => e
      Log.warn "derwentvalley", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
    end
  end
  puts "Done #{TABLE}. Saved #{saved} item(s)."