# burnie.rb
  1. # Burnie City Council — permit applications on exhibition (robust / WAF-aware + PDF download)
  2. require "date"
  3. require "nokogiri"
  4. require "cgi"
  5. require "fileutils"
  6. require "net/http"
  7. require "uri"
  8. require "zlib"
  9. require "stringio"
  10. require "base64"
  11. require "securerandom"
  12. require_relative "../lib/enrich"
  13. require_relative "../lib/log"
  14. require_relative "../lib/util"
  # Destination table name. Required: ENV.fetch raises KeyError when TABLE_NAME is unset.
  # run_all.sh sets it from the filename (da_burnie).
  TABLE = ENV.fetch("TABLE_NAME")

  # Council site root and the "on exhibition" list page (plain and with an explicit language).
  BASE_URL = "https://www.burnie.tas.gov.au"
  URL = "#{BASE_URL}/Development/Planning/Permit-applications-on-exhibition"
  URL_EN = "#{URL}?oc_lang=en-AU"

  # PDF download is opt-in: only the exact value "1" enables it.
  DOWNLOAD_ATTACHMENTS = (ENV.fetch("DOWNLOAD_ATTACHMENTS", nil) == "1")
  DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

  # Make sure the destination table exists before any row is written.
  DB.ensure_table!(TABLE)
  # ----- HTTP helpers (browser-y headers + cookie jar + gzip/deflate) -----

  # Desktop Chrome 124 identity; the client-hint values below are kept consistent with it.
  UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "\
       "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
  SEC_CH_UA = '"Chromium";v="124", "Not.A/Brand";v="24", "Google Chrome";v="124"'
  SEC_CH_UA_PLATFORM = '"Windows"'
  SEC_CH_UA_MOBILE = "?0"

  # Default request headers for every fetch. Callers may override individual entries.
  # Insertion order is kept as-is because it is the order the headers are handed to Net::HTTP.
  BASE_HEADERS = {
    "User-Agent"                => UA,
    "Accept"                    => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language"           => "en-AU,en;q=0.8",
    # Avoid Brotli (Ruby stdlib won't auto-decode it)
    "Accept-Encoding"           => "gzip,deflate",
    "Upgrade-Insecure-Requests" => "1",
    "Sec-Fetch-Dest"            => "document",
    "Sec-Fetch-Mode"            => "navigate",
    "Sec-Fetch-Site"            => "none",
    "Sec-Fetch-User"            => "?1",
    "sec-ch-ua"                 => SEC_CH_UA,
    "sec-ch-ua-platform"        => SEC_CH_UA_PLATFORM,
    "sec-ch-ua-mobile"          => SEC_CH_UA_MOBILE,
    "Pragma"                    => "no-cache",
    "Cache-Control"             => "no-cache",
    "Connection"                => "close"
  }.freeze
  # Very small cookie jar (host -> "name=value; name2=value2" Cookie header string).
  # Only the leading name=value pair of each Set-Cookie header is kept; attributes such as
  # Path, Domain or Expires are ignored, so cookies are never scoped or expired.
  class Jar
    def initialize
      @h = {}
    end

    # Returns the Cookie header value stored for +host+, or "" when nothing is stored.
    def for(host)
      @h[host] || ""
    end

    # Merges every Set-Cookie header of +resp+ into the cookies stored for +host+.
    # A cookie with the same name replaces the stored value. +resp+ only needs to
    # respond to #get_fields like Net::HTTPResponse does.
    def merge_from(resp, host)
      cookies = resp.get_fields("Set-Cookie") || []
      return if cookies.empty?

      existing = parse_cookie_header(@h[host])
      cookies.each do |sc|
        # A blank Set-Cookie value has no name=value pair; the extra to_s calls keep
        # that case from calling #split on nil.
        name, value = sc.to_s.split(";", 2).first.to_s.split("=", 2)
        name = name.to_s.strip
        next if name.empty?

        existing[name] = value.to_s.strip
      end
      @h[host] = existing.map { |k, v| "#{k}=#{v}" }.join("; ")
    end

    # Parses a Cookie header string ("a=1; b=2") into a name => value hash.
    def parse_cookie_header(s)
      s.to_s.split(";").map(&:strip).map { |kv|
        k, v = kv.split("=", 2)
        [k, v]
      }.reject { |k, _| k.to_s.empty? }.to_h
    end
  end
  # Decodes an HTTP response body according to its Content-Encoding value.
  #
  # body - raw response body (String or nil)
  # enc  - Content-Encoding header value; anything not mentioning gzip/deflate is identity
  #
  # Returns the decoded body. nil/empty input is returned as-is, and the original body is
  # returned when the payload cannot be decoded.
  def decompress(body, enc)
    return body if body.nil? || body.empty?

    case enc.to_s
    when /gzip/i
      # wrap closes the reader once the block returns.
      Zlib::GzipReader.wrap(StringIO.new(body), &:read)
    when /deflate/i
      begin
        Zlib::Inflate.inflate(body)
      rescue Zlib::Error
        # Some servers send a raw deflate stream without the zlib header;
        # negative window bits tell zlib not to expect one.
        raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
        begin
          raw.inflate(body)
        ensure
          raw.close
        end
      end
    else
      body
    end
  rescue Zlib::Error
    body
  end
  # GETs +url+ with browser-like headers, following up to 5 requests' worth of redirects.
  #
  # url        - absolute URL to fetch
  # jar:       - cookie jar responding to #for(host) and #merge_from(resp, host)
  # headers:   - extra headers that override BASE_HEADERS
  # referer:   - optional Referer header value
  # site_fetch - value for the Sec-Fetch-Site header
  #
  # Returns [status_code, body, content_encoding, status_message]. The body is decoded via
  # decompress. If the hop limit is reached while still being redirected, the last redirect's
  # status is returned with an empty body.
  def http_get_with_cookies(url, jar:, headers: {}, referer: nil, site_fetch: "none")
    uri = URI(url)
    hdrs = BASE_HEADERS.merge(headers)
    hdrs["Referer"] = referer if referer
    hdrs["Sec-Fetch-Site"] = site_fetch

    code = 0
    body = ""
    enc = ""
    msg = ""

    5.times do
      # Re-read the jar on every hop: a redirect response may have just set a cookie that
      # the next request must present, and a redirect to another host needs that host's cookies.
      hop_headers = hdrs.dup
      cookie = jar.for(uri.host)
      hop_headers["Cookie"] = cookie unless cookie.empty?

      redirect_to = nil
      Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
        resp = http.request(Net::HTTP::Get.new(uri, hop_headers))
        jar.merge_from(resp, uri.host)
        enc = resp["content-encoding"].to_s
        msg = resp.message
        code = resp.code.to_i
        if [301, 302, 303, 307, 308].include?(code) && resp["location"]
          # Only record the target here; the connection block must finish before the next hop.
          redirect_to = URI.join(uri, resp["location"])
        else
          # Accept-Encoding is set explicitly, so the body is decoded here based on the
          # Content-Encoding the server actually used (identity bodies pass through).
          body = decompress(resp.body.to_s, enc)
        end
      end
      break unless redirect_to

      uri = redirect_to
    end

    [code, body, enc, msg]
  end
  # Pauses for a random 0.4 to 1.0 seconds so consecutive requests are not evenly spaced.
  def short_sleep
    jitter = rand * 0.6
    sleep(0.4 + jitter)
  end
  # ----- Burnie-specific parsing helpers -----

  # "DA <20xx> / <suffix>", case-insensitive, with optional whitespace around the year and slash.
  REF_RX = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i

  # Returns the first application reference found in +text+ in the canonical
  # "DA <year> / <suffix>" form, or nil when there is none.
  def extract_ref(text)
    found = REF_RX.match(text.to_s)
    return nil unless found

    year, suffix = found.captures
    "DA #{year} / #{suffix}"
  end
  # Returns the application reference found in +text+, or "" when there is none.
  #
  # The canonical form from extract_ref is preferred. Otherwise a looser pattern (any year
  # starting with 1 or 2) is tried; that match keeps its own "DA" spelling and only has the
  # spacing around the slash normalised.
  def normalize_ref(text)
    canonical = extract_ref(text)
    return canonical if canonical

    loose = text.to_s[/\bDA\s*[12]\d{3}\s*\/\s*[A-Za-z0-9\-_.]+\b/i].to_s
    loose.gsub(/\s*\/\s*/, " / ").strip
  end
  # Pulls the first date-looking substring out of +text+.
  #
  # A "5 March 2024" style date is preferred wherever it appears; a numeric "05/03/2024"
  # style date is used only when no such date exists. Returns "" when neither is present.
  def extract_on_notice_date(text)
    flat = text.to_s.gsub(/\s+/, " ")
    candidates = [
      /\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b/,
      /\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/
    ]
    candidates.each do |rx|
      hit = flat[rx]
      return hit if hit
    end
    ""
  end
  # Fetches an application's detail page and returns the absolute URL of its first PDF link.
  #
  # detail_url - absolute URL of the detail page
  # jar        - cookie jar shared with the list-page requests
  #
  # Returns "" when the page is not a 200, has no PDF link, or anything raises
  # (failures are logged as warnings, never propagated).
  def first_pdf_on_detail(detail_url, jar)
    code, html, _enc, _msg = http_get_with_cookies(
      detail_url,
      jar: jar,
      site_fetch: "same-origin",
      referer: URL_EN
    )
    return "" unless code == 200

    doc = Nokogiri::HTML(html)
    # Prefer explicit doc buttons if present
    link = doc.at_css(".hyperlink-button-container a.ext-pdf") ||
           doc.at_css("a[href$='.pdf'], a[href*='.pdf?']")
    return "" unless link

    # Percent-encode everything outside printable, non-space ASCII (spaces, control
    # characters, non-ASCII such as an en-dash in a filename) byte by byte, so URI.join
    # does not raise URI::InvalidURIError. Existing %XX escapes are left untouched.
    href = link["href"].to_s.gsub(/[^\x21-\x7E]/) do |ch|
      ch.bytes.map { |b| format("%%%02X", b) }.join
    end
    URI.join(detail_url, href).to_s
  rescue StandardError => e
    Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
    ""
  end
  # Decodes the hidden #__SEAMLESSVIEWSTATE field of +doc+ into a parsed HTML document.
  #
  # The field value is Base64-decoded and then gunzipped; when the bytes are not gzip data
  # they are parsed as-is. Returns nil when the field is missing or empty, or when decoding
  # raises (the failure is logged as a warning).
  def decode_seamless_viewstate(doc)
    field = doc.at_css("#__SEAMLESSVIEWSTATE")
    encoded = (field && field["value"]).to_s
    return nil if encoded.empty?

    raw = Base64.decode64(encoded)
    markup =
      begin
        Zlib::GzipReader.new(StringIO.new(raw)).read
      rescue Zlib::Error
        raw
      end
    Nokogiri::HTML(markup)
  rescue StandardError => e
    Log.warn "scraper", "Failed to decode __SEAMLESSVIEWSTATE: #{e.class} #{e.message}"
    nil
  end
  # Makes +s+ safe to use as a file name: every run of characters other than word
  # characters, "." and "-" becomes a single "_", and the result is capped at 180 characters.
  def sanitize_filename(s)
    cleaned = s.to_s.gsub(/[^\w.\-]+/, "_")
    cleaned.slice(0, 180)
  end
  # Downloads +document_url+ into DOWNLOAD_DIR/TABLE when attachment download is enabled.
  #
  # document_url      - absolute URL of the document; blank means nothing to do
  # council_reference - reference used as a file-name prefix for uniqueness and traceability
  # jar               - cookie jar shared with the page requests
  # referer:          - Referer header to send (the detail page the link came from)
  #
  # Best effort: failures are logged as warnings and never raised. The return value is
  # not meaningful.
  def save_pdf(document_url, council_reference, jar, referer:)
    return if document_url.to_s.strip.empty?
    return unless DOWNLOAD_ATTACHMENTS

    # Decide filename. Only an unparseable URL is tolerated here; an empty or missing path
    # falls back to a generic name instead of producing a file with no base name.
    url_path =
      begin
        URI.parse(document_url).path.to_s
      rescue URI::InvalidURIError
        ""
      end
    base_name = File.basename(url_path)
    base_name = "document.pdf" if base_name.empty? || base_name == "/"

    # Prefix with reference for uniqueness & traceability
    prefix = sanitize_filename(council_reference.to_s.gsub(" / ", "-"))
    out_dir = File.join(DOWNLOAD_DIR, TABLE)
    out_path = File.join(out_dir, "#{prefix}__#{sanitize_filename(base_name)}")
    FileUtils.mkdir_p(out_dir)

    code, data, _enc, msg = http_get_with_cookies(
      document_url,
      jar: jar,
      headers: {
        # Ask for PDF explicitly
        "Accept" => "application/pdf,*/*;q=0.8",
        "Accept-Encoding" => "identity" # avoid gzip'd binary when possible
      },
      referer: referer,
      site_fetch: "same-origin"
    )

    if code == 200 && data && data.bytesize > 0
      File.binwrite(out_path, data)
      puts "Saved PDF to #{out_path} (#{data.bytesize} bytes)"
    else
      Log.warn "scraper", "PDF fetch failed (#{code} #{msg}) for #{document_url}"
    end
  rescue StandardError => e
    Log.warn "scraper", "PDF save error for #{document_url}: #{e.class} #{e.message}"
  end
  # ----- Warm-up sequence to appease WAF -----
  jar = Jar.new
  home_url = "#{BASE_URL}/Home"
  # A response is accepted as the list page only when it is a 200 with more than 5,000 bytes.
  usable = ->(status, payload) { status == 200 && payload.bytesize > 5_000 }

  # 1) Direct try
  code1, body1, enc1, msg1 = http_get_with_cookies(URL, jar: jar)
  puts "List fetch #1: status=#{code1} #{msg1}, enc=#{enc1}, bytes=#{body1.to_s.bytesize}"
  html = usable.call(code1, body1) ? body1 : nil

  unless html
    short_sleep
    # 2) Language variant (often works)
    code2, body2, enc2, msg2 = http_get_with_cookies(
      URL_EN, jar: jar, site_fetch: "same-origin", referer: "#{BASE_URL}/"
    )
    puts "List fetch #2: status=#{code2} #{msg2}, enc=#{enc2}, bytes=#{body2.to_s.bytesize} (#{URL_EN})"
    html = body2 if usable.call(code2, body2)
  end

  unless html
    # 3) Warm up by hitting Home (sets benign cookies), then websitesettings.js, then retry
    short_sleep
    h_code, _h_body, _h_enc, h_msg = http_get_with_cookies(home_url, jar: jar, site_fetch: "none")
    puts "Warmup Home: status=#{h_code} #{h_msg}"

    short_sleep
    oc_api = "#{BASE_URL}/ocapi/0ff2db3d-0235-40e2-b373-42294eee3a55/en-AU/websitesettings.js"
    w_code, _w_body, _w_enc, w_msg = http_get_with_cookies(
      oc_api, jar: jar, site_fetch: "same-origin", referer: home_url
    )
    puts "Warmup websitesettings.js: status=#{w_code} #{w_msg}"

    short_sleep
    code3, body3, enc3, msg3 = http_get_with_cookies(
      URL_EN, jar: jar, site_fetch: "same-origin", referer: home_url
    )
    puts "List fetch #3: status=#{code3} #{msg3}, enc=#{enc3}, bytes=#{body3.to_s.bytesize} (retry with referer)"
    html = body3 if usable.call(code3, body3)
  end

  # Fall back to whatever we got first if nothing passed threshold
  html ||= body1
  puts "Fetched list page (#{html.to_s.bytesize} bytes)"
  list_doc = Nokogiri::HTML(html)
  item_link_selector = ".list-container.da-list-container .list-item-container a[href]"

  # Try visible DOM first
  nodes = list_doc.css(item_link_selector)
  puts "Primary selector found #{nodes.length} anchors"

  # If nothing, decode Seamless payload and try again
  if nodes.empty?
    sv_doc = decode_seamless_viewstate(list_doc)
    if sv_doc
      nodes = sv_doc.css(item_link_selector)
      puts "Seamless ViewState selector found #{nodes.length} anchors"
      if nodes.empty?
        # Last resort: the first link inside every list item container, wherever it sits.
        containers = sv_doc.css(".list-item-container")
        nodes = containers.map { |container| container.at_css("a[href]") }.compact
        puts "Seamless final fallback found #{nodes.length} anchors"
      end
    else
      puts "__SEAMLESSVIEWSTATE not found or could not be decoded"
    end
  end

  puts "Found #{nodes.length} application(s) for #{TABLE}"
  # Walk every list anchor: parse its fields, look up the first PDF on the detail page,
  # upsert the row, enrich it, then write the extra columns.
  saved = 0
  nodes.each do |a|
    href = a["href"].to_s
    begin
      detail_url = URI.join(URL, href).to_s
    rescue URI::Error => e
      # One malformed link must not abort the whole run; skip just this item.
      Log.warn "scraper", "Skipping item with unusable link #{href.inspect}: #{e.class} #{e.message}"
      next
    end

    ref_text = a.at_css(".da-application-number")&.text.to_s
    council_reference = normalize_ref(ref_text)
    address = a.at_css(".list-item-address")&.text.to_s.strip

    closing_text = a.at_css(".display-until-date")&.text.to_s
    on_notice_to_raw = if closing_text.empty?
                         extract_on_notice_date(a.text)
                       else
                         extract_on_notice_date(closing_text.sub(/^On display until\s*/i, ""))
                       end
    on_notice_to = Util.parse_aus_date(on_notice_to_raw)
    # The list shows no lodgement date, so it is estimated as 14 days before the closing date.
    # NOTE(review): presumably a 14-day exhibition period — confirm.
    date_received = on_notice_to ? (on_notice_to - 14) : nil

    # First <p> that isn't a helper class = description
    desc_p = a.css("p").find { |p|
      cls = p["class"].to_s
      cls.empty? || !(cls =~ /(da-application-number|list-item-address|display-until)/)
    }
    description = desc_p&.text.to_s.strip
    description = "Development Application" if description.empty?

    # Both key fields are required; skip before spending a request on the detail page.
    next if address.empty? || council_reference.empty?

    document_url = first_pdf_on_detail(detail_url, jar)
    # Download the PDF if requested
    save_pdf(document_url, council_reference, jar, referer: detail_url) if DOWNLOAD_ATTACHMENTS

    DB.upsert(TABLE, {
      description: description,
      date_received: date_received,
      date_received_raw: on_notice_to_raw, # keep the raw on-notice text
      on_notice_to: on_notice_to, # store close/on-notice date here
      on_notice_to_raw: on_notice_to_raw,
      address: address,
      council_reference: council_reference,
      applicant: "",
      owner: ""
    })
    enrich_after_upsert!(
      table: TABLE,
      council_reference: council_reference,
      address: address
    )

    # Extra columns are written separately and best-effort: a failure here is logged
    # and the row still counts as saved.
    title_reference = a.at_css(".list-item-title")&.text&.strip.to_s
    upd = nil
    begin
      upd = DB.client.prepare(
        "UPDATE `#{DB.client.escape(TABLE)}` " \
        "SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? " \
        "WHERE council_reference = ? AND address = ?"
      )
      upd.execute(document_url, on_notice_to, on_notice_to_raw, title_reference, council_reference, address)
    rescue StandardError => e
      Log.warn "scraper", "Extra fields update skipped for #{council_reference}: #{e.class} #{e.message}"
    ensure
      # A statement is prepared per row; release it so statements do not pile up on the connection.
      # NOTE(review): DB.client looks like a Mysql2 client (escape/prepare) — confirm #close exists.
      upd.close if upd.respond_to?(:close)
    end

    puts "Upserted #{council_reference} -> #{address}"
    saved += 1
  end
  puts "Done #{TABLE}. Saved #{saved} item(s)."