latrobe.rb 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. # Latrobe Council — Planning Applications on Public Exhibition
  2. #
  3. # Source: https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp
  4. #
  5. # Cloudflare is present — requires homepage warmup with browser-like headers
  6. # before the planning page responds (same technique as burnie.rb / kingisland.rb).
  7. #
  8. # Page structure:
  9. # <ul class="generic-list__list">
  10. # <li class="generic-list__item generic-list__file">
  11. # <h3 class="generic-list__title">
  12. # <a href="...pdf">L-DA007/2026 208 Gilbert Street, Latrobe - proposed
  13. # Additional Dwelling (submissions by 21/04/2026) <span>(PDF File, 2.0 MB)</span></a>
  14. # </h3>
  15. # </li>
  16. # </ul>
  17. require "date"
  18. require "nokogiri"
  19. require "net/http"
  20. require "uri"
  21. require_relative "../lib/db"
  22. require_relative "../lib/enrich"
  23. require_relative "../lib/log"
  24. require_relative "../lib/util"
  25. TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_latrobe
  26. BASE_URL = "https://www.latrobe.tas.gov.au"
  27. URL = "#{BASE_URL}/services/building-and-planning-services/planningapp"
  28. DB.ensure_table!(TABLE)
# ----- Browser-like headers (WAF/Cloudflare warmup) -----
# Request headers shared by every http_get call. They imitate a top-level
# navigation from desktop Chrome 127 on Windows. http_get merges a per-call
# "Sec-Fetch-Site" value over the default below and adds "Referer" and
# "Cookie" when it has them. The hash is frozen, so callers must merge into a
# copy rather than mutate it.
BASE_HEADERS = {
  "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
  "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-AU,en;q=0.9",
  # Ask for an uncompressed body; http_get uses resp.body exactly as returned.
  "Accept-Encoding" => "identity",
  "Upgrade-Insecure-Requests" => "1",
  "Sec-Fetch-Dest" => "document",
  "Sec-Fetch-Mode" => "navigate",
  # Default only; http_get replaces it with its fetch_site argument.
  "Sec-Fetch-Site" => "none",
  "Sec-Fetch-User" => "?1",
  # Client hints kept consistent with the Chrome 127 User-Agent above.
  "sec-ch-ua" => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
  "sec-ch-ua-mobile" => "?0",
  "sec-ch-ua-platform" => '"Windows"',
  # http_get opens a fresh Net::HTTP session for every hop, so no keep-alive.
  "Connection" => "close",
}.freeze
  45. class CookieJar
  46. def initialize; @h = {}; end
  47. def for(host)
  48. @h[host] || ""
  49. end
  50. def merge_from(resp, host)
  51. cookies = resp.get_fields("Set-Cookie") || []
  52. return if cookies.empty?
  53. existing = parse_header(@h[host])
  54. cookies.each do |sc|
  55. kv = sc.split(";", 2).first
  56. k, v = kv.split("=", 2)
  57. existing[k.to_s.strip] = v.to_s unless k.to_s.strip.empty?
  58. end
  59. @h[host] = existing.map { |k, v| "#{k}=#{v}" }.join("; ")
  60. end
  61. private
  62. def parse_header(s)
  63. s.to_s.split(";").map(&:strip).filter_map { |kv|
  64. k, v = kv.split("=", 2)
  65. [k, v] unless k.to_s.empty?
  66. }.to_h
  67. end
  68. end
  69. def http_get(url, jar:, referer: nil, fetch_site: "none")
  70. uri = URI(url)
  71. hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
  72. hdrs["Referer"] = referer if referer
  73. cookie = jar.for(uri.host)
  74. hdrs["Cookie"] = cookie unless cookie.empty?
  75. limit = 5
  76. code = 0
  77. body = ""
  78. while limit > 0
  79. limit -= 1
  80. redirect_to = nil
  81. req = Net::HTTP::Get.new(uri, hdrs)
  82. Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
  83. resp = http.request(req)
  84. jar.merge_from(resp, uri.host)
  85. code = resp.code.to_i
  86. if [301, 302, 303, 307, 308].include?(code) && resp["location"]
  87. redirect_to = URI.join(uri, resp["location"])
  88. else
  89. body = resp.body.to_s
  90. end
  91. end
  92. if redirect_to
  93. uri = redirect_to
  94. next
  95. end
  96. break
  97. end
  98. [code, body]
  99. rescue StandardError => e
  100. Log.warn "latrobe", "HTTP error for #{url}: #{e.class} #{e.message}"
  101. [0, ""]
  102. end
  103. # ----- Warmup then fetch -----
  104. jar = CookieJar.new
  105. Log.info "latrobe", "Warming up via homepage..."
  106. code0, _body0 = http_get("#{BASE_URL}/", jar: jar)
  107. Log.info "latrobe", "Homepage: #{code0}"
  108. sleep(0.5)
  109. Log.info "latrobe", "Fetching planning page..."
  110. code1, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
  111. Log.info "latrobe", "Planning page: #{code1} (#{html.bytesize} bytes)"
  112. if code1 != 200 || html.bytesize < 1_000
  113. Log.warn "latrobe", "Could not fetch planning page (status #{code1})."
  114. puts "Done #{TABLE}. Saved 0 item(s)."
  115. exit 0
  116. end
  117. if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
  118. Log.warn "latrobe", "Cloudflare challenge page returned — cannot scrape without a real browser."
  119. puts "Done #{TABLE}. Saved 0 item(s)."
  120. exit 0
  121. end
  122. # ----- Parse -----
  123. # Ref format: L-DA007/2026
  124. REF_RX = /\bL-DA\d+\/\d{4}\b/i
  125. doc = Nokogiri::HTML(html)
  126. saved = 0
  127. doc.css("li.generic-list__item h3.generic-list__title a").each do |a|
  128. raw_text = a.text.gsub(/\(PDF\s+File[^)]*\)/i, "").gsub(/\s+/, " ").strip
  129. next unless (m = raw_text.match(REF_RX))
  130. ref = m[0].strip
  131. # Strip ref from front; remainder: "ADDRESS - DESCRIPTION (submissions by DATE)"
  132. rest = raw_text.sub(ref, "").strip
  133. # Extract on-notice date: "(submissions by 21/04/2026)"
  134. on_notice_to_raw = rest[/\(submissions?\s+by\s+([^)]+)\)/i, 1]&.strip || ""
  135. on_notice_to = Util.parse_aus_date(on_notice_to_raw)
  136. # Remove the "(submissions by ...)" clause
  137. rest = rest.sub(/\s*\(submissions?\s+by\s+[^)]+\)/i, "").strip
  138. # Split "ADDRESS - DESCRIPTION" at first " - "
  139. if (split = rest.index(" - "))
  140. address = rest[0, split].strip
  141. description = rest[(split + 3)..].strip
  142. else
  143. address = rest
  144. description = "Development Application"
  145. end
  146. next if address.empty?
  147. doc_url = a["href"].to_s.strip
  148. doc_url = nil if doc_url.empty?
  149. begin
  150. DB.upsert(TABLE, {
  151. council_reference: ref,
  152. address: address[0, 255],
  153. description: description,
  154. date_received: nil,
  155. date_received_raw: "",
  156. on_notice_to: on_notice_to,
  157. on_notice_to_raw: on_notice_to_raw,
  158. document_url: doc_url,
  159. applicant: "",
  160. owner: ""
  161. })
  162. enrich_after_upsert!(
  163. table: TABLE,
  164. council_reference: ref,
  165. address: address
  166. )
  167. Log.info "latrobe", "Upserted #{ref} -> #{address}"
  168. saved += 1
  169. rescue StandardError => e
  170. Log.warn "latrobe", "DB error for #{ref}: #{e.class} #{e.message}"
  171. end
  172. end
  173. puts "Done #{TABLE}. Saved #{saved} item(s)."