|
@@ -1,93 +1,210 @@
|
|
|
-# Latrobe Council – PlanBuild "Currently Advertised" scraper
|
|
|
|
|
-
|
|
|
|
|
|
|
+# Latrobe Council — Planning Applications on Public Exhibition
|
|
|
|
|
+#
|
|
|
|
|
+# Source: https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp
|
|
|
|
|
+#
|
|
|
|
|
+# Cloudflare is present — requires homepage warmup with browser-like headers
|
|
|
|
|
+# before the planning page responds (same technique as burnie.rb / kingisland.rb).
|
|
|
|
|
+#
|
|
|
|
|
+# Page structure:
|
|
|
|
|
+# <ul class="generic-list__list">
|
|
|
|
|
+# <li class="generic-list__item generic-list__file">
|
|
|
|
|
+# <h3 class="generic-list__title">
|
|
|
|
|
+# <a href="...pdf">L-DA007/2026 208 Gilbert Street, Latrobe - proposed
|
|
|
|
|
+# Additional Dwelling (submissions by 21/04/2026) <span>(PDF File, 2.0 MB)</span></a>
|
|
|
|
|
+# </h3>
|
|
|
|
|
+# </li>
|
|
|
|
|
+# </ul>
|
|
|
|
|
+
|
|
|
|
|
+require "date"
|
|
|
require "nokogiri"
|
|
require "nokogiri"
|
|
|
-require_relative "../lib/http"
|
|
|
|
|
|
|
+require "net/http"
|
|
|
|
|
+require "uri"
|
|
|
|
|
+
|
|
|
require_relative "../lib/db"
|
|
require_relative "../lib/db"
|
|
|
-require_relative "../lib/util"
|
|
|
|
|
require_relative "../lib/enrich"
|
|
require_relative "../lib/enrich"
|
|
|
|
|
+require_relative "../lib/log"
|
|
|
|
|
+require_relative "../lib/util"
|
|
|
|
|
|
|
|
-TABLE = ENV.fetch("TABLE_NAME")
|
|
|
|
|
-URL = ENV.fetch("PLANBUILD_URL", "https://portal.planbuild.tas.gov.au/external/advertisement/search")
|
|
|
|
|
-COUNCIL_NAME = "Latrobe Council"
|
|
|
|
|
-
|
|
|
|
|
-# Safe reference matcher (slashes inside are fine with %r{...})
|
|
|
|
|
-REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i
|
|
|
|
|
|
|
# ----- Configuration -----
# Destination table name; must be present in the environment
# (run_all.sh sets from filename: da_latrobe).
TABLE    = ENV.fetch("TABLE_NAME")
BASE_URL = "https://www.latrobe.tas.gov.au"
URL      = [BASE_URL, "services", "building-and-planning-services", "planningapp"].join("/")

DB.ensure_table!(TABLE)

# ----- Browser-like headers (WAF/Cloudflare warmup) -----
# Request headers sent with every GET. Listed as ordered pairs so the
# resulting Hash keeps exactly this insertion order. "Sec-Fetch-Site" is the
# default value; http_get overrides it per request.
BASE_HEADERS = [
  ["User-Agent",
   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"],
  ["Accept",                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"],
  ["Accept-Language",           "en-AU,en;q=0.9"],
  ["Accept-Encoding",           "identity"],
  ["Upgrade-Insecure-Requests", "1"],
  ["Sec-Fetch-Dest",            "document"],
  ["Sec-Fetch-Mode",            "navigate"],
  ["Sec-Fetch-Site",            "none"],
  ["Sec-Fetch-User",            "?1"],
  ["sec-ch-ua",                 '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"'],
  ["sec-ch-ua-mobile",          "?0"],
  ["sec-ch-ua-platform",        '"Windows"'],
  ["Connection",                "close"]
].to_h.freeze
|
|
|
|
|
+
|
|
|
|
|
# Minimal per-host cookie store used to carry cookies from one request to the
# next (e.g. cookies handed out during the homepage warmup).
#
# Cookies are keyed by the exact request host. Domain, Path and Secure
# attributes are not interpreted. Only the name=value pair of each Set-Cookie
# field is kept, with one exception: a cookie sent with Max-Age <= 0 is a
# deletion request and removes the stored cookie instead of storing it.
class CookieJar
  def initialize
    @h = {} # host => { cookie name => cookie value }
  end

  # Returns the Cookie request-header value for +host+, or "" when nothing
  # is stored for that host.
  def for(host)
    (@h[host] || {}).map { |name, value| "#{name}=#{value}" }.join("; ")
  end

  # Folds the Set-Cookie fields of +resp+ into the jar for +host+.
  #
  # resp - any object responding to #get_fields("Set-Cookie") with an Array
  #        of header strings or nil (Net::HTTPResponse does).
  # host - host the response came from (String).
  #
  # Malformed fields (empty string, missing name) are skipped rather than
  # raising, so one bad header cannot abort the whole request.
  def merge_from(resp, host)
    cookies = resp.get_fields("Set-Cookie") || []
    return if cookies.empty?

    store = (@h[host] ||= {})
    cookies.each do |sc|
      pair, attrs = sc.to_s.split(";", 2)
      name, value = pair.to_s.split("=", 2)
      name = name.to_s.strip
      next if name.empty?

      if expired?(attrs)
        store.delete(name)
      else
        store[name] = value.to_s.strip
      end
    end
  end

  private

  # True when the attribute section of a Set-Cookie field carries a
  # Max-Age of zero or less, i.e. the server is asking us to drop the cookie.
  def expired?(attrs)
    max_age = attrs.to_s[/(?:\A|;)\s*max-age\s*=\s*(-?\d+)/i, 1]
    !max_age.nil? && max_age.to_i <= 0
  end
end
|
|
|
|
|
|
|
|
-html = Http.get(URL)
|
|
|
|
|
-doc = Nokogiri::HTML(html)
|
|
|
|
|
|
|
# Performs a GET with browser-like headers, following redirects.
#
# url         - absolute URL to fetch (String).
# jar:        - CookieJar; supplies the Cookie header and is updated from
#               every response, including intermediate redirects.
# referer:    - optional Referer header value.
# fetch_site: - value for the Sec-Fetch-Site header (default "none").
#
# Returns [status_code, body]. At most five requests are made; if the last
# one is still a redirect, its status is returned with an empty body. Any
# StandardError is logged via Log.warn and reported as [0, ""].
def http_get(url, jar:, referer: nil, fetch_site: "none")
  uri  = URI(url)
  code = 0
  body = ""

  5.times do
    # Headers are rebuilt on every hop so that cookies set by a redirect
    # response (or stored for the redirect target's host) are actually sent
    # on the follow-up request.
    hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
    hdrs["Referer"] = referer if referer
    cookie = jar.for(uri.host)
    hdrs["Cookie"] = cookie unless cookie.empty?

    redirect_to = nil
    Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
      resp = http.request(Net::HTTP::Get.new(uri, hdrs))
      jar.merge_from(resp, uri.host)
      code = resp.code.to_i

      if [301, 302, 303, 307, 308].include?(code) && resp["location"]
        # Location may be relative; resolve it against the current URI.
        redirect_to = URI.join(uri, resp["location"])
      else
        body = resp.body.to_s
      end
    end

    break unless redirect_to

    uri = redirect_to
  end

  [code, body]
rescue StandardError => e
  Log.warn "latrobe", "HTTP error for #{url}: #{e.class} #{e.message}"
  [0, ""]
end
|
|
|
|
|
|
|
|
-blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")
|
|
|
|
|
|
|
# ----- Warmup then fetch -----
# Visit the homepage first so any cookies it sets are in the jar, then request
# the planning page as a same-origin navigation coming from the homepage.
homepage = "#{BASE_URL}/"
cookies  = CookieJar.new

# Logs the reason, prints the usual summary line with zero items and ends the
# run with exit status 0.
give_up = lambda do |reason|
  Log.warn "latrobe", reason
  puts "Done #{TABLE}. Saved 0 item(s)."
  exit 0
end

Log.info "latrobe", "Warming up via homepage..."
warm_status, = http_get(homepage, jar: cookies)
Log.info "latrobe", "Homepage: #{warm_status}"

sleep(0.5)

Log.info "latrobe", "Fetching planning page..."
page_status, html = http_get(URL, jar: cookies, referer: homepage, fetch_site: "same-origin")
Log.info "latrobe", "Planning page: #{page_status} (#{html.bytesize} bytes)"

# Anything other than a 200 with at least 1,000 bytes is treated as a failed fetch.
unless page_status == 200 && html.bytesize >= 1_000
  give_up.call("Could not fetch planning page (status #{page_status}).")
end

# Marker phrases of the Cloudflare interstitial page.
challenge_markers = ["Just a moment", "Enable JavaScript and cookies"]
if challenge_markers.any? { |marker| html.include?(marker) }
  give_up.call("Cloudflare challenge page returned — cannot scrape without a real browser.")
end
|
|
|
|
|
+
|
|
|
|
|
# ----- Parse -----
# Every advertised application is one PDF link whose text has the shape
#   "L-DA007/2026 208 Gilbert Street, Latrobe - proposed Additional Dwelling
#    (submissions by 21/04/2026) (PDF File, 2.0 MB)"
# Ref format: L-DA007/2026
REF_RX = %r{\bL-DA\d+/\d{4}\b}i

# "(submissions by 21/04/2026)" — capture group 1 is the date text.
submissions_rx = /\(submissions?\s+by\s+([^)]+)\)/i
# Separator between address and description: a hyphen, en dash or em dash
# surrounded by whitespace (council staff type these interchangeably).
separator_rx = /\s+[-–—]\s+/

doc = Nokogiri::HTML(html)
saved = 0

doc.css("li.generic-list__item h3.generic-list__title a").each do |a|
  # Drop the "(PDF File, 2.0 MB)" suffix and collapse whitespace.
  raw_text = a.text.gsub(/\(PDF\s+File[^)]*\)/i, "").gsub(/\s+/, " ").strip

  next unless (m = raw_text.match(REF_RX))
  ref = m[0].strip

  # Strip ref from front; remainder: "ADDRESS - DESCRIPTION (submissions by DATE)"
  rest = raw_text.sub(ref, "").strip

  # Extract on-notice date: "(submissions by 21/04/2026)"
  on_notice_to_raw = rest[submissions_rx, 1]&.strip || ""
  on_notice_to     = Util.parse_aus_date(on_notice_to_raw)

  # Remove the "(submissions by ...)" clause
  rest = rest.sub(/\s*\(submissions?\s+by\s+[^)]+\)/i, "").strip

  # Split "ADDRESS - DESCRIPTION" at the first dash separator. Without a
  # separator the whole remainder is the address and a generic description
  # is used.
  address, description = rest.split(separator_rx, 2).map(&:strip)
  address = address.to_s
  description = "Development Application" if description.nil?

  next if address.empty?

  # Resolve the link against the page URL so relative hrefs are stored as
  # absolute, usable URLs. Spaces are percent-encoded first because URI.join
  # rejects them; if the href still cannot be parsed, keep it verbatim.
  href = a["href"].to_s.strip
  doc_url =
    if href.empty?
      nil
    else
      begin
        URI.join(URL, href.gsub(" ", "%20")).to_s
      rescue URI::Error
        href
      end
    end

  begin
    DB.upsert(TABLE, {
      council_reference: ref,
      address:           address[0, 255],
      description:       description,
      date_received:     nil,
      date_received_raw: "",
      on_notice_to:      on_notice_to,
      on_notice_to_raw:  on_notice_to_raw,
      document_url:      doc_url,
      applicant:         "",
      owner:             ""
    })

    enrich_after_upsert!(
      table: TABLE,
      council_reference: ref,
      address: address
    )

    Log.info "latrobe", "Upserted #{ref} -> #{address}"
    saved += 1
  rescue StandardError => e
    # One failing row must not stop the remaining rows from being saved.
    Log.warn "latrobe", "DB error for #{ref}: #{e.class} #{e.message}"
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|