Benjamin Harris 2 месяца назад
Родитель
Commit
d070dcb5dd
2 измененных файла: 193 добавлено и 73 удалено
  1. 4 1
      .claude/settings.local.json
  2. 189 72
      scrapers/latrobe.rb

+ 4 - 1
.claude/settings.local.json

@@ -47,7 +47,10 @@
       "Bash(curl -s -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/124.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://kingisland.tas.gov.au/develop/planning/ -o /tmp/ki_plan.html)",
       "Read(//tmp/**)",
       "Bash(curl -s -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/124.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://kingisland.tas.gov.au/develop/planning/ -o C:/Users/lumion/AppData/Local/Temp/ki_plan.html)",
-      "Read(//c/Users/lumion/AppData/Local/Temp/**)"
+      "Read(//c/Users/lumion/AppData/Local/Temp/**)",
+      "Bash(curl -s -L --max-time 30 -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp -o C:/Users/lumion/AppData/Local/Temp/latrobe.html)",
+      "Bash(curl -s -L --max-time 30 -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: none' -H 'Sec-Fetch-User: ?1' -c C:/Users/lumion/AppData/Local/Temp/latrobe_cookies.txt https://www.latrobe.tas.gov.au/ -o /dev/null)",
+      "Bash(curl -s -L --max-time 30 -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' -H 'Referer: https://www.latrobe.tas.gov.au/' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: same-origin' -b C:/Users/lumion/AppData/Local/Temp/latrobe_cookies.txt https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp -o C:/Users/lumion/AppData/Local/Temp/latrobe2.html)"
     ]
   }
 }

+ 189 - 72
scrapers/latrobe.rb

@@ -1,93 +1,210 @@
-# Latrobe Council – PlanBuild "Currently Advertised" scraper
-
+# Latrobe Council — Planning Applications on Public Exhibition
+#
+# Source: https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp
+#
+# Cloudflare is present — requires homepage warmup with browser-like headers
+# before the planning page responds (same technique as burnie.rb / kingisland.rb).
+#
+# Page structure:
+#   <ul class="generic-list__list">
+#     <li class="generic-list__item generic-list__file">
+#       <h3 class="generic-list__title">
+#         <a href="...pdf">L-DA007/2026 208 Gilbert Street, Latrobe - proposed
+#            Additional Dwelling (submissions by 21/04/2026) <span>(PDF File, 2.0 MB)</span></a>
+#       </h3>
+#     </li>
+#   </ul>
+
+require "date"
 require "nokogiri"
-require_relative "../lib/http"
+require "net/http"
+require "uri"
+
 require_relative "../lib/db"
-require_relative "../lib/util"
 require_relative "../lib/enrich"
+require_relative "../lib/log"
+require_relative "../lib/util"
 
-TABLE        = ENV.fetch("TABLE_NAME")
-URL          = ENV.fetch("PLANBUILD_URL", "https://portal.planbuild.tas.gov.au/external/advertisement/search")
-COUNCIL_NAME = "Latrobe Council"
-
-# Safe reference matcher (slashes inside are fine with %r{...})
-REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i
+TABLE    = ENV.fetch("TABLE_NAME")  # run_all.sh sets from filename: da_latrobe
+BASE_URL = "https://www.latrobe.tas.gov.au"
+URL      = "#{BASE_URL}/services/building-and-planning-services/planningapp"
 
 DB.ensure_table!(TABLE)
 
-def extract_text_between(text, label_regex, stop_regexes)
-  if (m = text.match(label_regex))
-    start = m.end(0)
-    tail  = text[start..-1]
-    stop  = stop_regexes.map { |r| (tail =~ r) }.compact.min
-    stop ? tail[0...stop].strip : tail.strip
-  end
+# ----- Browser-like headers (WAF/Cloudflare warmup) -----
+BASE_HEADERS = {
+    "User-Agent"                => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+    "Accept"                    => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language"           => "en-AU,en;q=0.9",
+    "Accept-Encoding"           => "identity",
+    "Upgrade-Insecure-Requests" => "1",
+    "Sec-Fetch-Dest"            => "document",
+    "Sec-Fetch-Mode"            => "navigate",
+    "Sec-Fetch-Site"            => "none",
+    "Sec-Fetch-User"            => "?1",
+    "sec-ch-ua"                 => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
+    "sec-ch-ua-mobile"          => "?0",
+    "sec-ch-ua-platform"        => '"Windows"',
+    "Connection"                => "close",
+}.freeze
+
+class CookieJar
+    def initialize; @h = {}; end
+
+    def for(host)
+        @h[host] || ""
+    end
+
+    def merge_from(resp, host)
+        cookies = resp.get_fields("Set-Cookie") || []
+        return if cookies.empty?
+        existing = parse_header(@h[host])
+        cookies.each do |sc|
+            kv = sc.split(";", 2).first
+            k, v = kv.split("=", 2)
+            existing[k.to_s.strip] = v.to_s unless k.to_s.strip.empty?
+        end
+        @h[host] = existing.map { |k, v| "#{k}=#{v}" }.join("; ")
+    end
+
+    private
+
+    def parse_header(s)
+        s.to_s.split(";").map(&:strip).filter_map { |kv|
+            k, v = kv.split("=", 2)
+            [k, v] unless k.to_s.empty?
+        }.to_h
+    end
 end
 
-html = Http.get(URL)
-doc  = Nokogiri::HTML(html)
+def http_get(url, jar:, referer: nil, fetch_site: "none")
+    uri  = URI(url)
+    hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
+    hdrs["Referer"] = referer if referer
+    cookie = jar.for(uri.host)
+    hdrs["Cookie"] = cookie unless cookie.empty?
+
+    limit = 5
+    code  = 0
+    body  = ""
+
+    while limit > 0
+        limit -= 1
+        redirect_to = nil
+
+        req = Net::HTTP::Get.new(uri, hdrs)
+        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
+            resp = http.request(req)
+            jar.merge_from(resp, uri.host)
+            code = resp.code.to_i
+
+            if [301, 302, 303, 307, 308].include?(code) && resp["location"]
+                redirect_to = URI.join(uri, resp["location"])
+            else
+                body = resp.body.to_s
+            end
+        end
+
+        if redirect_to
+            uri = redirect_to
+            next
+        end
+        break
+    end
+
+    [code, body]
+rescue StandardError => e
+    Log.warn "latrobe", "HTTP error for #{url}: #{e.class} #{e.message}"
+    [0, ""]
+end
 
-blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")
+# ----- Warmup then fetch -----
+jar = CookieJar.new
+
+Log.info "latrobe", "Warming up via homepage..."
+code0, _body0 = http_get("#{BASE_URL}/", jar: jar)
+Log.info "latrobe", "Homepage: #{code0}"
+
+sleep(0.5)
+
+Log.info "latrobe", "Fetching planning page..."
+code1, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
+Log.info "latrobe", "Planning page: #{code1} (#{html.bytesize} bytes)"
+
+if code1 != 200 || html.bytesize < 1_000
+    Log.warn "latrobe", "Could not fetch planning page (status #{code1})."
+    puts "Done #{TABLE}. Saved 0 item(s)."
+    exit 0
+end
+
+if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
+    Log.warn "latrobe", "Cloudflare challenge page returned — cannot scrape without a real browser."
+    puts "Done #{TABLE}. Saved 0 item(s)."
+    exit 0
+end
+
+# ----- Parse -----
+# Ref format: L-DA007/2026
+REF_RX = /\bL-DA\d+\/\d{4}\b/i
+
+doc   = Nokogiri::HTML(html)
 saved = 0
 
-blocks.each do |blk|
-  text = blk.text.strip.gsub(/\s+/, " ")
-  next unless text.match?(/Application|Reference|Council/i)
+doc.css("li.generic-list__item h3.generic-list__title a").each do |a|
+    raw_text = a.text.gsub(/\(PDF\s+File[^)]*\)/i, "").gsub(/\s+/, " ").strip
 
-  address_el = blk.at_css(".address, [data-field='address'], .col-xs-8, .col-sm-8")
-  ref_el     = blk.at_css(".reference, [data-field='reference'], .col-xs-4, .col-sm-4")
+    next unless (m = raw_text.match(REF_RX))
+    ref = m[0].strip
 
-  address = address_el&.text&.strip.to_s
-  council_reference = ref_el&.text&.strip.to_s
+    # Strip ref from front; remainder: "ADDRESS - DESCRIPTION (submissions by DATE)"
+    rest = raw_text.sub(ref, "").strip
 
-  address = extract_text_between(text, /Address:\s*/i,
-                                 [/Reference:/i, /Application/i, /Council:/i, /\z/]) if address.empty?
+    # Extract on-notice date: "(submissions by 21/04/2026)"
+    on_notice_to_raw = rest[/\(submissions?\s+by\s+([^)]+)\)/i, 1]&.strip || ""
+    on_notice_to     = Util.parse_aus_date(on_notice_to_raw)
 
-  if council_reference.empty?
-    if (m = text.match(REF_RX))
-      council_reference = m[3].strip
-    end
-  end
-
-  council_name = if (m = text.match(/Council:\s*([A-Za-z \-]+Council)/i))
-    m[1].strip
-  end
-  next unless council_name&.include?(COUNCIL_NAME)
-
-  description = extract_text_between(
-    text,
-    /(Type of Work|Proposal|Description):\s*/i,
-    [/Address:/i, /Application/i, /Reference/i, /Council:/i, /\z/]
-  ) || ""
-
-  date_received_raw =
-    if (m = text.match(/(Date Lodged|Date Received|Lodged):\s*([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4})/i))
-      m[2].strip
+    # Remove the "(submissions by ...)" clause
+    rest = rest.sub(/\s*\(submissions?\s+by\s+[^)]+\)/i, "").strip
+
+    # Split "ADDRESS - DESCRIPTION" at first " - "
+    if (split = rest.index(" - "))
+        address     = rest[0, split].strip
+        description = rest[(split + 3)..].strip
     else
-      ""
+        address     = rest
+        description = "Development Application"
+    end
+
+    next if address.empty?
+
+    doc_url = a["href"].to_s.strip
+    doc_url = nil if doc_url.empty?
+
+    begin
+        DB.upsert(TABLE, {
+            council_reference: ref,
+            address:           address[0, 255],
+            description:       description,
+            date_received:     nil,
+            date_received_raw: "",
+            on_notice_to:      on_notice_to,
+            on_notice_to_raw:  on_notice_to_raw,
+            document_url:      doc_url,
+            applicant:         "",
+            owner:             ""
+        })
+
+        enrich_after_upsert!(
+            table:             TABLE,
+            council_reference: ref,
+            address:           address
+        )
+
+        Log.info "latrobe", "Upserted #{ref} -> #{address}"
+        saved += 1
+    rescue StandardError => e
+        Log.warn "latrobe", "DB error for #{ref}: #{e.class} #{e.message}"
     end
-  date_received = Util.parse_aus_date(date_received_raw)
-
-  next if address.empty? || council_reference.empty?
-
-  DB.upsert(TABLE, {
-    description: description,
-    date_received: date_received,
-    date_received_raw: date_received_raw,
-    address: address,
-    council_reference: council_reference,
-    applicant: "",
-    owner: ""
-  })
-  
-  enrich_after_upsert!(
-    table: TABLE,
-    council_reference: council_reference,
-    address: address
-  )
-
-  puts "Upserted #{council_reference} | #{address}"
-  saved += 1
 end
 
 puts "Done #{TABLE}. Saved #{saved} item(s)."