2 месяцев назад · d070dcb5dd
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -47,7 +47,10 @@
 
				       "Bash(curl -s -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/124.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://kingisland.tas.gov.au/develop/planning/ -o /tmp/ki_plan.html)",
			
 
				       "Read(//tmp/**)",
			
 
				       "Bash(curl -s -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/124.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://kingisland.tas.gov.au/develop/planning/ -o C:/Users/lumion/AppData/Local/Temp/ki_plan.html)",
			
 
				-      "Read(//c/Users/lumion/AppData/Local/Temp/**)"
			
 
				+      "Read(//c/Users/lumion/AppData/Local/Temp/**)",
			
 
				+      "Bash(curl -s -L --max-time 30 -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp -o C:/Users/lumion/AppData/Local/Temp/latrobe.html)",
			
 
				+      "Bash(curl -s -L --max-time 30 -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: none' -H 'Sec-Fetch-User: ?1' -c C:/Users/lumion/AppData/Local/Temp/latrobe_cookies.txt https://www.latrobe.tas.gov.au/ -o /dev/null)",
			
 
				+      "Bash(curl -s -L --max-time 30 -H 'User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept-Encoding: identity' -H 'Referer: https://www.latrobe.tas.gov.au/' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: same-origin' -b C:/Users/lumion/AppData/Local/Temp/latrobe_cookies.txt https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp -o C:/Users/lumion/AppData/Local/Temp/latrobe2.html)"
			
 
				     ]
			
 
				   }
			
 
				 }
			
--- a/scrapers/latrobe.rb
+++ b/scrapers/latrobe.rb
@@ -1,93 +1,210 @@
 
				-# Latrobe Council – PlanBuild "Currently Advertised" scraper
			
 
				-
			
 
				+# Latrobe Council — Planning Applications on Public Exhibition
			
 
				+#
			
 
				+# Source: https://www.latrobe.tas.gov.au/services/building-and-planning-services/planningapp
			
 
				+#
			
 
				+# Cloudflare is present — requires homepage warmup with browser-like headers
			
 
				+# before the planning page responds (same technique as burnie.rb / kingisland.rb).
			
 
				+#
			
 
				+# Page structure:
			
 
				+#   <ul class="generic-list__list">
			
 
				+#     <li class="generic-list__item generic-list__file">
			
 
				+#       <h3 class="generic-list__title">
			
 
				+#         <a href="...pdf">L-DA007/2026 208 Gilbert Street, Latrobe - proposed
			
 
				+#            Additional Dwelling (submissions by 21/04/2026) <span>(PDF File, 2.0 MB)</span></a>
			
 
				+#       </h3>
			
 
				+#     </li>
			
 
				+#   </ul>
			
 
				+
			
 
				+require "date"
			
 
				 require "nokogiri"
			
 
				-require_relative "../lib/http"
			
 
				+require "net/http"
			
 
				+require "uri"
			
 
				+
			
 
				 require_relative "../lib/db"
			
 
				-require_relative "../lib/util"
			
 
				 require_relative "../lib/enrich"
			
 
				+require_relative "../lib/log"
			
 
				+require_relative "../lib/util"
			
 
				 
			
 
				-TABLE        = ENV.fetch("TABLE_NAME")
			
 
				-URL          = ENV.fetch("PLANBUILD_URL", "https://portal.planbuild.tas.gov.au/external/advertisement/search")
			
 
				-COUNCIL_NAME = "Latrobe Council"
			
 
				-
			
 
				-# Safe reference matcher (slashes inside are fine with %r{...})
			
 
				-REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i
			
 
				+TABLE    = ENV.fetch("TABLE_NAME")  # run_all.sh sets from filename: da_latrobe
			
 
				+BASE_URL = "https://www.latrobe.tas.gov.au"
			
 
				+URL      = "#{BASE_URL}/services/building-and-planning-services/planningapp"
			
 
				 
			
 
				 DB.ensure_table!(TABLE)
			
 
				 
			
 
				-def extract_text_between(text, label_regex, stop_regexes)
			
 
				-  if (m = text.match(label_regex))
			
 
				-    start = m.end(0)
			
 
				-    tail  = text[start..-1]
			
 
				-    stop  = stop_regexes.map { |r| (tail =~ r) }.compact.min
			
 
				-    stop ? tail[0...stop].strip : tail.strip
			
 
				-  end
			
 
				+# ----- Browser-like headers (WAF/Cloudflare warmup) -----
			
 
				+BASE_HEADERS = {
			
 
				+    "User-Agent"                => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
			
 
				+    "Accept"                    => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			
 
				+    "Accept-Language"           => "en-AU,en;q=0.9",
			
 
				+    "Accept-Encoding"           => "identity",
			
 
				+    "Upgrade-Insecure-Requests" => "1",
			
 
				+    "Sec-Fetch-Dest"            => "document",
			
 
				+    "Sec-Fetch-Mode"            => "navigate",
			
 
				+    "Sec-Fetch-Site"            => "none",
			
 
				+    "Sec-Fetch-User"            => "?1",
			
 
				+    "sec-ch-ua"                 => '"Chromium";v="127", "Not)A;Brand";v="99", "Google Chrome";v="127"',
			
 
				+    "sec-ch-ua-mobile"          => "?0",
			
 
				+    "sec-ch-ua-platform"        => '"Windows"',
			
 
				+    "Connection"                => "close",
			
 
				+}.freeze
			
 
				+
			
 
				# Minimal in-memory cookie store, keyed by exact host name.
				#
				# Only the name/value pair of each Set-Cookie header is kept; attributes such
				# as Path, Domain, Expires and Max-Age are ignored. That is enough to carry
				# cookies from a warm-up request over to the follow-up request to the same
				# host within a single run.
				class CookieJar
				    def initialize
				        # host => { cookie name => cookie value }, in first-seen order
				        @cookies = {}
				    end

				    # Returns the value to send in a "Cookie" request header for +host+,
				    # e.g. "a=1; b=2", or "" when nothing is stored for that host.
				    def for(host)
				        @cookies.fetch(host, {}).map { |name, value| "#{name}=#{value}" }.join("; ")
				    end

				    # Records every cookie found in the Set-Cookie headers of +resp+ under
				    # +host+, overwriting earlier values that share a name.
				    #
				    # +resp+ only needs to respond to get_fields("Set-Cookie"), returning an
				    # Array of header strings or nil. The return value is not meaningful.
				    def merge_from(resp, host)
				        (resp.get_fields("Set-Cookie") || []).each do |set_cookie|
				            pair = set_cookie.to_s.split(";", 2).first.to_s

				            # RFC 6265 section 5.2: a set-cookie-string without "=" is ignored.
				            next unless pair.include?("=")

				            # Both name and value are trimmed so that "b = 2" is replayed as "b=2".
				            name, value = pair.split("=", 2).map(&:strip)
				            next if name.empty?

				            (@cookies[host] ||= {})[name] = value
				        end
				    end
				end
			
 
				 
			
 
				-html = Http.get(URL)
			
 
				-doc  = Nokogiri::HTML(html)
			
 
				# Performs a GET with browser-like headers, following up to 5 hops of
				# 301/302/303/307/308 redirects.
				#
				# url        - String URL to fetch.
				# jar        - cookie store responding to #for(host) and
				#              #merge_from(response, host); updated after every response.
				# referer    - optional value for the Referer header.
				# fetch_site - value for the Sec-Fetch-Site header (default "none").
				#
				# Returns [status_code, body]. The body is "" when the final response was
				# still a redirect. Any StandardError is logged and reported as [0, ""].
				def http_get(url, jar:, referer: nil, fetch_site: "none")
				    uri      = URI(url)
				    code     = 0
				    body     = ""
				    finished = false

				    5.times do
				        # Headers are rebuilt on every hop so that cookies set by a redirect
				        # response are sent on the follow-up request, and so that a redirect to
				        # another host only receives the cookies stored for that host.
				        hdrs = BASE_HEADERS.merge("Sec-Fetch-Site" => fetch_site)
				        hdrs["Referer"] = referer if referer
				        cookie = jar.for(uri.host)
				        hdrs["Cookie"] = cookie unless cookie.empty?

				        # request without a block reads the whole body before returning, so
				        # resp.body is still available after the connection has been closed.
				        resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
				            http.request(Net::HTTP::Get.new(uri, hdrs))
				        end

				        jar.merge_from(resp, uri.host)
				        code     = resp.code.to_i
				        location = resp["location"]

				        unless [301, 302, 303, 307, 308].include?(code) && location
				            body     = resp.body.to_s
				            finished = true
				            break
				        end

				        # Location may be relative; resolve it against the current URL.
				        uri = URI.join(uri, location)
				    end

				    Log.warn "latrobe", "Too many redirects for #{url} (last status #{code})" unless finished

				    [code, body]
				rescue StandardError => e
				    Log.warn "latrobe", "HTTP error for #{url}: #{e.class} #{e.message}"
				    [0, ""]
				end
			
 
				 
			
 
				-blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")
			
 
				# ----- Warmup then fetch -----
				# The homepage is requested first so that any cookies it sets are in the jar
				# before the planning page is requested with a same-origin Referer.
				jar = CookieJar.new

				Log.info "latrobe", "Warming up via homepage..."
				code0, _body0 = http_get("#{BASE_URL}/", jar: jar)
				Log.info "latrobe", "Homepage: #{code0}"

				sleep(0.5)

				Log.info "latrobe", "Fetching planning page..."
				code1, html = http_get(URL, jar: jar, referer: "#{BASE_URL}/", fetch_site: "same-origin")
				Log.info "latrobe", "Planning page: #{code1} (#{html.bytesize} bytes)"

				# The challenge markers are tested before the status code: a challenge page
				# is usually served with a non-200 status, so testing the status first would
				# hide the more specific diagnostic behind the generic one.
				fetch_failure =
				    if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
				        "Cloudflare challenge page returned — cannot scrape without a real browser."
				    elsif code1 != 200 || html.bytesize < 1_000
				        "Could not fetch planning page (status #{code1})."
				    end

				# A failed fetch is reported as an empty, successful run (exit status 0).
				if fetch_failure
				    Log.warn "latrobe", fetch_failure
				    puts "Done #{TABLE}. Saved 0 item(s)."
				    exit 0
				end
			
 
				+
			
 
				+# ----- Parse -----
			
 
				+# Ref format: L-DA007/2026
			
 
				+REF_RX = /\bL-DA\d+\/\d{4}\b/i
			
 
				+
			
 
				+doc   = Nokogiri::HTML(html)
			
 
				 saved = 0
			
 
				 
			
 
				-blocks.each do |blk|
			
 
				-  text = blk.text.strip.gsub(/\s+/, " ")
			
 
				-  next unless text.match?(/Application|Reference|Council/i)
			
 
				+doc.css("li.generic-list__item h3.generic-list__title a").each do |a|
			
 
				+    raw_text = a.text.gsub(/\(PDF\s+File[^)]*\)/i, "").gsub(/\s+/, " ").strip
			
 
				 
			
 
				-  address_el = blk.at_css(".address, [data-field='address'], .col-xs-8, .col-sm-8")
			
 
				-  ref_el     = blk.at_css(".reference, [data-field='reference'], .col-xs-4, .col-sm-4")
			
 
				+    next unless (m = raw_text.match(REF_RX))
			
 
				+    ref = m[0].strip
			
 
				 
			
 
				-  address = address_el&.text&.strip.to_s
			
 
				-  council_reference = ref_el&.text&.strip.to_s
			
 
				+    # Strip ref from front; remainder: "ADDRESS - DESCRIPTION (submissions by DATE)"
			
 
				+    rest = raw_text.sub(ref, "").strip
			
 
				 
			
 
				-  address = extract_text_between(text, /Address:\s*/i,
			
 
				-                                 [/Reference:/i, /Application/i, /Council:/i, /\z/]) if address.empty?
			
 
				+    # Extract on-notice date: "(submissions by 21/04/2026)"
			
 
				+    on_notice_to_raw = rest[/\(submissions?\s+by\s+([^)]+)\)/i, 1]&.strip || ""
			
 
				+    on_notice_to     = Util.parse_aus_date(on_notice_to_raw)
			
 
				 
			
 
				-  if council_reference.empty?
			
 
				-    if (m = text.match(REF_RX))
			
 
				-      council_reference = m[3].strip
			
 
				-    end
			
 
				-  end
			
 
				-
			
 
				-  council_name = if (m = text.match(/Council:\s*([A-Za-z \-]+Council)/i))
			
 
				-    m[1].strip
			
 
				-  end
			
 
				-  next unless council_name&.include?(COUNCIL_NAME)
			
 
				-
			
 
				-  description = extract_text_between(
			
 
				-    text,
			
 
				-    /(Type of Work|Proposal|Description):\s*/i,
			
 
				-    [/Address:/i, /Application/i, /Reference/i, /Council:/i, /\z/]
			
 
				-  ) || ""
			
 
				-
			
 
				-  date_received_raw =
			
 
				-    if (m = text.match(/(Date Lodged|Date Received|Lodged):\s*([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4})/i))
			
 
				-      m[2].strip
			
 
				+    # Remove the "(submissions by ...)" clause
			
 
				+    rest = rest.sub(/\s*\(submissions?\s+by\s+[^)]+\)/i, "").strip
			
 
				+
			
 
				+    # Split "ADDRESS - DESCRIPTION" at first " - "
			
 
				+    if (split = rest.index(" - "))
			
 
				+        address     = rest[0, split].strip
			
 
				+        description = rest[(split + 3)..].strip
			
 
				     else
			
 
				-      ""
			
 
				+        address     = rest
			
 
				+        description = "Development Application"
			
 
				+    end
			
 
				+
			
 
				+    next if address.empty?
			
 
				+
			
 
				+    doc_url = a["href"].to_s.strip
			
 
				+    doc_url = nil if doc_url.empty?
			
 
				+
			
 
				+    begin
			
 
				+        DB.upsert(TABLE, {
			
 
				+            council_reference: ref,
			
 
				+            address:           address[0, 255],
			
 
				+            description:       description,
			
 
				+            date_received:     nil,
			
 
				+            date_received_raw: "",
			
 
				+            on_notice_to:      on_notice_to,
			
 
				+            on_notice_to_raw:  on_notice_to_raw,
			
 
				+            document_url:      doc_url,
			
 
				+            applicant:         "",
			
 
				+            owner:             ""
			
 
				+        })
			
 
				+
			
 
				+        enrich_after_upsert!(
			
 
				+            table:             TABLE,
			
 
				+            council_reference: ref,
			
 
				+            address:           address
			
 
				+        )
			
 
				+
			
 
				+        Log.info "latrobe", "Upserted #{ref} -> #{address}"
			
 
				+        saved += 1
			
 
				+    rescue StandardError => e
			
 
				+        Log.warn "latrobe", "DB error for #{ref}: #{e.class} #{e.message}"
			
 
				     end
			
 
				-  date_received = Util.parse_aus_date(date_received_raw)
			
 
				-
			
 
				-  next if address.empty? || council_reference.empty?
			
 
				-
			
 
				-  DB.upsert(TABLE, {
			
 
				-    description: description,
			
 
				-    date_received: date_received,
			
 
				-    date_received_raw: date_received_raw,
			
 
				-    address: address,
			
 
				-    council_reference: council_reference,
			
 
				-    applicant: "",
			
 
				-    owner: ""
			
 
				-  })
			
 
				-  
			
 
				-  enrich_after_upsert!(
			
 
				-    table: TABLE,
			
 
				-    council_reference: council_reference,
			
 
				-    address: address
			
 
				-  )
			
 
				-
			
 
				-  puts "Upserted #{council_reference} | #{address}"
			
 
				-  saved += 1
			
 
				 end
			
 
				 
			
 
				 puts "Done #{TABLE}. Saved #{saved} item(s)."