Browse Source

Cloudflare Warning

Benjamin Harris 2 months ago
parent
commit
648af0bf73
3 changed files with 48 additions and 4 deletions
  1. 8 1
      .claude/settings.local.json
  2. 17 1
      scrapers/centralhighlands.rb
  3. 23 2
      scrapers/derwentvalley.rb

+ 8 - 1
.claude/settings.local.json

@@ -32,7 +32,14 @@
       "WebFetch(domain:www.huonvalley.tas.gov.au)",
       "Bash(curl -s -A \"Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36\" \"https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/\")",
       "Bash(python3 -c \" import sys from html.parser import HTMLParser class P\\(HTMLParser\\): def __init__\\(self\\): super\\(\\).__init__\\(\\) self.depth = 0 self.capture = False self.tag = None def handle_starttag\\(self, tag, attrs\\): d = dict\\(attrs\\) cls = d.get\\('class',''\\) if 'accordion' in cls or 'plan-file' in cls: print\\(f'<{tag} class=\\\\\"{cls}\\\\\">'\\) self.capture = True def handle_data\\(self, data\\): if self.capture and data.strip\\(\\): print\\(f' TEXT: {data.strip\\(\\)[:120]}'\\) def handle_endtag\\(self, tag\\): if tag in \\('h2','h3','a','p','div'\\) and self.capture: self.capture = False P\\(\\).feed\\(sys.stdin.read\\(\\)\\) \")",
-      "Bash(curl -s -L -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' -H 'Accept-Language: en-AU,en;q=0.9' -H 'Accept-Encoding: gzip, deflate, br' --compressed https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp)"
+      "Bash(curl -s -L -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' -H 'Accept-Language: en-AU,en;q=0.9' -H 'Accept-Encoding: gzip, deflate, br' --compressed https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp)",
+      "Bash(curl -sv -L --max-time 15 -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' --compressed https://www.burnie.tas.gov.au)",
+      "Bash(curl -s -L --max-time 15 -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' --compressed https://www.derwentvalley.tas.gov.au/home/card-listing/development-applications)",
+      "Bash(curl -s -L --max-time 15 -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' --compressed https://www.burnie.tas.gov.au/Development/Planning/Permit-applications-on-exhibition)",
+      "Bash(curl -sv -L --max-time 15 -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36' --compressed https://www.burnie.tas.gov.au/Development/Planning/Permit-applications-on-exhibition)",
+      "Bash(curl -s --max-time 10 -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36' 'https://www.derwentvalley.tas.gov.au/home/latest-news?f.News+category%7CnewsCategory=Public+Notice')",
+      "Bash(sed -n '80,130p' f:/GIT_REPO/tas_councils/scrapers/derwentvalley.rb)",
+      "Bash(sed -n '50,75p' f:/GIT_REPO/tas_councils/scrapers/centralhighlands.rb)"
     ]
   }
 }

+ 17 - 1
scrapers/centralhighlands.rb

@@ -52,7 +52,23 @@ def extract_close_raw(text)
   ""
 end
 
-html = Http.get(URL)
+# Central Highlands Council's site has been unreachable (connection timeout).
+# DAs for this council are also published on PlanBuild (council code CEH),
+# so planbuild.rb covers this council independently.
+html = begin
+  Http.get(URL)
+rescue StandardError => e
+  Log.warn "centralhighlands", "Failed to fetch #{URL}: #{e.class} #{e.message}. DAs are available via planbuild.rb (council code CEH)."
+  puts "Done #{TABLE}. Saved 0 item(s) — site unreachable."
+  exit 0
+end
+
+if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
+  Log.warn "centralhighlands", "Site is returning a Cloudflare challenge page — cannot scrape without browser-level JS execution. DAs for this council are available via planbuild.rb (council code CEH)."
+  puts "Done #{TABLE}. Saved 0 item(s) — site blocked by Cloudflare."
+  exit 0
+end
+
 doc  = Nokogiri::HTML(html)
 
 container = doc.at_css("main, .entry-content, article") || doc

+ 23 - 2
scrapers/derwentvalley.rb

@@ -110,18 +110,39 @@ def detail_links_from_news(news_url)
   }.compact.uniq
 end
 
+def cloudflare_blocked?(html)
+  html.to_s.include?("Just a moment") || html.to_s.include?("Enable JavaScript and cookies")
+end
+
 links = []
 begin
   links = detail_links_from_list(LIST_URL)
 rescue StandardError => e
-  Log.warn "scraper", "List fetch failed, will try news listing: #{e.class} #{e.message}"
+  Log.warn "derwentvalley", "List fetch failed, will try news listing: #{e.class} #{e.message}"
 end
 
 if links.empty?
   begin
     links = detail_links_from_news(NEWS_URL)
   rescue StandardError => e
-    Log.warn "scraper", "News fetch failed: #{e.class} #{e.message}"
+    Log.warn "derwentvalley", "News fetch failed: #{e.class} #{e.message}"
+  end
+end
+
+# Both URLs return a Cloudflare JS-challenge page (HTTP 200 with challenge HTML).
+# We can't solve this without browser-level JS execution.
+# Derwent Valley DAs are also published on PlanBuild (council code DER),
+# so planbuild.rb covers this council independently.
+if links.empty?
+  begin
+    probe = Http.get(LIST_URL)
+    if cloudflare_blocked?(probe)
+      Log.warn "derwentvalley", "Site is returning a Cloudflare challenge page — cannot scrape without browser-level JS execution. DAs for this council are available via planbuild.rb (council code DER)."
+      puts "Done #{TABLE}. Saved 0 item(s) — site blocked by Cloudflare."
+      exit 0
+    end
+  rescue StandardError => e
+    Log.warn "derwentvalley", "Probe fetch failed: #{e.class} #{e.message}"
   end
 end