Bläddra i källkod

Kentish Updates

Benjamin Harris 2 månader sedan
förälder
incheckning
94d559c84f
2 ändrade filer med 13 tillägg och 2 borttagningar
  1. 2 1
      .claude/settings.local.json
  2. 11 1
      scrapers/kentish.rb

+ 2 - 1
.claude/settings.local.json

@@ -31,7 +31,8 @@
       "Bash(python3 -)",
       "Bash(python3 -)",
       "WebFetch(domain:www.huonvalley.tas.gov.au)",
       "WebFetch(domain:www.huonvalley.tas.gov.au)",
       "Bash(curl -s -A \"Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36\" \"https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/\")",
       "Bash(curl -s -A \"Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36\" \"https://www.huonvalley.tas.gov.au/development/planning/advertised-applications/\")",
-      "Bash(python3 -c \" import sys from html.parser import HTMLParser class P\\(HTMLParser\\): def __init__\\(self\\): super\\(\\).__init__\\(\\) self.depth = 0 self.capture = False self.tag = None def handle_starttag\\(self, tag, attrs\\): d = dict\\(attrs\\) cls = d.get\\('class',''\\) if 'accordion' in cls or 'plan-file' in cls: print\\(f'<{tag} class=\\\\\"{cls}\\\\\">'\\) self.capture = True def handle_data\\(self, data\\): if self.capture and data.strip\\(\\): print\\(f' TEXT: {data.strip\\(\\)[:120]}'\\) def handle_endtag\\(self, tag\\): if tag in \\('h2','h3','a','p','div'\\) and self.capture: self.capture = False P\\(\\).feed\\(sys.stdin.read\\(\\)\\) \")"
+      "Bash(python3 -c \" import sys from html.parser import HTMLParser class P\\(HTMLParser\\): def __init__\\(self\\): super\\(\\).__init__\\(\\) self.depth = 0 self.capture = False self.tag = None def handle_starttag\\(self, tag, attrs\\): d = dict\\(attrs\\) cls = d.get\\('class',''\\) if 'accordion' in cls or 'plan-file' in cls: print\\(f'<{tag} class=\\\\\"{cls}\\\\\">'\\) self.capture = True def handle_data\\(self, data\\): if self.capture and data.strip\\(\\): print\\(f' TEXT: {data.strip\\(\\)[:120]}'\\) def handle_endtag\\(self, tag\\): if tag in \\('h2','h3','a','p','div'\\) and self.capture: self.capture = False P\\(\\).feed\\(sys.stdin.read\\(\\)\\) \")",
+      "Bash(curl -s -L -A 'Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36 \\(KHTML, like Gecko\\) Chrome/127.0.0.0 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' -H 'Accept-Language: en-AU,en;q=0.9' -H 'Accept-Encoding: gzip, deflate, br' --compressed https://www.kentish.tas.gov.au/services/building-and-planning-services/planningapp)"
     ]
     ]
   }
   }
 }
 }

+ 11 - 1
scrapers/kentish.rb

@@ -130,10 +130,20 @@ end
 begin
 begin
   html = Http.get(URL)
   html = Http.get(URL)
 rescue StandardError => e
 rescue StandardError => e
-  Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
+  Log.warn "kentish", "Failed to fetch #{URL}: #{e.class} #{e.message}"
   exit 1
   exit 1
 end
 end
 
 
+# Kentish Council's site is protected by a Cloudflare JS challenge.
+# When blocked, the page title is "Just a moment..." and the page contains no DA data.
+# Note: Kentish DAs are also published on PlanBuild (council code KEN),
+# so planbuild.rb covers this council independently.
+if html.include?("Just a moment") || html.include?("Enable JavaScript and cookies")
+  Log.warn "kentish", "Site is returning a Cloudflare challenge page — cannot scrape without browser-level JS execution. DAs for this council are available via planbuild.rb (council code KEN)."
+  puts "Done #{TABLE}. Saved 0 item(s) — site blocked by Cloudflare."
+  exit 0
+end
+
 doc = Nokogiri::HTML(html)
 doc = Nokogiri::HTML(html)
 items = parse_document_list(doc, URL)
 items = parse_document_list(doc, URL)