Bläddra i källkod

update to Dorset Scraper

Benjamin Harris 2 månader sedan
förälder
incheckning
ee38435caa
1 ändrade filer med 100 tillägg och 304 borttagningar
  1. 100 304
      scrapers/dorset.rb

+ 100 - 304
scrapers/dorset.rb

@@ -1,340 +1,136 @@
-# scrapers/dorset.rb
+# Dorset Council — Advertised Development Applications
+#
+# Source: https://www.dorset.tas.gov.au/online-development-application-enquiry
+#
+# Page structure — each application is a <p><a href="PDF_URL">text</a></p>:
+#
+#   PLA/2026/22: Residential dwelling and carport addition - Chris Triebe
+#                and Associates Town Planning Services - 13 Gladstone Road
+#                Herrick - Closes 18.04.2026
+#
+# Text format:  REF: DESCRIPTION - APPLICANT - ADDRESS - Closes DD.MM.YYYY
+#
+# Note: the old eServices portal (eservices.dorset.tas.gov.au) is still live
+# and was the previous data source. The council now publishes the advertised
+# list on their main website with direct PDF links, which is simpler to scrape.
+
 require "date"
 require "nokogiri"
 require "uri"
 require "fileutils"
 
-require_relative "../lib/enrich"
-require_relative "../lib/log"
+require_relative "../lib/scraper_helpers"
 require_relative "../lib/util"
-TABLE = ENV.fetch("TABLE_NAME")
-BASE_HTTPS = "https://eservices.dorset.tas.gov.au"
-BASE_HTTP  = "http://eservices.dorset.tas.gov.au"
-
-# Pick one
-LIST_URL = "#{BASE_HTTPS}/eservice/dialog/daEnquiry/currentlyAdvertised.do?function_id=521&nodeNum=19534"
-#LIST_URL = "#{BASE_HTTPS}/eservice/daEnquiry/recentlyDetermined.do?num_days=900&nodeNum=19535"
+require_relative "../lib/log"
 
+TABLE                = ENV.fetch("TABLE_NAME")
+URL                  = "https://www.dorset.tas.gov.au/online-development-application-enquiry"
 DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
 DOWNLOAD_DIR         = ENV["DOWNLOAD_DIR"] || "/app/downloads"
 
 DB.ensure_table!(TABLE)
 
-def abs_url(href)
-  return "" if href.to_s.strip.empty?
-  URI.join(BASE_HTTPS, href).to_s
-rescue URI::InvalidURIError
-  href.to_s
-end
-
-def dorset_get(jar, url)
-  [BASE_HTTPS, BASE_HTTP].each do |base|
-    begin
-      Http.request(URI.parse("#{base}/"), headers: {}, jar: jar)
-      Http.request(URI.parse("#{base}/eservice/"), headers: {}, jar: jar, referer: "#{base}/")
-      tgt = URI.parse(url.sub(%r{\Ahttps?://[^/]+}, base))
-      res = Http.request(tgt, headers: {}, jar: jar, referer: "#{base}/eservice/")
-      if res.is_a?(Net::HTTPRedirection) && res["location"]
-        res = Http.request(URI.join(tgt.to_s, res["location"]), headers: {}, jar: jar, referer: "#{base}/eservice/")
-      end
-      return res if res.is_a?(Net::HTTPSuccess)
-    rescue OpenSSL::SSL::SSLError, EOFError, Errno::ECONNRESET, Net::ReadTimeout, Net::OpenTimeout
-      next
-    end
-  end
-  raise "Dorset fetch failed for #{url}"
-end
-
-def parse_list(html)
-  doc = Nokogiri::HTML(html)
-  out = []
-  doc.css("h4.non_table_headers a").each do |a|
-    address = a.text.to_s.strip
-    href    = a["href"].to_s
-    entry   = a.ancestors("h4").first&.next_element
-
-    description       = ""
-    date_received_raw = ""
-    council_reference = ""
-    applicant         = ""
-    owner             = ""
-
-    if entry
-      entry.css(".rowDataOnly").each do |p|
-        spans = p.css("span")
-        next unless spans.length == 2
-        key = spans[0].text.to_s.strip
-        val = spans[1].text.to_s.strip
-        case key
-        when "Type of Work"     then description       = val
-        when "Date Lodged"      then date_received_raw = val
-        when "Application No."  then council_reference = val
-        when "Applicant"        then applicant         = val
-        when "Owner"            then owner             = val
-        end
-      end
-    end
-
-    lodged_dt = Util.parse_aus_date(date_received_raw)
-    on_to_dt  = lodged_dt ? (lodged_dt + 14) : nil
-
-    out << {
-      address:            address,
-      detail_href:        href,
-      description:        description.empty? ? "Development Application" : description,
-      date_received_raw:  date_received_raw,
-      date_received:      lodged_dt,
-      on_notice_to:       on_to_dt,
-      on_notice_to_raw:   on_to_dt ? on_to_dt.strftime("%Y-%m-%d") : "",
-      council_reference:  council_reference,
-      applicant:          applicant,
-      owner:              owner
-    }
-  end
-  out
-end
-
-def extract_doc_links(detail_html)
-  doc = Nokogiri::HTML(detail_html)
-  links = []
-  t = doc.css('table[summary]').find { |tbl|
-    tbl["summary"].to_s.downcase.include?("electronic document")
-  }
-  if t
-    links += t.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s }
-  end
-  links = doc.css('a[href*="getElectronicDocumentContents.do"]').map { |a| a["href"].to_s } if links.empty?
-  links.map { |h| abs_url(h) }.uniq
-end
-
-# --- new: parse the tasks/milestones table ---
-def parse_tasks(detail_html)
-  doc = Nokogiri::HTML(detail_html)
-  t = doc.css('table[summary]').find { |tbl|
-    tbl["summary"].to_s.downcase.include?("tasks associated")
-  }
-  return [] unless t
-
-  out = []
-  t.css("tr")[1..]&.each do |tr|
-    tds = tr.css("td")
-    next if tds.empty?
-    stage_desc    = tds[1]&.text.to_s.strip
-    opened_raw    = tds[2]&.text.to_s.strip
-    target_raw    = tds[3]&.text.to_s.strip
-    completed_raw = tds[4]&.text.to_s.strip
-    status        = tds[5]&.text.to_s.strip
-
-    out << {
-      stage_description: stage_desc,
-      opened_raw: opened_raw,
-      opened_date: Util.parse_aus_date(opened_raw),
-      target_raw: target_raw,
-      target_date: Util.parse_aus_date(target_raw),
-      completed_raw: completed_raw,
-      completed_date: Util.parse_aus_date(completed_raw),
-      status: status
-    }
-  end
-  out
-end
-
-def ensure_stages_table!(table)
-  tn = "#{table}_stages"
-  DB.client.query(<<~SQL)
-    CREATE TABLE IF NOT EXISTS `#{DB.client.escape(tn)}` (
-      id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
-      council_reference VARCHAR(100) NOT NULL,
-      address VARCHAR(255) NOT NULL,
-      stage_description VARCHAR(255) NOT NULL,
-      opened_date DATE NULL,
-      opened_raw VARCHAR(50) NULL,
-      target_date DATE NULL,
-      target_raw VARCHAR(50) NULL,
-      completed_date DATE NULL,
-      completed_raw VARCHAR(50) NULL,
-      status VARCHAR(100) NULL,
-      created_at DATETIME NOT NULL,
-      updated_at DATETIME NOT NULL,
-      PRIMARY KEY (id),
-      UNIQUE KEY uniq_stage (council_reference, address, stage_description, opened_raw)
-    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
-  SQL
-end
-
-def save_stages(table, ref, addr, stages)
-  return if stages.empty?
-  tn = "#{table}_stages"
-  ensure_stages_table!(table)
-
-  DB.client.prepare("DELETE FROM `#{DB.client.escape(tn)}` WHERE council_reference = ? AND address = ?")
-          .execute(ref, addr)
-
-  ins = DB.client.prepare(<<~SQL)
-    INSERT INTO `#{DB.client.escape(tn)}`
-      (council_reference, address, stage_description,
-       opened_date, opened_raw, target_date, target_raw,
-       completed_date, completed_raw, status, created_at, updated_at)
-    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())
-  SQL
-
-  stages.each do |s|
-    ins.execute(
-      ref, addr, s[:stage_description][0,255],
-      s[:opened_date], s[:opened_raw][0,50],
-      s[:target_date], s[:target_raw][0,50],
-      s[:completed_date], s[:completed_raw][0,50],
-      s[:status][0,100]
-    )
-  end
-end
+# Matches a council reference such as "PLA/2026/22" anywhere in the link text
+# (case-insensitive): "PLA", a 4-digit year, then the application number.
+REF_RX   = /\bPLA\/\d{4}\/\d+\b/i
+# Matches the "Closes DD.MM.YYYY" marker; capture 1 is the date text only.
+# Day and month may be 1-2 digits and the separators may be "." or "-".
+CLOSE_RX = /\bCloses\s+(\d{1,2}[.\-]\d{1,2}[.\-]\d{4})\b/i
 
 def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
 
-def id_from_url(u)
-  uri = URI.parse(u)
-  q   = uri.query.to_s
-  q[/\bid=([^&]+)/, 1] || File.basename(uri.path)
-rescue URI::InvalidURIError
-  nil
-end
+# Downloads the PDF at +url+ into DOWNLOAD_DIR/dorset/<sanitised reference>/.
+#
+# Returns the "/files/dorset/..." path of the saved file, or nil when +url+ is
+# blank or any StandardError is raised along the way (the failure is logged
+# through Log.warn rather than re-raised).
+# NOTE(review): assumes Http.get returns the response body as a String —
+# confirm against the Http helper.
+def download_pdf(url, council_reference)
+  return nil if url.to_s.strip.empty?
 
-def filename_from_response(res, fallback_id)
-  cd = res["content-disposition"].to_s
-  if cd =~ /filename\*?=(?:UTF-8''|")?([^\";]+)/
-    return safe_name($1)
-  end
-  base = safe_name(fallback_id || "document")
-  ct = res["content-type"].to_s
-  ext = ct.include?("pdf") ? ".pdf" : ".bin"
-  "#{base}#{ext}"
-end
-
-def download_all(urls, jar, council_reference)
-  return [] if urls.empty?
   dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
   FileUtils.mkdir_p(dir)
-  saved = []
-  first_web_rel = nil
 
-  urls.each_with_index do |u, i|
-    begin
-      res  = dorset_get(jar, u)
-      body = res.body.to_s
-      fid  = id_from_url(u) || "file#{i+1}"
-      name = filename_from_response(res, fid)
-      path = File.join(dir, name)
-      bytes = File.binwrite(path, body)
-      puts "  saved #{path} (#{bytes} bytes)"
-      saved << path
-      first_web_rel ||= "/files/dorset/#{safe_name(council_reference)}/#{File.basename(path)}"
-    rescue StandardError => e
-      Log.warn "scraper", "Download failed for #{u}: #{e.class} #{e.message}"
-    end
-  end
+  fname = safe_name(File.basename(URI.parse(url).path))
+  fname = "document.pdf" if fname.empty?
+  path  = File.join(dir, fname)
 
-  if first_web_rel
-    begin
-      DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET local_document_url = ? WHERE council_reference = ?")
-               .execute(first_web_rel, council_reference)
-    rescue StandardError => e
-      Log.warn "scraper", "Failed to set local_document_url for #{council_reference}: #{e.class} #{e.message}"
-    end
-  end
+  body = Http.get(url, headers: { "Accept" => "application/pdf,*/*", "Referer" => URL })
+  File.binwrite(path, body)
+  puts "  saved #{fname} (#{body.bytesize} bytes)"
 
-  saved
+  "/files/dorset/#{safe_name(council_reference)}/#{fname}"
+rescue StandardError => e
+  Log.warn "dorset", "Download failed for #{url}: #{e.class} #{e.message}"
+  nil
 end
 
-puts "Fetching Dorset list…"
-
-jar        = {}
-list_res   = dorset_get(jar, LIST_URL)
-list_html  = list_res.body
-list_items = parse_list(list_html)
-
-puts "Found #{list_items.length} items for #{TABLE}"
-
-list_items.each do |r|
-  next if r[:council_reference].to_s.strip.empty? || r[:address].to_s.strip.empty?
-
-  detail_url  = abs_url(r[:detail_href])
-  doc_urls    = []
-  stages      = []
-  saved_paths = []
+html = Http.get(URL)
+doc  = Nokogiri::HTML(html)
+items = []
 
-  if !detail_url.empty?
-    begin
-      detail_res  = dorset_get(jar, detail_url)
-      detail_html = detail_res.body
+doc.css("p a[href]").each do |a|
+  text = a.text.gsub(/[[:space:]]+/, " ").strip
+  next unless (ref_m = text.match(REF_RX))
 
-      # documents
-      doc_urls    = extract_doc_links(detail_html)
-      saved_paths = DOWNLOAD_ATTACHMENTS ? download_all(doc_urls, jar, r[:council_reference]) : []
+  ref = ref_m[0]
 
-      # stages
-      stages = parse_tasks(detail_html)
+  # Strip "PLA/YYYY/NNN: " prefix
+  remainder = text.sub(/\A#{Regexp.escape(ref)}:\s*/i, "")
 
-      # prefer Advertising/Public Notif dates if they exist
-      if r[:on_notice_to].nil?
-        adv = stages.find { |s| s[:stage_description].downcase.include?("advertising") || s[:stage_description].downcase.include?("public notif") }
-        if adv
-          r[:on_notice_to]     = adv[:completed_date] || adv[:target_date]
-          r[:on_notice_to_raw] = adv[:completed_raw]  || adv[:target_raw]
-        end
-      end
-    rescue StandardError => e
-      Log.warn "scraper", "Detail fetch failed for #{detail_url}: #{e.class} #{e.message}"
-    end
+  # Extract the closing date and strip the " - Closes ..." segment from the end.
+  # CLOSE_RX accepts both "." and "-" as separators, so the captured text is
+  # normalised to "." before parsing; otherwise "18-04-2026" would match the
+  # regex but never parse, silently leaving on_notice_to as nil.
+  close_raw    = ""
+  on_notice_to = nil
+  if (close_m = remainder.match(CLOSE_RX))
+    close_raw    = close_m[1]
+    on_notice_to = begin
+      Date.strptime(close_raw.tr("-", "."), "%d.%m.%Y")
+    rescue ArgumentError
+      # Well-formed but impossible date (e.g. 31.02.2026): keep only the raw text.
+      nil
+    end
+    remainder    = remainder.sub(/\s*-\s*#{Regexp.escape(close_m[0])}\s*\z/i, "").strip
   end
 
-  representative = DOWNLOAD_ATTACHMENTS ? saved_paths.first.to_s : doc_urls.first.to_s
-
-  # geocode
-  geo = nil
-  begin
-    geo = Geocode.format_au(r[:address])
-  rescue StandardError => e
-    Log.warn "scraper", "Geocode error for #{r[:council_reference]}: #{e.class} #{e.message}"
+  # Remaining text: "Description - Applicant - Address"
+  # Split on " - "; last part = address, second-to-last = applicant, rest = description
+  # NOTE(review): the split cannot tell an applicant or address that itself
+  # contains " - " from a field boundary; such entries would shift fields.
+  parts = remainder.split(/\s+-\s+/)
+  if parts.length >= 3
+    # Any extra leading parts belong to the description and are re-joined.
+    address     = parts.last.strip
+    applicant   = parts[-2].strip
+    description = parts[0..-3].join(" - ").strip
+  elsif parts.length == 2
+    # Only two parts: treated as "Description - Address" with no applicant.
+    address     = parts.last.strip
+    applicant   = ""
+    description = parts.first.strip
+  else
+    # No separator at all: the whole remainder is taken as the address.
+    address     = remainder.strip
+    applicant   = ""
+    description = "Development Application"
   end
-  
-  council_reference = r[:council_reference][0,100]
-  address			= r[:address][0,255]
 
-  # upsert main row
-  DB.upsert(TABLE, {
-    description:        r[:description],
-    date_received:      r[:date_received],
-    date_received_raw:  r[:date_received_raw],
-    on_notice_to:       r[:on_notice_to],
-    on_notice_to_raw:   r[:on_notice_to_raw],
-    address:            address,
-    council_reference:  council_reference,
-    applicant:          r[:applicant],
-    owner:              r[:owner]
-  })
+  next if address.empty?
+  description = "Development Application" if description.empty?
 
-  enrich_after_upsert!(
-  table: TABLE,
-  council_reference: council_reference,
-  address: address
-)
-  
-  tn  = DB.client.escape(TABLE)
-sql = %Q{
-  SELECT address_std, lat, lng
-  FROM `#{tn}`
-  WHERE council_reference = ? AND address = ?
-  LIMIT 1
-}
-begin
-  row = DB.client.prepare(sql).execute(council_reference, address).first
-  puts "  enriched -> #{row ? row.inspect : 'nil'}"
-rescue StandardError => e
-  Log.warn "scraper", "  enriched probe failed: #{e.class} #{e.message}"
-end
+  pdf_url = abs_url(URL, a["href"].to_s.strip)
 
-  puts "Upserted #{r[:council_reference]} -> #{r[:address]}  docs: #{doc_urls.length} saved: #{saved_paths.length} stages: #{stages.length}"
+  items << {
+    council_reference: ref,
+    address:           address,
+    description:       description,
+    applicant:         applicant,
+    on_notice_to:      on_notice_to,
+    on_notice_to_raw:  close_raw,
+    document_url:      pdf_url
+  }
 end
 
+puts "Found #{items.length} item(s) for #{TABLE}"
+
+# Persist each parsed application, optionally mirroring its PDF locally first.
+items.each do |r|
+  # download_pdf returns nil on failure, so local_url is nil both when
+  # downloads are disabled and when the fetch did not succeed.
+  local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil
+
+  # NOTE(review): upsert_and_enrich! is not defined in this file — presumably
+  # provided by ../lib/scraper_helpers; confirm how `extras` is stored.
+  upsert_and_enrich!(
+    table: TABLE,
+    row: {
+      council_reference: r[:council_reference],
+      address:           r[:address],
+      description:       r[:description],
+      applicant:         r[:applicant],
+      on_notice_to:      r[:on_notice_to],
+      on_notice_to_raw:  r[:on_notice_to_raw],
+      # The listing text has no owner field, so owner is written as blank.
+      owner:             ""
+    },
+    extras: {
+      document_url:       r[:document_url],
+      local_document_url: local_url
+    }
+  )
+end
 
-puts "Done #{TABLE}."
+puts "Done #{TABLE}. Saved #{items.length} item(s)."