Răsfoiți Sursa

Update Enrich Scraper

Benjamin Harris 2 luni în urmă
părinte
comite
ccb976617c
4 a modificat fișierele cu 12 adăugiri și 122 ștergeri
  1. 3 1
      .claude/settings.local.json
  2. 9 3
      CLAUDE.md
  3. 0 118
      scrapers/enrich.rb
  4. 0 0
      tools/backfill_pid_title.php

+ 3 - 1
.claude/settings.local.json

@@ -9,7 +9,9 @@
       "Bash(ruby test/test_util.rb)",
       "Bash(docker compose:*)",
       "Bash(python3)",
-      "Bash(python3 -c ':*)"
+      "Bash(python3 -c ':*)",
+      "Bash(grep -n \"FROM \\\\`{\" /f/GIT_REPO/tas_councils/web/index.php)",
+      "Bash(grep -n '\\\\$t\\\\|tableHasColumn\\\\|tableExists' /f/GIT_REPO/tas_councils/web/index.php)"
     ]
   }
 }

+ 9 - 3
CLAUDE.md

@@ -37,10 +37,16 @@ TABLE_NAME=da_brighton DEBUG=1 ruby scrapers/brighton.rb
 # Run a subset
 ONLY=meandervalley,kent docker compose run --rm scraper /app/run_all.sh
 
-# Geocode backfill
+# Geocode backfill (batch, all tables)
 docker compose run --rm \
   -e GOOGLE_MAPS_API_KEY="..." \
-  scraper ruby /app/tools/enrich.rb --table=da_brighton
+  scraper ruby /app/tools/backfill_geocode.rb
+
+# Geocode backfill (single table)
+docker compose run --rm \
+  -e GOOGLE_MAPS_API_KEY="..." \
+  -e ONLY_TABLE=da_brighton \
+  scraper ruby /app/tools/backfill_geocode.rb
 ```
 
 ---
@@ -97,7 +103,7 @@ To add a new scraper, copy a structurally similar one (e.g. `glamorgan.rb` for t
 
 - MariaDB 10.11, `utf8mb4` encoding throughout
 - Schema is created on-the-fly — `CREATE TABLE IF NOT EXISTS` + `ALTER TABLE ... ADD COLUMN IF NOT EXISTS`
-- There is no migration framework; schema changes go in `lib/db.rb` (`ensure_table!`) or `lib/enrich.rb` (`ensure_extra_columns!`)
+- Schema changes go in `lib/migrate.rb` (new migration at end of `MIGRATIONS` array) or `lib/db.rb` (`ensure_table!`) for columns every new table gets
 - The `geo_cache` table stores geocoding results keyed by SHA1 of the normalised query string — avoids redundant Google API calls
 - The `UNIQUE KEY uniq_ref_addr (council_reference, address)` constraint drives the upsert behaviour
 

+ 0 - 118
scrapers/enrich.rb

@@ -1,118 +0,0 @@
-# tools/enrich.rb
-# Enrich DA rows AFTER scrapers:
-#  - Geocode (address_std, street/locality/state/postcode, lat/lng)
-#  - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
-#
-# Usage examples:
-#   docker compose run --rm \
-#     -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
-#     -e LOOKUP_URL="http://web/list_lookup.php" \
-#     scraper ruby /app/tools/enrich.rb
-#
-#   # Single table, slower throttle, dry run:
-#   docker compose run --rm \
-#     -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
-#     -e LOOKUP_URL="http://web/list_lookup.php" \
-#     -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
-#     -e LOOKUP_LIMIT=200  -e LOOKUP_THROTTLE_MS=250 \
-#     -e DRY_RUN=1 \
-#     scraper ruby /app/tools/enrich.rb --table=da_dorset
-
-# lib/enrich.rb
-require "json"
-require "net/http"
-require "uri"
-require_relative "./db"
-require_relative "./util"
-require_relative "./geocode"
-require_relative "./migrate"
-require_relative "./log"
-
-# Ensure all da_* tables have the expected columns before enriching.
-Migrate.run!
-
-LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
-
-def ensure_extra_columns!(table)
-  DB.validate_table_name!(table)
-  esc = DB.client.escape(table)
-  {
-    "address_std"        => "VARCHAR(255) NULL",
-    "lat"                => "DOUBLE NULL",
-    "lng"                => "DOUBLE NULL",
-    "property_id"        => "VARCHAR(50) NULL",
-    "title_reference"    => "VARCHAR(80) NULL",
-    "document_url"       => "TEXT NULL",
-    "local_document_url" => "TEXT NULL",
-    "on_notice_to"       => "DATE NULL",
-    "on_notice_to_raw"   => "VARCHAR(80) NULL"
-  }.each do |col, defn|
-    DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
-  rescue Mysql2::Error => e
-    Log.warn "enrich", "schema migration skipped for #{table}.#{col}: #{e.message}"
-  end
-end
-
-def http_post_json(url, payload, timeout: 15)
-  uri = URI.parse(url)
-  http = Net::HTTP.new(uri.host, uri.port)
-  http.use_ssl = (uri.scheme == "https")
-  http.read_timeout = timeout
-  http.open_timeout = timeout
-  req = Net::HTTP::Post.new(uri.request_uri)
-  req["Content-Type"] = "application/json"
-  req.body = JSON.generate(payload)
-  res = http.request(req)
-  raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
-  JSON.parse(res.body)
-rescue JSON::ParserError
-  {}
-end
-
-# Call this right after DB.upsert in each scraper
-#   enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
-def enrich_after_upsert!(table:, council_reference:, address:)
-  DB.validate_table_name!(table)
-  esc = DB.client.escape(table)
-  sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
-  row = sel.execute(council_reference, address).first
-  return unless row
-
-  # 1) Geocode if missing lat/lng or std address
-  if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
-    begin
-      geo = Geocode.format_au(row["address"])
-      Geocode.update_da_row!(
-        table: table,
-        council_reference: council_reference,
-        orig_address: row["address"],
-        geo: geo
-      )
-      Log.debug "enrich", "geocoded #{table} #{council_reference}"
-      # refresh row to fetch lat/lng for next step
-      row = sel.execute(council_reference, address).first
-    rescue => e
-      warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
-    end
-  end
-
-  # 2) LIST lookup only if we have coords and something’s missing
-  need_pid   = row["property_id"].to_s.strip.empty?
-  need_title = row["title_reference"].to_s.strip.empty?
-  if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
-    begin
-      resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
-      if resp["ok"]
-        pid   = (resp["pid"] || "").to_s
-        title = (resp["title_id"] || "").to_s
-        upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,''), property_id), title_reference = COALESCE(NULLIF(?,''), title_reference) WHERE council_reference = ? AND address = ?")
-        upd.execute(pid, title, council_reference, address)
-        Log.debug "enrich", "lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}"
-      else
-        Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
-      end
-    rescue => e
-      Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
-    end
-  end
-end

+ 0 - 0
web/backfill_pid_title.php → tools/backfill_pid_title.php