|
|
@@ -1,54 +1,20 @@
|
|
|
-# tools/enrich.rb
|
|
|
-# Enrich DA rows AFTER scrapers:
|
|
|
-# - Geocode (address_std, street/locality/state/postcode, lat/lng)
|
|
|
-# - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
|
|
|
-#
|
|
|
-# Usage examples:
|
|
|
-# docker compose run --rm \
|
|
|
-# -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
|
|
|
-# -e LOOKUP_URL="http://web/list_lookup.php" \
|
|
|
-# scraper ruby /app/tools/enrich.rb
|
|
|
+# lib/enrich.rb
|
|
|
+# Per-row enrichment called right after each DB.upsert:
|
|
|
+# 1. Geocode (address_std, street/locality/state/postcode, lat/lng) via Google Maps
|
|
|
+# 2. Property lookup (property_id, title_reference) via LOOKUP_URL service
|
|
|
#
|
|
|
-# # Single table, slower throttle, dry run:
|
|
|
-# docker compose run --rm \
|
|
|
-# -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
|
|
|
-# -e LOOKUP_URL="http://web/list_lookup.php" \
|
|
|
-# -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
|
|
|
-# -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \
|
|
|
-# -e DRY_RUN=1 \
|
|
|
-# scraper ruby /app/tools/enrich.rb --table=da_dorset
|
|
|
+# Schema is owned by DB.ensure_table! (new tables) and lib/migrate.rb (existing tables).
|
|
|
+# Scrapers only need to call DB.ensure_table! — no separate ensure_extra_columns! required.
|
|
|
|
|
|
-# lib/enrich.rb
|
|
|
require "json"
|
|
|
require "net/http"
|
|
|
require "uri"
|
|
|
require_relative "./db"
|
|
|
-require_relative "./util"
|
|
|
require_relative "./geocode"
|
|
|
require_relative "./log"
|
|
|
|
|
|
LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
|
|
|
|
|
|
-def ensure_extra_columns!(table)
|
|
|
- DB.validate_table_name!(table)
|
|
|
- esc = DB.client.escape(table)
|
|
|
- {
|
|
|
- "address_std" => "VARCHAR(255) NULL",
|
|
|
- "lat" => "DOUBLE NULL",
|
|
|
- "lng" => "DOUBLE NULL",
|
|
|
- "property_id" => "VARCHAR(50) NULL",
|
|
|
- "title_reference" => "VARCHAR(80) NULL",
|
|
|
- "document_url" => "TEXT NULL",
|
|
|
- "local_document_url" => "TEXT NULL",
|
|
|
- "on_notice_to" => "DATE NULL",
|
|
|
- "on_notice_to_raw" => "VARCHAR(80) NULL"
|
|
|
- }.each do |col, defn|
|
|
|
- DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
|
|
|
- rescue Mysql2::Error => e
|
|
|
- Log.warn "enrich", "schema migration skipped for #{table}.#{col}: #{e.message}"
|
|
|
- end
|
|
|
-end
|
|
|
-
|
|
|
def http_post_json(url, payload, timeout: 15)
|
|
|
uri = URI.parse(url)
|
|
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
|
@@ -65,41 +31,43 @@ rescue JSON::ParserError
|
|
|
{}
|
|
|
end
|
|
|
|
|
|
-# Call this right after DB.upsert in each scraper
|
|
|
-# enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
|
|
|
+# Call this right after DB.upsert in each scraper:
|
|
|
+# enrich_after_upsert!(table: TABLE, council_reference: ref, address: addr)
|
|
|
def enrich_after_upsert!(table:, council_reference:, address:)
|
|
|
DB.validate_table_name!(table)
|
|
|
esc = DB.client.escape(table)
|
|
|
- sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
|
|
|
+ sel = DB.client.prepare(
|
|
|
+ "SELECT id, address, address_std, lat, lng, property_id, title_reference " \
|
|
|
+ "FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1"
|
|
|
+ )
|
|
|
row = sel.execute(council_reference, address).first
|
|
|
return unless row
|
|
|
|
|
|
- # 1) Geocode if missing lat/lng or std address
|
|
|
- if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
|
|
|
+ # 1) Geocode if missing lat/lng or normalised address
|
|
|
+ if row["lat"].nil? || row["lng"].nil? || row["address_std"].to_s.strip.empty?
|
|
|
begin
|
|
|
geo = Geocode.format_au(row["address"])
|
|
|
Geocode.update_da_row!(
|
|
|
- table: table,
|
|
|
+ table: table,
|
|
|
council_reference: council_reference,
|
|
|
- orig_address: row["address"],
|
|
|
- geo: geo
|
|
|
+ orig_address: row["address"],
|
|
|
+ geo: geo
|
|
|
)
|
|
|
Log.debug "enrich", "geocoded #{table} #{council_reference}"
|
|
|
- # refresh row to fetch lat/lng for next step
|
|
|
row = sel.execute(council_reference, address).first
|
|
|
- rescue => e
|
|
|
- warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
|
|
|
+ rescue StandardError => e
|
|
|
+ Log.warn "enrich", "geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
|
|
|
end
|
|
|
end
|
|
|
|
|
|
- # 2) LIST lookup only if we have coords and something’s missing
|
|
|
+ # 2) Property lookup — only if coords exist and pid/title are missing
|
|
|
need_pid = row["property_id"].to_s.strip.empty?
|
|
|
need_title = row["title_reference"].to_s.strip.empty?
|
|
|
if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
|
|
|
begin
|
|
|
resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
|
|
|
if resp["ok"]
|
|
|
- pid = (resp["pid"] || "").to_s
|
|
|
+ pid = (resp["pid"] || "").to_s
|
|
|
title = (resp["title_id"] || "").to_s
|
|
|
upd = DB.client.prepare(
|
|
|
"UPDATE `#{esc}` SET " \
|
|
|
@@ -116,7 +84,7 @@ def enrich_after_upsert!(table:, council_reference:, address:)
|
|
|
else
|
|
|
Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
|
|
|
end
|
|
|
- rescue => e
|
|
|
+ rescue StandardError => e
|
|
|
Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
|
|
|
end
|
|
|
end
|