# tools/enrich.rb # Enrich DA rows AFTER scrapers: # - Geocode (address_std, street/locality/state/postcode, lat/lng) # - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha) # # Usage examples: # docker compose run --rm \ # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \ # -e LOOKUP_URL="http://web/list_lookup.php" \ # scraper ruby /app/tools/enrich.rb # # # Single table, slower throttle, dry run: # docker compose run --rm \ # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \ # -e LOOKUP_URL="http://web/list_lookup.php" \ # -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \ # -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \ # -e DRY_RUN=1 \ # scraper ruby /app/tools/enrich.rb --table=da_dorset # lib/enrich.rb require "json" require "net/http" require "uri" require_relative "./db" require_relative "./util" require_relative "./geocode" LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php ENRICH_DEBUG = ENV["ENRICH_DEBUG"] == "1" def log_enrich(msg) puts msg if ENRICH_DEBUG end def ensure_extra_columns!(table) esc = DB.client.escape(table) { "address_std" => "VARCHAR(255) NULL", "lat" => "DOUBLE NULL", "lng" => "DOUBLE NULL", "property_id" => "VARCHAR(50) NULL", "title_reference" => "VARCHAR(80) NULL", "local_document_url" => "TEXT NULL", "on_notice_to" => "DATE NULL", "on_notice_to_raw" => "VARCHAR(80) NULL" }.each do |col, defn| DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}") rescue Mysql2::Error => e warn "[enrich] schema migration skipped for #{table}.#{col}: #{e.message}" end end def http_post_json(url, payload, timeout: 15) uri = URI.parse(url) http = Net::HTTP.new(uri.host, uri.port) http.use_ssl = (uri.scheme == "https") http.read_timeout = timeout http.open_timeout = timeout req = Net::HTTP::Post.new(uri.request_uri) req["Content-Type"] = "application/json" req.body = JSON.generate(payload) res = http.request(req) raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess) JSON.parse(res.body) rescue JSON::ParserError {} end # Call this right after DB.upsert in each scraper # enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address) def enrich_after_upsert!(table:, council_reference:, address:) ensure_extra_columns!(table) esc = DB.client.escape(table) sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1") row = sel.execute(council_reference, address).first return unless row # 1) Geocode if missing lat/lng or std address if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?) begin geo = Geocode.format_au(row["address"]) Geocode.update_da_row!( table: table, council_reference: council_reference, orig_address: row["address"], geo: geo ) log_enrich("enrich: geocoded #{table} #{council_reference}") # refresh row to fetch lat/lng for next step row = sel.execute(council_reference, address).first rescue => e warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}" end end # 2) LIST lookup only if we have coords and something’s missing need_pid = row["property_id"].to_s.strip.empty? need_title = row["title_reference"].to_s.strip.empty? if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title) begin resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] }) if resp["ok"] pid = (resp["pid"] || "").to_s title = (resp["title_id"] || "").to_s upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?") upd.execute(pid, title, council_reference, address) log_enrich("enrich: lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}") else warn "[enrich] lookup error #{table} #{council_reference}: #{resp["error"]}" end rescue => e warn "[enrich] lookup failed #{table} #{council_reference}: #{e.class} #{e.message}" end end end