| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- # tools/enrich.rb
- # Enrich DA rows AFTER scrapers:
- # - Geocode (address_std, street/locality/state/postcode, lat/lng)
- # - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
- #
- # Usage examples:
- # docker compose run --rm \
- # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
- # -e LOOKUP_URL="http://web/list_lookup.php" \
- # scraper ruby /app/tools/enrich.rb
- #
- # # Single table, slower throttle, dry run:
- # docker compose run --rm \
- # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
- # -e LOOKUP_URL="http://web/list_lookup.php" \
- # -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
- # -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \
- # -e DRY_RUN=1 \
- # scraper ruby /app/tools/enrich.rb --table=da_dorset
- # lib/enrich.rb
- require "json"
- require "net/http"
- require "uri"
- require_relative "./db"
- require_relative "./util"
- require_relative "./geocode"
- LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
- ENRICH_DEBUG = ENV["ENRICH_DEBUG"] == "1"
- def log_enrich(msg)
- puts msg if ENRICH_DEBUG
- end
- def ensure_extra_columns!(table)
- DB.validate_table_name!(table)
- esc = DB.client.escape(table)
- {
- "address_std" => "VARCHAR(255) NULL",
- "lat" => "DOUBLE NULL",
- "lng" => "DOUBLE NULL",
- "property_id" => "VARCHAR(50) NULL",
- "title_reference" => "VARCHAR(80) NULL",
- "local_document_url" => "TEXT NULL",
- "on_notice_to" => "DATE NULL",
- "on_notice_to_raw" => "VARCHAR(80) NULL"
- }.each do |col, defn|
- DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
- rescue Mysql2::Error => e
- warn "[enrich] schema migration skipped for #{table}.#{col}: #{e.message}"
- end
- end
- def http_post_json(url, payload, timeout: 15)
- uri = URI.parse(url)
- http = Net::HTTP.new(uri.host, uri.port)
- http.use_ssl = (uri.scheme == "https")
- http.read_timeout = timeout
- http.open_timeout = timeout
- req = Net::HTTP::Post.new(uri.request_uri)
- req["Content-Type"] = "application/json"
- req.body = JSON.generate(payload)
- res = http.request(req)
- raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
- JSON.parse(res.body)
- rescue JSON::ParserError
- {}
- end
- # Call this right after DB.upsert in each scraper
- # enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
- def enrich_after_upsert!(table:, council_reference:, address:)
- ensure_extra_columns!(table)
- esc = DB.client.escape(table)
- sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
- row = sel.execute(council_reference, address).first
- return unless row
- # 1) Geocode if missing lat/lng or std address
- if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
- begin
- geo = Geocode.format_au(row["address"])
- Geocode.update_da_row!(
- table: table,
- council_reference: council_reference,
- orig_address: row["address"],
- geo: geo
- )
- log_enrich("enrich: geocoded #{table} #{council_reference}")
- # refresh row to fetch lat/lng for next step
- row = sel.execute(council_reference, address).first
- rescue => e
- warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
- end
- end
- # 2) LIST lookup only if we have coords and something’s missing
- need_pid = row["property_id"].to_s.strip.empty?
- need_title = row["title_reference"].to_s.strip.empty?
- if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
- begin
- resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
- if resp["ok"]
- pid = (resp["pid"] || "").to_s
- title = (resp["title_id"] || "").to_s
- upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?")
- upd.execute(pid, title, council_reference, address)
- log_enrich("enrich: lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}")
- else
- warn "[enrich] lookup error #{table} #{council_reference}: #{resp["error"]}"
- end
- rescue => e
- warn "[enrich] lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
- end
- end
- end
|