|
@@ -1,118 +0,0 @@
|
|
|
-# tools/enrich.rb
|
|
|
|
|
-# Enrich DA rows AFTER scrapers:
|
|
|
|
|
-# - Geocode (address_std, street/locality/state/postcode, lat/lng)
|
|
|
|
|
-# - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
|
|
|
|
|
-#
|
|
|
|
|
-# Usage examples:
|
|
|
|
|
-# docker compose run --rm \
|
|
|
|
|
-# -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
|
|
|
|
|
-# -e LOOKUP_URL="http://web/list_lookup.php" \
|
|
|
|
|
-# scraper ruby /app/tools/enrich.rb
|
|
|
|
|
-#
|
|
|
|
|
-# # Single table, slower throttle, dry run:
|
|
|
|
|
-# docker compose run --rm \
|
|
|
|
|
-# -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
|
|
|
|
|
-# -e LOOKUP_URL="http://web/list_lookup.php" \
|
|
|
|
|
-# -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
|
|
|
|
|
-# -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \
|
|
|
|
|
-# -e DRY_RUN=1 \
|
|
|
|
|
-# scraper ruby /app/tools/enrich.rb --table=da_dorset
|
|
|
|
|
-
|
|
|
|
|
-# lib/enrich.rb
|
|
|
|
|
-require "json"
|
|
|
|
|
-require "net/http"
|
|
|
|
|
-require "uri"
|
|
|
|
|
-require_relative "./db"
|
|
|
|
|
-require_relative "./util"
|
|
|
|
|
-require_relative "./geocode"
|
|
|
|
|
-require_relative "./migrate"
|
|
|
|
|
-require_relative "./log"
|
|
|
|
|
-
|
|
|
|
|
-# Ensure all da_* tables have the expected columns before enriching.
|
|
|
|
|
-Migrate.run!
|
|
|
|
|
-
|
|
|
|
|
-LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
|
|
|
|
|
-
|
|
|
|
|
-def ensure_extra_columns!(table)
|
|
|
|
|
- DB.validate_table_name!(table)
|
|
|
|
|
- esc = DB.client.escape(table)
|
|
|
|
|
- {
|
|
|
|
|
- "address_std" => "VARCHAR(255) NULL",
|
|
|
|
|
- "lat" => "DOUBLE NULL",
|
|
|
|
|
- "lng" => "DOUBLE NULL",
|
|
|
|
|
- "property_id" => "VARCHAR(50) NULL",
|
|
|
|
|
- "title_reference" => "VARCHAR(80) NULL",
|
|
|
|
|
- "document_url" => "TEXT NULL",
|
|
|
|
|
- "local_document_url" => "TEXT NULL",
|
|
|
|
|
- "on_notice_to" => "DATE NULL",
|
|
|
|
|
- "on_notice_to_raw" => "VARCHAR(80) NULL"
|
|
|
|
|
- }.each do |col, defn|
|
|
|
|
|
- DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
|
|
|
|
|
- rescue Mysql2::Error => e
|
|
|
|
|
- Log.warn "enrich", "schema migration skipped for #{table}.#{col}: #{e.message}"
|
|
|
|
|
- end
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-def http_post_json(url, payload, timeout: 15)
|
|
|
|
|
- uri = URI.parse(url)
|
|
|
|
|
- http = Net::HTTP.new(uri.host, uri.port)
|
|
|
|
|
- http.use_ssl = (uri.scheme == "https")
|
|
|
|
|
- http.read_timeout = timeout
|
|
|
|
|
- http.open_timeout = timeout
|
|
|
|
|
- req = Net::HTTP::Post.new(uri.request_uri)
|
|
|
|
|
- req["Content-Type"] = "application/json"
|
|
|
|
|
- req.body = JSON.generate(payload)
|
|
|
|
|
- res = http.request(req)
|
|
|
|
|
- raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
|
|
|
|
|
- JSON.parse(res.body)
|
|
|
|
|
-rescue JSON::ParserError
|
|
|
|
|
- {}
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-# Call this right after DB.upsert in each scraper
|
|
|
|
|
-# enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
|
|
|
|
|
-def enrich_after_upsert!(table:, council_reference:, address:)
|
|
|
|
|
- DB.validate_table_name!(table)
|
|
|
|
|
- esc = DB.client.escape(table)
|
|
|
|
|
- sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
|
|
|
|
|
- row = sel.execute(council_reference, address).first
|
|
|
|
|
- return unless row
|
|
|
|
|
-
|
|
|
|
|
- # 1) Geocode if missing lat/lng or std address
|
|
|
|
|
- if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
|
|
|
|
|
- begin
|
|
|
|
|
- geo = Geocode.format_au(row["address"])
|
|
|
|
|
- Geocode.update_da_row!(
|
|
|
|
|
- table: table,
|
|
|
|
|
- council_reference: council_reference,
|
|
|
|
|
- orig_address: row["address"],
|
|
|
|
|
- geo: geo
|
|
|
|
|
- )
|
|
|
|
|
- Log.debug "enrich", "geocoded #{table} #{council_reference}"
|
|
|
|
|
- # refresh row to fetch lat/lng for next step
|
|
|
|
|
- row = sel.execute(council_reference, address).first
|
|
|
|
|
- rescue => e
|
|
|
|
|
- warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
-
|
|
|
|
|
- # 2) LIST lookup only if we have coords and something’s missing
|
|
|
|
|
- need_pid = row["property_id"].to_s.strip.empty?
|
|
|
|
|
- need_title = row["title_reference"].to_s.strip.empty?
|
|
|
|
|
- if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
|
|
|
|
|
- begin
|
|
|
|
|
- resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
|
|
|
|
|
- if resp["ok"]
|
|
|
|
|
- pid = (resp["pid"] || "").to_s
|
|
|
|
|
- title = (resp["title_id"] || "").to_s
|
|
|
|
|
- upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?")
|
|
|
|
|
- upd.execute(pid, title, council_reference, address)
|
|
|
|
|
- Log.debug "enrich", "lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}"
|
|
|
|
|
- else
|
|
|
|
|
- Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
|
|
|
|
|
- end
|
|
|
|
|
- rescue => e
|
|
|
|
|
- Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
-end
|
|
|