| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- # lib/enrich.rb
- # Per-row enrichment called right after each DB.upsert:
- # 1. Geocode (address_std, street/locality/state/postcode, lat/lng) via Google Maps
- # 2. Property lookup (property_id, title_reference) via LOOKUP_URL service
- #
- # Schema is owned by DB.ensure_table! (new tables) and lib/migrate.rb (existing tables).
- # Scrapers only need to call DB.ensure_table! — no separate ensure_extra_columns! required.
- require "json"
- require "net/http"
- require "uri"
- require_relative "./db"
- require_relative "./geocode"
- require_relative "./log"
- LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
- def http_post_json(url, payload, timeout: 15)
- uri = URI.parse(url)
- http = Net::HTTP.new(uri.host, uri.port)
- http.use_ssl = (uri.scheme == "https")
- http.read_timeout = timeout
- http.open_timeout = timeout
- req = Net::HTTP::Post.new(uri.request_uri)
- req["Content-Type"] = "application/json"
- req.body = JSON.generate(payload)
- res = http.request(req)
- raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
- JSON.parse(res.body)
- rescue JSON::ParserError
- {}
- end
- # Call this right after DB.upsert in each scraper:
- # enrich_after_upsert!(table: TABLE, council_reference: ref, address: addr)
- def enrich_after_upsert!(table:, council_reference:, address:)
- DB.validate_table_name!(table)
- esc = DB.client.escape(table)
- sel = DB.client.prepare(
- "SELECT id, address, address_std, lat, lng, property_id, title_reference " \
- "FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1"
- )
- row = sel.execute(council_reference, address).first
- return unless row
- # 1) Geocode if missing lat/lng or normalised address
- if row["lat"].nil? || row["lng"].nil? || row["address_std"].to_s.strip.empty?
- begin
- geo = Geocode.format_au(row["address"])
- Geocode.update_da_row!(
- table: table,
- council_reference: council_reference,
- orig_address: row["address"],
- geo: geo
- )
- Log.debug "enrich", "geocoded #{table} #{council_reference}"
- row = sel.execute(council_reference, address).first
- rescue StandardError => e
- Log.warn "enrich", "geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
- end
- end
- # 2) Property lookup — only if coords exist and pid/title are missing
- need_pid = row["property_id"].to_s.strip.empty?
- need_title = row["title_reference"].to_s.strip.empty?
- if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
- begin
- resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
- if resp["ok"]
- pid = (resp["pid"] || "").to_s
- title = (resp["title_id"] || "").to_s
- upd = DB.client.prepare(
- "UPDATE `#{esc}` SET " \
- "property_id = COALESCE(?, property_id), " \
- "title_reference = COALESCE(?, title_reference) " \
- "WHERE council_reference = ? AND address = ?"
- )
- upd.execute(
- pid.empty? ? nil : pid,
- title.empty? ? nil : title,
- council_reference, address
- )
- Log.debug "enrich", "lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}"
- else
- Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
- end
- rescue StandardError => e
- Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
- end
- end
- end
|