enrich.rb 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. # lib/enrich.rb
  2. # Per-row enrichment called right after each DB.upsert:
  3. # 1. Geocode (address_std, street/locality/state/postcode, lat/lng) via Google Maps
  4. # 2. Property lookup (property_id, title_reference) via LOOKUP_URL service
  5. #
  6. # Schema is owned by DB.ensure_table! (new tables) and lib/migrate.rb (existing tables).
  7. # Scrapers only need to call DB.ensure_table! — no separate ensure_extra_columns! required.
  8. require "json"
  9. require "net/http"
  10. require "uri"
  11. require_relative "./db"
  12. require_relative "./geocode"
  13. require_relative "./log"
  14. LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
  15. def http_post_json(url, payload, timeout: 15)
  16. uri = URI.parse(url)
  17. http = Net::HTTP.new(uri.host, uri.port)
  18. http.use_ssl = (uri.scheme == "https")
  19. http.read_timeout = timeout
  20. http.open_timeout = timeout
  21. req = Net::HTTP::Post.new(uri.request_uri)
  22. req["Content-Type"] = "application/json"
  23. req.body = JSON.generate(payload)
  24. res = http.request(req)
  25. raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
  26. JSON.parse(res.body)
  27. rescue JSON::ParserError
  28. {}
  29. end
  30. # Call this right after DB.upsert in each scraper:
  31. # enrich_after_upsert!(table: TABLE, council_reference: ref, address: addr)
  32. def enrich_after_upsert!(table:, council_reference:, address:)
  33. DB.validate_table_name!(table)
  34. esc = DB.client.escape(table)
  35. sel = DB.client.prepare(
  36. "SELECT id, address, address_std, lat, lng, property_id, title_reference " \
  37. "FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1"
  38. )
  39. row = sel.execute(council_reference, address).first
  40. return unless row
  41. # 1) Geocode if missing lat/lng or normalised address
  42. if row["lat"].nil? || row["lng"].nil? || row["address_std"].to_s.strip.empty?
  43. begin
  44. geo = Geocode.format_au(row["address"])
  45. Geocode.update_da_row!(
  46. table: table,
  47. council_reference: council_reference,
  48. orig_address: row["address"],
  49. geo: geo
  50. )
  51. Log.debug "enrich", "geocoded #{table} #{council_reference}"
  52. row = sel.execute(council_reference, address).first
  53. rescue StandardError => e
  54. Log.warn "enrich", "geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
  55. end
  56. end
  57. # 2) Property lookup — only if coords exist and pid/title are missing
  58. need_pid = row["property_id"].to_s.strip.empty?
  59. need_title = row["title_reference"].to_s.strip.empty?
  60. if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
  61. begin
  62. resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
  63. if resp["ok"]
  64. pid = (resp["pid"] || "").to_s
  65. title = (resp["title_id"] || "").to_s
  66. upd = DB.client.prepare(
  67. "UPDATE `#{esc}` SET " \
  68. "property_id = COALESCE(?, property_id), " \
  69. "title_reference = COALESCE(?, title_reference) " \
  70. "WHERE council_reference = ? AND address = ?"
  71. )
  72. upd.execute(
  73. pid.empty? ? nil : pid,
  74. title.empty? ? nil : title,
  75. council_reference, address
  76. )
  77. Log.debug "enrich", "lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}"
  78. else
  79. Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
  80. end
  81. rescue StandardError => e
  82. Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
  83. end
  84. end
  85. end