enrich.rb 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # tools/enrich.rb
  2. # Enrich DA rows AFTER scrapers:
  3. # - Geocode (address_std, street/locality/state/postcode, lat/lng)
  4. # - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
  5. #
  6. # Usage examples:
  7. # docker compose run --rm \
  8. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  9. # -e LOOKUP_URL="http://web/list_lookup.php" \
  10. # scraper ruby /app/tools/enrich.rb
  11. #
  12. # # Single table, slower throttle, dry run:
  13. # docker compose run --rm \
  14. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  15. # -e LOOKUP_URL="http://web/list_lookup.php" \
  16. # -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
  17. # -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \
  18. # -e DRY_RUN=1 \
  19. # scraper ruby /app/tools/enrich.rb --table=da_dorset
  20. # lib/enrich.rb
  21. require "json"
  22. require "net/http"
  23. require "uri"
  24. require_relative "./db"
  25. require_relative "./util"
  26. require_relative "./geocode"
  27. LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
  28. ENRICH_DEBUG = ENV["ENRICH_DEBUG"] == "1"
  29. def log_enrich(msg)
  30. puts msg if ENRICH_DEBUG
  31. end
  32. def ensure_extra_columns!(table)
  33. esc = DB.client.escape(table)
  34. {
  35. "address_std" => "VARCHAR(255) NULL",
  36. "lat" => "DOUBLE NULL",
  37. "lng" => "DOUBLE NULL",
  38. "property_id" => "VARCHAR(50) NULL",
  39. "title_reference" => "VARCHAR(80) NULL",
  40. "local_document_url" => "TEXT NULL",
  41. "on_notice_to" => "DATE NULL",
  42. "on_notice_to_raw" => "VARCHAR(80) NULL"
  43. }.each do |col, defn|
  44. DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
  45. rescue Mysql2::Error => e
  46. warn "[enrich] schema migration skipped for #{table}.#{col}: #{e.message}"
  47. end
  48. end
  49. def http_post_json(url, payload, timeout: 15)
  50. uri = URI.parse(url)
  51. http = Net::HTTP.new(uri.host, uri.port)
  52. http.use_ssl = (uri.scheme == "https")
  53. http.read_timeout = timeout
  54. http.open_timeout = timeout
  55. req = Net::HTTP::Post.new(uri.request_uri)
  56. req["Content-Type"] = "application/json"
  57. req.body = JSON.generate(payload)
  58. res = http.request(req)
  59. raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
  60. JSON.parse(res.body)
  61. rescue JSON::ParserError
  62. {}
  63. end
  64. # Call this right after DB.upsert in each scraper
  65. # enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
  66. def enrich_after_upsert!(table:, council_reference:, address:)
  67. ensure_extra_columns!(table)
  68. esc = DB.client.escape(table)
  69. sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
  70. row = sel.execute(council_reference, address).first
  71. return unless row
  72. # 1) Geocode if missing lat/lng or std address
  73. if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
  74. begin
  75. geo = Geocode.format_au(row["address"])
  76. Geocode.update_da_row!(
  77. table: table,
  78. council_reference: council_reference,
  79. orig_address: row["address"],
  80. geo: geo
  81. )
  82. log_enrich("enrich: geocoded #{table} #{council_reference}")
  83. # refresh row to fetch lat/lng for next step
  84. row = sel.execute(council_reference, address).first
  85. rescue => e
  86. warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
  87. end
  88. end
  89. # 2) LIST lookup only if we have coords and something’s missing
  90. need_pid = row["property_id"].to_s.strip.empty?
  91. need_title = row["title_reference"].to_s.strip.empty?
  92. if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
  93. begin
  94. resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
  95. if resp["ok"]
  96. pid = (resp["pid"] || "").to_s
  97. title = (resp["title_id"] || "").to_s
  98. upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?")
  99. upd.execute(pid, title, council_reference, address)
  100. log_enrich("enrich: lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}")
  101. else
  102. warn "[enrich] lookup error #{table} #{council_reference}: #{resp["error"]}"
  103. end
  104. rescue => e
  105. warn "[enrich] lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
  106. end
  107. end
  108. end