enrich.rb 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. # tools/enrich.rb
  2. # Enrich DA rows AFTER scrapers:
  3. # - Geocode (address_std, street/locality/state/postcode, lat/lng)
  4. # - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
  5. #
  6. # Usage examples:
  7. # docker compose run --rm \
  8. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  9. # -e LOOKUP_URL="http://web/list_lookup.php" \
  10. # scraper ruby /app/tools/enrich.rb
  11. #
  12. # # Single table, slower throttle, dry run:
  13. # docker compose run --rm \
  14. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  15. # -e LOOKUP_URL="http://web/list_lookup.php" \
  16. # -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
  17. # -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \
  18. # -e DRY_RUN=1 \
  19. # scraper ruby /app/tools/enrich.rb --table=da_dorset
  20. # lib/enrich.rb
  21. require "json"
  22. require "net/http"
  23. require "uri"
  24. require_relative "./db"
  25. require_relative "./util"
  26. require_relative "./geocode"
  27. LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
  28. ENRICH_DEBUG = ENV["ENRICH_DEBUG"] == "1"
  29. def log_enrich(msg)
  30. puts msg if ENRICH_DEBUG
  31. end
  32. def ensure_extra_columns!(table)
  33. DB.validate_table_name!(table)
  34. esc = DB.client.escape(table)
  35. {
  36. "address_std" => "VARCHAR(255) NULL",
  37. "lat" => "DOUBLE NULL",
  38. "lng" => "DOUBLE NULL",
  39. "property_id" => "VARCHAR(50) NULL",
  40. "title_reference" => "VARCHAR(80) NULL",
  41. "document_url" => "TEXT NULL",
  42. "local_document_url" => "TEXT NULL",
  43. "on_notice_to" => "DATE NULL",
  44. "on_notice_to_raw" => "VARCHAR(80) NULL"
  45. }.each do |col, defn|
  46. DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
  47. rescue Mysql2::Error => e
  48. warn "[enrich] schema migration skipped for #{table}.#{col}: #{e.message}"
  49. end
  50. end
  51. def http_post_json(url, payload, timeout: 15)
  52. uri = URI.parse(url)
  53. http = Net::HTTP.new(uri.host, uri.port)
  54. http.use_ssl = (uri.scheme == "https")
  55. http.read_timeout = timeout
  56. http.open_timeout = timeout
  57. req = Net::HTTP::Post.new(uri.request_uri)
  58. req["Content-Type"] = "application/json"
  59. req.body = JSON.generate(payload)
  60. res = http.request(req)
  61. raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
  62. JSON.parse(res.body)
  63. rescue JSON::ParserError
  64. {}
  65. end
  66. # Call this right after DB.upsert in each scraper
  67. # enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
  68. def enrich_after_upsert!(table:, council_reference:, address:)
  69. DB.validate_table_name!(table)
  70. esc = DB.client.escape(table)
  71. sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
  72. row = sel.execute(council_reference, address).first
  73. return unless row
  74. # 1) Geocode if missing lat/lng or std address
  75. if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
  76. begin
  77. geo = Geocode.format_au(row["address"])
  78. Geocode.update_da_row!(
  79. table: table,
  80. council_reference: council_reference,
  81. orig_address: row["address"],
  82. geo: geo
  83. )
  84. log_enrich("enrich: geocoded #{table} #{council_reference}")
  85. # refresh row to fetch lat/lng for next step
  86. row = sel.execute(council_reference, address).first
  87. rescue => e
  88. warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
  89. end
  90. end
  91. # 2) LIST lookup only if we have coords and something’s missing
  92. need_pid = row["property_id"].to_s.strip.empty?
  93. need_title = row["title_reference"].to_s.strip.empty?
  94. if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
  95. begin
  96. resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
  97. if resp["ok"]
  98. pid = (resp["pid"] || "").to_s
  99. title = (resp["title_id"] || "").to_s
  100. upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?")
  101. upd.execute(pid, title, council_reference, address)
  102. log_enrich("enrich: lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}")
  103. else
  104. warn "[enrich] lookup error #{table} #{council_reference}: #{resp["error"]}"
  105. end
  106. rescue => e
  107. warn "[enrich] lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
  108. end
  109. end
  110. end