enrich.rb 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. # tools/enrich.rb
  2. # Enrich DA rows AFTER scrapers:
  3. # - Geocode (address_std, street/locality/state/postcode, lat/lng)
  4. # - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
  5. #
  6. # Usage examples:
  7. # docker compose run --rm \
  8. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  9. # -e LOOKUP_URL="http://web/list_lookup.php" \
  10. # scraper ruby /app/tools/enrich.rb
  11. #
  12. # # Single table, slower throttle, dry run:
  13. # docker compose run --rm \
  14. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  15. # -e LOOKUP_URL="http://web/list_lookup.php" \
  16. # -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
  17. # -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \
  18. # -e DRY_RUN=1 \
  19. # scraper ruby /app/tools/enrich.rb --table=da_dorset
  20. # lib/enrich.rb
  21. require "json"
  22. require "net/http"
  23. require "uri"
  24. require_relative "./db"
  25. require_relative "./util"
  26. require_relative "./geocode"
  27. require_relative "./migrate"
  28. # Ensure all da_* tables have the expected columns before enriching.
  29. Migrate.run!
  30. LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
  31. ENRICH_DEBUG = ENV["ENRICH_DEBUG"] == "1"
  32. def log_enrich(msg)
  33. puts msg if ENRICH_DEBUG
  34. end
  35. def ensure_extra_columns!(table)
  36. DB.validate_table_name!(table)
  37. esc = DB.client.escape(table)
  38. {
  39. "address_std" => "VARCHAR(255) NULL",
  40. "lat" => "DOUBLE NULL",
  41. "lng" => "DOUBLE NULL",
  42. "property_id" => "VARCHAR(50) NULL",
  43. "title_reference" => "VARCHAR(80) NULL",
  44. "document_url" => "TEXT NULL",
  45. "local_document_url" => "TEXT NULL",
  46. "on_notice_to" => "DATE NULL",
  47. "on_notice_to_raw" => "VARCHAR(80) NULL"
  48. }.each do |col, defn|
  49. DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
  50. rescue Mysql2::Error => e
  51. warn "[enrich] schema migration skipped for #{table}.#{col}: #{e.message}"
  52. end
  53. end
  54. def http_post_json(url, payload, timeout: 15)
  55. uri = URI.parse(url)
  56. http = Net::HTTP.new(uri.host, uri.port)
  57. http.use_ssl = (uri.scheme == "https")
  58. http.read_timeout = timeout
  59. http.open_timeout = timeout
  60. req = Net::HTTP::Post.new(uri.request_uri)
  61. req["Content-Type"] = "application/json"
  62. req.body = JSON.generate(payload)
  63. res = http.request(req)
  64. raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
  65. JSON.parse(res.body)
  66. rescue JSON::ParserError
  67. {}
  68. end
  69. # Call this right after DB.upsert in each scraper
  70. # enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
  71. def enrich_after_upsert!(table:, council_reference:, address:)
  72. DB.validate_table_name!(table)
  73. esc = DB.client.escape(table)
  74. sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
  75. row = sel.execute(council_reference, address).first
  76. return unless row
  77. # 1) Geocode if missing lat/lng or std address
  78. if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
  79. begin
  80. geo = Geocode.format_au(row["address"])
  81. Geocode.update_da_row!(
  82. table: table,
  83. council_reference: council_reference,
  84. orig_address: row["address"],
  85. geo: geo
  86. )
  87. log_enrich("enrich: geocoded #{table} #{council_reference}")
  88. # refresh row to fetch lat/lng for next step
  89. row = sel.execute(council_reference, address).first
  90. rescue => e
  91. warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
  92. end
  93. end
  94. # 2) LIST lookup only if we have coords and something’s missing
  95. need_pid = row["property_id"].to_s.strip.empty?
  96. need_title = row["title_reference"].to_s.strip.empty?
  97. if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
  98. begin
  99. resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
  100. if resp["ok"]
  101. pid = (resp["pid"] || "").to_s
  102. title = (resp["title_id"] || "").to_s
  103. upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?")
  104. upd.execute(pid, title, council_reference, address)
  105. log_enrich("enrich: lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}")
  106. else
  107. warn "[enrich] lookup error #{table} #{council_reference}: #{resp["error"]}"
  108. end
  109. rescue => e
  110. warn "[enrich] lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
  111. end
  112. end
  113. end