enrich.rb 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. # tools/enrich.rb
  2. # Enrich DA rows AFTER scrapers:
  3. # - Geocode (address_std, street/locality/state/postcode, lat/lng)
  4. # - PID + Title via list_lookup.php (property_id, title_reference, area_sqm/ha)
  5. #
  6. # Usage examples:
  7. # docker compose run --rm \
  8. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  9. # -e LOOKUP_URL="http://web/list_lookup.php" \
  10. # scraper ruby /app/tools/enrich.rb
  11. #
  12. # # Single table, slower throttle, dry run:
  13. # docker compose run --rm \
  14. # -e GOOGLE_MAPS_API_KEY="$GOOGLE_MAPS_API_KEY" \
  15. # -e LOOKUP_URL="http://web/list_lookup.php" \
  16. # -e GEOCODE_LIMIT=200 -e GEOCODE_THROTTLE_MS=200 \
  17. # -e LOOKUP_LIMIT=200 -e LOOKUP_THROTTLE_MS=250 \
  18. # -e DRY_RUN=1 \
  19. # scraper ruby /app/tools/enrich.rb --table=da_dorset
  20. # lib/enrich.rb
  21. require "json"
  22. require "net/http"
  23. require "uri"
  24. require_relative "./db"
  25. require_relative "./util"
  26. require_relative "./geocode"
  27. require_relative "./log"
  28. LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
  29. def ensure_extra_columns!(table)
  30. DB.validate_table_name!(table)
  31. esc = DB.client.escape(table)
  32. {
  33. "address_std" => "VARCHAR(255) NULL",
  34. "lat" => "DOUBLE NULL",
  35. "lng" => "DOUBLE NULL",
  36. "property_id" => "VARCHAR(50) NULL",
  37. "title_reference" => "VARCHAR(80) NULL",
  38. "document_url" => "TEXT NULL",
  39. "local_document_url" => "TEXT NULL",
  40. "on_notice_to" => "DATE NULL",
  41. "on_notice_to_raw" => "VARCHAR(80) NULL"
  42. }.each do |col, defn|
  43. DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
  44. rescue Mysql2::Error => e
  45. Log.warn "enrich", "schema migration skipped for #{table}.#{col}: #{e.message}"
  46. end
  47. end
  48. def http_post_json(url, payload, timeout: 15)
  49. uri = URI.parse(url)
  50. http = Net::HTTP.new(uri.host, uri.port)
  51. http.use_ssl = (uri.scheme == "https")
  52. http.read_timeout = timeout
  53. http.open_timeout = timeout
  54. req = Net::HTTP::Post.new(uri.request_uri)
  55. req["Content-Type"] = "application/json"
  56. req.body = JSON.generate(payload)
  57. res = http.request(req)
  58. raise "HTTP #{res.code}" unless res.is_a?(Net::HTTPSuccess)
  59. JSON.parse(res.body)
  60. rescue JSON::ParserError
  61. {}
  62. end
  63. # Call this right after DB.upsert in each scraper
  64. # enrich_after_upsert!(table: TABLE, council_reference: council_reference, address: address)
  65. def enrich_after_upsert!(table:, council_reference:, address:)
  66. DB.validate_table_name!(table)
  67. esc = DB.client.escape(table)
  68. sel = DB.client.prepare("SELECT id, address, address_std, lat, lng, property_id, title_reference FROM `#{esc}` WHERE council_reference = ? AND address = ? LIMIT 1")
  69. row = sel.execute(council_reference, address).first
  70. return unless row
  71. # 1) Geocode if missing lat/lng or std address
  72. if row["lat"].nil? || row["lng"].nil? || (row["address_std"].to_s.strip.empty?)
  73. begin
  74. geo = Geocode.format_au(row["address"])
  75. Geocode.update_da_row!(
  76. table: table,
  77. council_reference: council_reference,
  78. orig_address: row["address"],
  79. geo: geo
  80. )
  81. Log.debug "enrich", "geocoded #{table} #{council_reference}"
  82. # refresh row to fetch lat/lng for next step
  83. row = sel.execute(council_reference, address).first
  84. rescue => e
  85. warn "[enrich] geocode failed #{table} #{council_reference}: #{e.class} #{e.message}"
  86. end
  87. end
  88. # 2) LIST lookup only if we have coords and something’s missing
  89. need_pid = row["property_id"].to_s.strip.empty?
  90. need_title = row["title_reference"].to_s.strip.empty?
  91. if LOOKUP_URL && row["lat"] && row["lng"] && (need_pid || need_title)
  92. begin
  93. resp = http_post_json(LOOKUP_URL, { lat: row["lat"], lng: row["lng"] })
  94. if resp["ok"]
  95. pid = (resp["pid"] || "").to_s
  96. title = (resp["title_id"] || "").to_s
  97. upd = DB.client.prepare(
  98. "UPDATE `#{esc}` SET " \
  99. "property_id = COALESCE(?, property_id), " \
  100. "title_reference = COALESCE(?, title_reference) " \
  101. "WHERE council_reference = ? AND address = ?"
  102. )
  103. upd.execute(
  104. pid.empty? ? nil : pid,
  105. title.empty? ? nil : title,
  106. council_reference, address
  107. )
  108. Log.debug "enrich", "lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}"
  109. else
  110. Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
  111. end
  112. rescue => e
  113. Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
  114. end
  115. end
  116. end