Benjamin Harris пре 2 месеци
родитељ
комит
c03bfae
7 измењених фајлова са 80 додато и 32 уклоњено
  1. 2 0
      docker-compose.yml
  2. 7 11
      lib/enrich.rb
  3. 5 4
      lib/geocode.rb
  4. 51 0
      lib/log.rb
  5. 5 4
      lib/migrate.rb
  6. 3 2
      lib/scraper_helpers.rb
  7. 7 11
      scrapers/enrich.rb

+ 2 - 0
docker-compose.yml

@@ -41,6 +41,8 @@ services:
       DOWNLOAD_DIR: /downloads
       LOOKUP_URL: ${LOOKUP_URL}
       LOOKUP_THROTTLE_MS: ${LOOKUP_THROTTLE_MS:-150}
+      # Log verbosity: debug | info (default) | warn | error
+      LOG_LEVEL: ${LOG_LEVEL:-info}
     volumes:
       - ./scrapers:/app/scrapers:ro
       - ./downloads:/downloads

+ 7 - 11
lib/enrich.rb

@@ -25,13 +25,9 @@ require "uri"
 require_relative "./db"
 require_relative "./util"
 require_relative "./geocode"
+require_relative "./log"
 
-LOOKUP_URL   = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
-ENRICH_DEBUG = ENV["ENRICH_DEBUG"] == "1"
-
-def log_enrich(msg)
-  puts msg if ENRICH_DEBUG
-end
+LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
 
 def ensure_extra_columns!(table)
   DB.validate_table_name!(table)
@@ -49,7 +45,7 @@ def ensure_extra_columns!(table)
   }.each do |col, defn|
     DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
   rescue Mysql2::Error => e
-    warn "[enrich] schema migration skipped for #{table}.#{col}: #{e.message}"
+    Log.warn "enrich", "schema migration skipped for #{table}.#{col}: #{e.message}"
   end
 end
 
@@ -88,7 +84,7 @@ def enrich_after_upsert!(table:, council_reference:, address:)
         orig_address: row["address"],
         geo: geo
       )
-      log_enrich("enrich: geocoded #{table} #{council_reference}")
+      Log.debug "enrich", "geocoded #{table} #{council_reference}"
       # refresh row to fetch lat/lng for next step
       row = sel.execute(council_reference, address).first
     rescue => e
@@ -107,12 +103,12 @@ def enrich_after_upsert!(table:, council_reference:, address:)
         title = (resp["title_id"] || "").to_s
         upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?")
         upd.execute(pid, title, council_reference, address)
-        log_enrich("enrich: lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}")
+        Log.debug "enrich", "lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}"
       else
-        warn "[enrich] lookup error #{table} #{council_reference}: #{resp["error"]}"
+        Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
       end
     rescue => e
-      warn "[enrich] lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
+      Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
     end
   end
 end

+ 5 - 4
lib/geocode.rb

@@ -4,6 +4,7 @@ require "digest/sha1"
 require "cgi"
 require_relative "./db"
 require_relative "./http"
+require_relative "./log"
 
 module Geocode
   class Error < StandardError; end
@@ -46,7 +47,7 @@ module Geocode
     DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS lat DECIMAL(10,7) NULL")
     DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS lng DECIMAL(10,7) NULL")
   rescue Mysql2::Error => e
-    warn "[geocode] ensure columns skipped for #{table}: #{e.message}"
+    Log.warn "geocode", "ensure columns skipped for #{table}: #{e.message}"
   end
 
   # Public helper to geocode and return a hash of normalized components
@@ -100,11 +101,11 @@ module Geocode
 
     res
   rescue Error => e
-    warn "[geocode] #{e.message}"
+    Log.error "geocode", e.message
     nil
   rescue Net::HTTPError, Net::ReadTimeout, Net::OpenTimeout, OpenSSL::SSL::SSLError,
          Errno::ECONNRESET, EOFError, Mysql2::Error => e
-    warn "[geocode] network/db error for #{raw_address.inspect}: #{e.class} #{e.message}"
+    Log.warn "geocode", "network/db error for #{raw_address.inspect}: #{e.class} #{e.message}"
     nil
   end
 
@@ -140,7 +141,7 @@ module Geocode
       orig_address
     )
   rescue Mysql2::Error => e
-    warn "[geocode] failed to update normalized address for #{table}/#{council_reference}: #{e.message}"
+    Log.warn "geocode", "failed to update normalized address for #{table}/#{council_reference}: #{e.message}"
   end
 
   # Helpers

+ 51 - 0
lib/log.rb

@@ -0,0 +1,51 @@
+# lib/log.rb
+# Minimal structured logger for the scraping pipeline.
+#
+# Usage:
+#   require_relative "log"
+#   Log.info  "geocode", "geocoded DA0306/2025 -> 42 Main St, Hobart"
+#   Log.warn  "enrich",  "lookup failed for da_foo DA123: connection refused"
+#   Log.debug "migrate", "column address_std already exists — skipped"
+#   Log.error "db",      "prepare failed: unknown column 'foo'"
+#
+# Output format (no timestamp — Docker/systemd adds one):
+#   INFO  [geocode] geocoded DA0306/2025 -> 42 Main St, Hobart
+#   WARN  [enrich] lookup failed for da_foo DA123: connection refused
+#
+# Verbosity is controlled by the LOG_LEVEL environment variable:
+#   LOG_LEVEL=debug  — all messages
+#   LOG_LEVEL=info   — info, warn, error          (default)
+#   LOG_LEVEL=warn   — warn and error only
+#   LOG_LEVEL=error  — errors only
+#
+# INFO and DEBUG go to $stdout; WARN and ERROR go to $stderr so that
+# docker compose logs shows them on the correct stream.
+
+module Log # Minimal leveled logger; every method is defined on the module itself (Log.info, Log.warn, ...).
+  LEVELS = { debug: 0, info: 1, warn: 2, error: 3 }.freeze # numeric rank per level; emit compares these ranks
+  # Executed when this file is loaded: switch both standard streams to synchronous mode.
+  # Flush immediately — important in Docker where stdout may be block-buffered.
+  $stdout.sync = true
+  $stderr.sync = true
+  # Public API: one method per level; each forwards (level, component, msg) to emit.
+  def self.debug(component, msg) = emit(:debug, component, msg)
+  def self.info(component, msg)  = emit(:info,  component, msg)
+  def self.warn(component, msg)  = emit(:warn,  component, msg)
+  def self.error(component, msg) = emit(:error, component, msg)
+
+  # Threshold rank, read from ENV["LOG_LEVEL"] on every call (whitespace stripped,
+  # case-insensitive); a missing or unrecognized value falls back to the :info rank.
+  def self.min_level
+    key = ENV.fetch("LOG_LEVEL", "info").strip.downcase.to_sym
+    LEVELS.fetch(key, LEVELS[:info])
+  end
+  private_class_method :min_level
+  # Writes one "LABEL [component] msg" line unless level ranks below min_level.
+  def self.emit(level, component, msg)
+    return if LEVELS.fetch(level) < min_level # fetch raises KeyError for a level missing from LEVELS
+    label  = level.to_s.upcase.ljust(5) # pad to 5 chars so "INFO "/"WARN " line up with "DEBUG"/"ERROR"
+    stream = (level == :debug || level == :info) ? $stdout : $stderr # debug/info -> stdout, warn/error -> stderr
+    stream.puts "#{label} [#{component}] #{msg}"
+  end
+  private_class_method :emit
+end

+ 5 - 4
lib/migrate.rb

@@ -15,6 +15,7 @@
 #   ruby /app/lib/migrate.rb
 
 require_relative "./db"
+require_relative "./log"
 
 module Migrate
   # -------------------------------------------------------------------------
@@ -53,7 +54,7 @@ module Migrate
               "ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}"
             )
           rescue Mysql2::Error => e
-            warn "[migrate] skipped #{table}.#{col}: #{e.message}"
+            Log.warn "migrate", "skipped #{table}.#{col}: #{e.message}"
           end
         end
       }
@@ -95,18 +96,18 @@ module Migrate
 
     pending = MIGRATIONS.reject { |m| applied.include?(m[:version]) }
     if pending.empty?
-      puts "[migrate] schema up to date (#{MIGRATIONS.size} migration(s) applied)"
+      Log.info "migrate", "schema up to date (#{MIGRATIONS.size} migration(s) applied)"
       return
     end
 
     pending.each do |m|
-      puts "[migrate] applying v#{m[:version]}: #{m[:description]}"
+      Log.info "migrate", "applying v#{m[:version]}: #{m[:description]}"
       m[:up].call
       stmt = DB.client.prepare(
         "INSERT INTO schema_migrations (version, description) VALUES (?, ?)"
       )
       stmt.execute(m[:version], m[:description])
-      puts "[migrate] v#{m[:version]} done"
+      Log.info "migrate", "v#{m[:version]} done"
     end
   end
 

+ 3 - 2
lib/scraper_helpers.rb

@@ -11,6 +11,7 @@
 require "uri"
 require_relative "./db"
 require_relative "./enrich"
+require_relative "./log"
 
 # Resolve a possibly-relative href against a base URL.
 # Returns the href string unchanged if URI.join raises.
@@ -51,8 +52,8 @@ def upsert_and_enrich!(table:, row:, extras: {})
       )
       upd.execute(*vals)
     rescue Mysql2::Error => e
-      warn "[scraper_helpers] extras update skipped for #{row[:council_reference]}: #{e.message}"
+      Log.warn "scraper", "extras update skipped for #{row[:council_reference]}: #{e.message}"
     end
   end
-  puts "Upserted #{row[:council_reference]} -> #{row[:address]}"
+  Log.info "scraper", "upserted #{row[:council_reference]} -> #{row[:address]}"
 end

+ 7 - 11
scrapers/enrich.rb

@@ -26,16 +26,12 @@ require_relative "./db"
 require_relative "./util"
 require_relative "./geocode"
 require_relative "./migrate"
+require_relative "./log"
 
 # Ensure all da_* tables have the expected columns before enriching.
 Migrate.run!
 
-LOOKUP_URL   = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
-ENRICH_DEBUG = ENV["ENRICH_DEBUG"] == "1"
-
-def log_enrich(msg)
-  puts msg if ENRICH_DEBUG
-end
+LOOKUP_URL = ENV["LOOKUP_URL"] # e.g. http://web/list_lookup.php
 
 def ensure_extra_columns!(table)
   DB.validate_table_name!(table)
@@ -53,7 +49,7 @@ def ensure_extra_columns!(table)
   }.each do |col, defn|
     DB.client.query("ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}")
   rescue Mysql2::Error => e
-    warn "[enrich] schema migration skipped for #{table}.#{col}: #{e.message}"
+    Log.warn "enrich", "schema migration skipped for #{table}.#{col}: #{e.message}"
   end
 end
 
@@ -92,7 +88,7 @@ def enrich_after_upsert!(table:, council_reference:, address:)
         orig_address: row["address"],
         geo: geo
       )
-      log_enrich("enrich: geocoded #{table} #{council_reference}")
+      Log.debug "enrich", "geocoded #{table} #{council_reference}"
       # refresh row to fetch lat/lng for next step
       row = sel.execute(council_reference, address).first
     rescue => e
@@ -111,12 +107,12 @@ def enrich_after_upsert!(table:, council_reference:, address:)
         title = (resp["title_id"] || "").to_s
         upd = DB.client.prepare("UPDATE `#{esc}` SET property_id = COALESCE(NULLIF(?,’’), property_id), title_reference = COALESCE(NULLIF(?,’’), title_reference) WHERE council_reference = ? AND address = ?")
         upd.execute(pid, title, council_reference, address)
-        log_enrich("enrich: lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}")
+        Log.debug "enrich", "lookup ok #{table} #{council_reference} pid=#{pid} title=#{title}"
       else
-        warn "[enrich] lookup error #{table} #{council_reference}: #{resp["error"]}"
+        Log.warn "enrich", "lookup error #{table} #{council_reference}: #{resp["error"]}"
       end
     rescue => e
-      warn "[enrich] lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
+      Log.warn "enrich", "lookup failed #{table} #{council_reference}: #{e.class} #{e.message}"
     end
   end
 end