require "mysql2" module DB # MySQL/MariaDB DDL statements (CREATE TABLE, ALTER TABLE) don't support # parameterized identifiers — only data values can be bound with ?. Using # client.escape() is the standard workaround for identifiers, but as a # defence-in-depth measure we also enforce that every table name matches # the expected da_* pattern before it is ever interpolated into SQL. SAFE_TABLE_NAME_RE = /\Ada_[a-z0-9_]+\z/ def self.validate_table_name!(name) unless name.to_s.match?(SAFE_TABLE_NAME_RE) raise ArgumentError, "Unsafe table name: #{name.inspect} — must match #{SAFE_TABLE_NAME_RE}" end name end def self.client @client ||= Mysql2::Client.new( host: ENV.fetch("MYSQL_HOST", "127.0.0.1"), port: Integer(ENV.fetch("MYSQL_PORT", "3306")), database: ENV.fetch("MYSQL_DB", "planning_scrapes"), username: ENV.fetch("MYSQL_USER", "scraper"), password: ENV.fetch("MYSQL_PASSWORD", ""), encoding: "utf8mb4", reconnect: true ) end def self.ensure_table!(table) validate_table_name!(table) client.query(<<~SQL) CREATE TABLE IF NOT EXISTS `#{client.escape(table)}` ( id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, description TEXT, date_received DATE NULL, date_received_raw VARCHAR(100), address VARCHAR(255) NOT NULL, council_reference VARCHAR(100) NOT NULL, applicant VARCHAR(255), owner VARCHAR(255), created_at DATETIME NOT NULL, updated_at DATETIME NOT NULL, on_notice_to DATE NULL, on_notice_to_raw VARCHAR(80) NULL, title_reference TEXT NULL, property_id TEXT NULL, area_sqm DOUBLE NULL, area_ha DOUBLE NULL, address_std VARCHAR(255) NULL, street VARCHAR(120) NULL, locality VARCHAR(120) NULL, state VARCHAR(10) NULL, postcode VARCHAR(10) NULL, document_url TEXT NULL, local_document_url TEXT NULL, documents_json MEDIUMTEXT NULL, lat DECIMAL(10,7) NULL, lng DECIMAL(10,7) NULL, PRIMARY KEY (id), UNIQUE KEY uniq_ref_addr (council_reference, address) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; SQL end # Write-once / merge semantics for specific columns on duplicate key. # All other columns default to VALUES(`col`) (last-write-wins). UPSERT_ON_DUP = { date_received: "`date_received` = IFNULL(`date_received`, VALUES(`date_received`))", date_received_raw: "`date_received_raw` = CASE WHEN `date_received_raw` IS NULL OR `date_received_raw` = '' THEN VALUES(`date_received_raw`) ELSE `date_received_raw` END", document_url: "`document_url` = COALESCE(VALUES(`document_url`), `document_url`)", local_document_url: "`local_document_url` = COALESCE(VALUES(`local_document_url`), `local_document_url`)", }.freeze SAFE_COLUMN_RE = /\A[a-z][a-z0-9_]*\z/ def self.upsert(table, row) validate_table_name!(table) columns = row.keys columns.each do |c| raise ArgumentError, "Unsafe column name: #{c.inspect}" unless c.to_s.match?(SAFE_COLUMN_RE) end esc_table = client.escape(table) col_names = columns.map { |c| "`#{c}`" }.join(", ") placeholders = (["?"] * columns.size).join(", ") updates = columns.map { |c| UPSERT_ON_DUP[c.to_sym] || "`#{c}` = VALUES(`#{c}`)" }.join(", ") sql = <<~SQL INSERT INTO `#{esc_table}` (#{col_names}, created_at, updated_at) VALUES (#{placeholders}, NOW(), NOW()) ON DUPLICATE KEY UPDATE #{updates}, updated_at = NOW() SQL stmt = client.prepare(sql) stmt.execute(*columns.map { |c| row[c] }) end end