| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- require "mysql2"
- module DB
- # MySQL/MariaDB DDL statements (CREATE TABLE, ALTER TABLE) don't support
- # parameterized identifiers — only data values can be bound with ?. Using
- # client.escape() is the standard workaround for identifiers, but as a
- # defence-in-depth measure we also enforce that every table name matches
- # the expected da_* pattern before it is ever interpolated into SQL.
- SAFE_TABLE_NAME_RE = /\Ada_[a-z0-9_]+\z/
- def self.validate_table_name!(name)
- unless name.to_s.match?(SAFE_TABLE_NAME_RE)
- raise ArgumentError, "Unsafe table name: #{name.inspect} — must match #{SAFE_TABLE_NAME_RE}"
- end
- name
- end
- def self.client
- @client ||= Mysql2::Client.new(
- host: ENV.fetch("MYSQL_HOST", "127.0.0.1"),
- port: Integer(ENV.fetch("MYSQL_PORT", "3306")),
- database: ENV.fetch("MYSQL_DB", "planning_scrapes"),
- username: ENV.fetch("MYSQL_USER", "scraper"),
- password: ENV.fetch("MYSQL_PASSWORD", ""),
- encoding: "utf8mb4",
- reconnect: true
- )
- end
- def self.ensure_table!(table)
- validate_table_name!(table)
- client.query(<<~SQL)
- CREATE TABLE IF NOT EXISTS `#{client.escape(table)}` (
- id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
- description TEXT,
- date_received DATE NULL,
- date_received_raw VARCHAR(100),
- address VARCHAR(255) NOT NULL,
- council_reference VARCHAR(100) NOT NULL,
- applicant VARCHAR(255),
- owner VARCHAR(255),
- created_at DATETIME NOT NULL,
- updated_at DATETIME NOT NULL,
- on_notice_to DATE NULL,
- on_notice_to_raw VARCHAR(80) NULL,
- title_reference TEXT NULL,
- property_id TEXT NULL,
- area_sqm DOUBLE NULL,
- area_ha DOUBLE NULL,
- address_std VARCHAR(255) NULL,
- street VARCHAR(120) NULL,
- locality VARCHAR(120) NULL,
- state VARCHAR(10) NULL,
- postcode VARCHAR(10) NULL,
- document_url TEXT NULL,
- local_document_url TEXT NULL,
- documents_json MEDIUMTEXT NULL,
- status VARCHAR(100) NULL,
- assigned_officer VARCHAR(255) NULL,
- `group` VARCHAR(100) NULL,
- category VARCHAR(100) NULL,
- application_valid DATE NULL,
- application_valid_raw VARCHAR(80) NULL,
- advertised_on DATE NULL,
- advertised_on_raw VARCHAR(80) NULL,
- property_legal_description TEXT NULL,
- lat DECIMAL(10,7) NULL,
- lng DECIMAL(10,7) NULL,
- PRIMARY KEY (id),
- UNIQUE KEY uniq_ref_addr (council_reference, address)
- ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
- SQL
- end
- # Write-once / merge semantics for specific columns on duplicate key.
- # All other columns default to VALUES(`col`) (last-write-wins).
- UPSERT_ON_DUP = {
- date_received: "`date_received` = IFNULL(`date_received`, VALUES(`date_received`))",
- date_received_raw: "`date_received_raw` = CASE WHEN `date_received_raw` IS NULL OR `date_received_raw` = '' THEN VALUES(`date_received_raw`) ELSE `date_received_raw` END",
- document_url: "`document_url` = COALESCE(VALUES(`document_url`), `document_url`)",
- local_document_url: "`local_document_url` = COALESCE(VALUES(`local_document_url`), `local_document_url`)",
- }.freeze
- SAFE_COLUMN_RE = /\A[a-z][a-z0-9_]*\z/
- def self.upsert(table, row)
- validate_table_name!(table)
- columns = row.keys
- columns.each do |c|
- raise ArgumentError, "Unsafe column name: #{c.inspect}" unless c.to_s.match?(SAFE_COLUMN_RE)
- end
- esc_table = client.escape(table)
- col_names = columns.map { |c| "`#{c}`" }.join(", ")
- placeholders = (["?"] * columns.size).join(", ")
- updates = columns.map { |c| UPSERT_ON_DUP[c.to_sym] || "`#{c}` = VALUES(`#{c}`)" }.join(", ")
- sql = <<~SQL
- INSERT INTO `#{esc_table}` (#{col_names}, created_at, updated_at)
- VALUES (#{placeholders}, NOW(), NOW())
- ON DUPLICATE KEY UPDATE #{updates}, updated_at = NOW()
- SQL
- stmt = client.prepare(sql)
- stmt.execute(*columns.map { |c| row[c] })
- end
- end
|