| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
- require "mysql2"
- module DB
- # MySQL/MariaDB DDL statements (CREATE TABLE, ALTER TABLE) don't support
- # parameterized identifiers — only data values can be bound with ?. Using
- # client.escape() is the standard workaround for identifiers, but as a
- # defence-in-depth measure we also enforce that every table name matches
- # the expected da_* pattern before it is ever interpolated into SQL.
- SAFE_TABLE_NAME_RE = /\Ada_[a-z0-9_]+\z/
- def self.validate_table_name!(name)
- unless name.to_s.match?(SAFE_TABLE_NAME_RE)
- raise ArgumentError, "Unsafe table name: #{name.inspect} — must match #{SAFE_TABLE_NAME_RE}"
- end
- name
- end
- def self.client
- @client ||= Mysql2::Client.new(
- host: ENV.fetch("MYSQL_HOST", "127.0.0.1"),
- port: Integer(ENV.fetch("MYSQL_PORT", "3306")),
- database: ENV.fetch("MYSQL_DB", "planning_scrapes"),
- username: ENV.fetch("MYSQL_USER", "scraper"),
- password: ENV.fetch("MYSQL_PASSWORD", ""),
- encoding: "utf8mb4",
- reconnect: true
- )
- end
- def self.ensure_table!(table)
- validate_table_name!(table)
- client.query(<<~SQL)
- CREATE TABLE IF NOT EXISTS `#{client.escape(table)}` (
- id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
- description TEXT,
- date_received DATE NULL,
- date_received_raw VARCHAR(100),
- address VARCHAR(255) NOT NULL,
- council_reference VARCHAR(100) NOT NULL,
- applicant VARCHAR(255),
- owner VARCHAR(255),
- created_at DATETIME NOT NULL,
- updated_at DATETIME NOT NULL,
- on_notice_to DATE NULL,
- on_notice_to_raw VARCHAR(80) NULL,
- title_reference TEXT NULL,
- property_id TEXT NULL,
- area_sqm DOUBLE NULL,
- area_ha DOUBLE NULL,
- address_std VARCHAR(255) NULL,
- street VARCHAR(120) NULL,
- locality VARCHAR(120) NULL,
- state VARCHAR(10) NULL,
- postcode VARCHAR(10) NULL,
- document_url TEXT NULL,
- local_document_url TEXT NULL,
- lat DECIMAL(10,7) NULL,
- lng DECIMAL(10,7) NULL,
- PRIMARY KEY (id),
- UNIQUE KEY uniq_ref_addr (council_reference, address)
- ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
- SQL
- end
- def self.upsert(table, row)
- validate_table_name!(table)
- columns = [
- :description,
- :date_received,
- :date_received_raw,
- :address,
- :council_reference,
- :applicant,
- :owner,
- :local_document_url,
- :document_url,
- :on_notice_to,
- :on_notice_to_raw,
- :title_reference,
- :property_id,
- :area_sqm,
- :area_ha,
- :address_std,
- :street,
- :locality,
- :state,
- :postcode,
- :lat,
- :lng
- ]
- esc_table = client.escape(table)
- col_names = columns.map { |c| "`#{c}`" }.join(", ")
- placeholders = (["?"] * columns.size).join(", ")
- updates = columns.map { |c|
- case c
- when :date_received
- # write-once: only set if currently NULL
- "`#{c}` = IFNULL(`#{c}`, VALUES(`#{c}`))"
- when :date_received_raw
- # write-once for strings: only set if NULL or ''
- "`#{c}` = CASE WHEN `#{c}` IS NULL OR `#{c}` = '' THEN VALUES(`#{c}`) ELSE `#{c}` END"
- when :document_url, :local_document_url
- # don't blank out existing value if new is NULL
- "`#{c}` = COALESCE(VALUES(`#{c}`), `#{c}`)"
- else
- "`#{c}` = VALUES(`#{c}`)"
- end
- }.join(", ")
- sql = <<~SQL
- INSERT INTO `#{esc_table}` (#{col_names}, created_at, updated_at)
- VALUES (#{placeholders}, NOW(), NOW())
- ON DUPLICATE KEY UPDATE
- #{updates},
- updated_at = NOW()
- SQL
- stmt = client.prepare(sql)
- values = columns.map { |c| row[c] }
- stmt.execute(*values)
- end
- end
|