db.rb 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. require "mysql2"
  2. module DB
  3. # MySQL/MariaDB DDL statements (CREATE TABLE, ALTER TABLE) don't support
  4. # parameterized identifiers — only data values can be bound with ?. Using
  5. # client.escape() is the standard workaround for identifiers, but as a
  6. # defence-in-depth measure we also enforce that every table name matches
  7. # the expected da_* pattern before it is ever interpolated into SQL.
  8. SAFE_TABLE_NAME_RE = /\Ada_[a-z0-9_]+\z/
  9. def self.validate_table_name!(name)
  10. unless name.to_s.match?(SAFE_TABLE_NAME_RE)
  11. raise ArgumentError, "Unsafe table name: #{name.inspect} — must match #{SAFE_TABLE_NAME_RE}"
  12. end
  13. name
  14. end
  15. def self.client
  16. @client ||= Mysql2::Client.new(
  17. host: ENV.fetch("MYSQL_HOST", "127.0.0.1"),
  18. port: Integer(ENV.fetch("MYSQL_PORT", "3306")),
  19. database: ENV.fetch("MYSQL_DB", "planning_scrapes"),
  20. username: ENV.fetch("MYSQL_USER", "scraper"),
  21. password: ENV.fetch("MYSQL_PASSWORD", ""),
  22. encoding: "utf8mb4",
  23. reconnect: true
  24. )
  25. end
  26. def self.ensure_table!(table)
  27. validate_table_name!(table)
  28. client.query(<<~SQL)
  29. CREATE TABLE IF NOT EXISTS `#{client.escape(table)}` (
  30. id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
  31. description TEXT,
  32. date_received DATE NULL,
  33. date_received_raw VARCHAR(100),
  34. address VARCHAR(255) NOT NULL,
  35. council_reference VARCHAR(100) NOT NULL,
  36. applicant VARCHAR(255),
  37. owner VARCHAR(255),
  38. created_at DATETIME NOT NULL,
  39. updated_at DATETIME NOT NULL,
  40. on_notice_to DATE NULL,
  41. on_notice_to_raw VARCHAR(80) NULL,
  42. title_reference TEXT NULL,
  43. property_id TEXT NULL,
  44. area_sqm DOUBLE NULL,
  45. area_ha DOUBLE NULL,
  46. address_std VARCHAR(255) NULL,
  47. street VARCHAR(120) NULL,
  48. locality VARCHAR(120) NULL,
  49. state VARCHAR(10) NULL,
  50. postcode VARCHAR(10) NULL,
  51. document_url TEXT NULL,
  52. local_document_url TEXT NULL,
  53. documents_json MEDIUMTEXT NULL,
  54. lat DECIMAL(10,7) NULL,
  55. lng DECIMAL(10,7) NULL,
  56. PRIMARY KEY (id),
  57. UNIQUE KEY uniq_ref_addr (council_reference, address)
  58. ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
  59. SQL
  60. end
  61. # Write-once / merge semantics for specific columns on duplicate key.
  62. # All other columns default to VALUES(`col`) (last-write-wins).
  63. UPSERT_ON_DUP = {
  64. date_received: "`date_received` = IFNULL(`date_received`, VALUES(`date_received`))",
  65. date_received_raw: "`date_received_raw` = CASE WHEN `date_received_raw` IS NULL OR `date_received_raw` = '' THEN VALUES(`date_received_raw`) ELSE `date_received_raw` END",
  66. document_url: "`document_url` = COALESCE(VALUES(`document_url`), `document_url`)",
  67. local_document_url: "`local_document_url` = COALESCE(VALUES(`local_document_url`), `local_document_url`)",
  68. }.freeze
  69. SAFE_COLUMN_RE = /\A[a-z][a-z0-9_]*\z/
  70. def self.upsert(table, row)
  71. validate_table_name!(table)
  72. columns = row.keys
  73. columns.each do |c|
  74. raise ArgumentError, "Unsafe column name: #{c.inspect}" unless c.to_s.match?(SAFE_COLUMN_RE)
  75. end
  76. esc_table = client.escape(table)
  77. col_names = columns.map { |c| "`#{c}`" }.join(", ")
  78. placeholders = (["?"] * columns.size).join(", ")
  79. updates = columns.map { |c| UPSERT_ON_DUP[c.to_sym] || "`#{c}` = VALUES(`#{c}`)" }.join(", ")
  80. sql = <<~SQL
  81. INSERT INTO `#{esc_table}` (#{col_names}, created_at, updated_at)
  82. VALUES (#{placeholders}, NOW(), NOW())
  83. ON DUPLICATE KEY UPDATE #{updates}, updated_at = NOW()
  84. SQL
  85. stmt = client.prepare(sql)
  86. stmt.execute(*columns.map { |c| row[c] })
  87. end
  88. end