import_sqlites.rb 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. #!/usr/bin/env ruby
  2. require "sqlite3"
  3. require "time"
  4. require_relative "../lib/db"
  5. require_relative "../lib/util"
  6. SRC_DIR = ARGV[0] || "/app/sqlite_in"
  7. ONLY_TABLE = ENV["ONLY_TABLE"].to_s.strip
  8. DRY_RUN = ENV["DRY_RUN"] == "1"
  9. def say(s) puts s end
  10. def warnx(s) $stderr.puts s end
  11. def mysql_table_for(filename)
  12. base = File.basename(filename, ".sqlite").downcase
  13. "da_" + base.gsub(/[^a-z0-9]+/, "_")
  14. end
  15. def columns_for(db, table)
  16. db.execute("PRAGMA table_info(#{table})").map { |row| row[1].to_s }
  17. end
  18. def sql_ident(c)
  19. # Quote identifiers that have spaces or caps
  20. c =~ /\A[a-z_][a-z0-9_]*\z/i ? c : %Q{"#{c.gsub('"','""')}"}
  21. end
  22. def detect_source_table(db)
  23. preferred = %w[scraped_data data applications]
  24. all = db.execute("SELECT name FROM sqlite_master WHERE type='table'").map { |r| r[0].to_s }
  25. preferred.each { |t| return t if all.include?(t) }
  26. all.each do |t|
  27. cols = columns_for(db, t)
  28. if cols.any? { |c| c =~ /\baddress\b/i } && cols.any? { |c| c =~ /\bcouncil[_ ]?reference\b/i }
  29. return t
  30. end
  31. end
  32. all.first
  33. end
  34. def pick(cols, *candidates)
  35. candidates.find { |c| c && cols.include?(c) }
  36. end
  37. def build_select(db, src_table)
  38. cols = columns_for(db, src_table)
  39. mapping = {
  40. desc_col: pick(cols, "description", "type_of_work", "Type of Work"),
  41. date_col: pick(cols, "date_received", "date_lodged", "Date Lodged", "date_scraped"),
  42. addr_col: pick(cols, "address", "Address"),
  43. ref_col: pick(cols, "council_reference", "application_no", "application", "Application No.", "Application"),
  44. appl_col: pick(cols, "applicant", "Applicant"),
  45. owner_col: pick(cols, "owner", "Owner"),
  46. notice_col: pick(cols, "on_notice_to", "on_notice", "close_date", "Closes"),
  47. notice_raw_col: pick(cols, "on_notice_to_raw", "on_notice_raw", "close_date_raw"),
  48. title_col: pick(cols, "title_reference", "title", "name"),
  49. doc_col: pick(cols, "document_url", "document", "url", "info_url")
  50. }
  51. unless mapping[:addr_col] && mapping[:ref_col]
  52. raise "Source table #{src_table} missing address or council_reference columns"
  53. end
  54. sel = []
  55. sel << "#{mapping[:desc_col]} AS description" if mapping[:desc_col]
  56. #sel << "#{mapping[:date_col]} AS date_received" if mapping[:date_col]
  57. # Build a COALESCE(...) AS date_received from any present date columns
  58. date_candidates = %w[date_received date_lodged Date\ Lodged date_scraped].select { |c| cols.include?(c) }
  59. if date_candidates.any?
  60. # Treat empty strings as NULL so blanks don’t block fallbacks
  61. expr = date_candidates.map { |c| "NULLIF(#{sql_ident(c)}, '')" }.join(", ")
  62. sel << "COALESCE(#{expr}) AS date_received"
  63. end
  64. sel << "#{mapping[:addr_col]} AS address"
  65. sel << "#{mapping[:ref_col]} AS council_reference"
  66. sel << "#{mapping[:appl_col]} AS applicant" if mapping[:appl_col]
  67. sel << "#{mapping[:owner_col]} AS owner" if mapping[:owner_col]
  68. sel << "#{mapping[:notice_col]} AS on_notice_to" if mapping[:notice_col]
  69. sel << "#{mapping[:notice_raw_col]} AS on_notice_to_raw" if mapping[:notice_raw_col]
  70. sel << "#{mapping[:title_col]} AS title_reference" if mapping[:title_col]
  71. sel << "#{mapping[:doc_col]} AS document_url" if mapping[:doc_col]
  72. ["SELECT #{sel.join(", ")} FROM #{src_table}", mapping]
  73. end
  74. def ensure_target_table!(table)
  75. DB.ensure_table!(table)
  76. end
  77. def normalize_date(v)
  78. s = v.to_s.strip
  79. return nil if s.empty?
  80. Util.parse_aus_date(s) || (Date.parse(s) rescue nil)
  81. end
  82. def import_file(path)
  83. target_table = mysql_table_for(path)
  84. return if ONLY_TABLE != "" && target_table != ONLY_TABLE
  85. say "Importing #{File.basename(path)} → #{target_table}"
  86. ensure_target_table!(target_table)
  87. SQLite3::Database.new(path) do |sdb|
  88. sdb.results_as_hash = true
  89. src_table = detect_source_table(sdb)
  90. sql, _mapping = build_select(sdb, src_table)
  91. count = 0
  92. sdb.execute(sql) do |row|
  93. description = row["description"].to_s
  94. date_raw = row["date_received"].to_s
  95. address = row["address"].to_s[0,255]
  96. ref = row["council_reference"].to_s[0,100]
  97. applicant = row["applicant"].to_s
  98. owner = row["owner"].to_s
  99. on_notice_to_raw = row["on_notice_to_raw"].to_s
  100. on_notice_to_val = row["on_notice_to"]
  101. on_notice_date = normalize_date(on_notice_to_val || on_notice_to_raw)
  102. title_reference = row["title_reference"].to_s
  103. document_url = row["document_url"].to_s
  104. description = "Development Application" if description.strip.empty?
  105. if DRY_RUN && count < 5
  106. say " [dry] ref=#{ref.inspect} addr=#{address.inspect} date=#{date_raw.inspect}"
  107. end
  108. unless DRY_RUN
  109. DB.upsert(target_table, {
  110. description: description,
  111. date_received: normalize_date(date_raw),
  112. date_received_raw: date_raw,
  113. address: address,
  114. council_reference: ref,
  115. applicant: applicant,
  116. owner: owner
  117. })
  118. begin
  119. upd = DB.client.prepare(
  120. "UPDATE `#{DB.client.escape(target_table)}` " \
  121. "SET on_notice_to = ?, on_notice_to_raw = ?, title_reference = ?, document_url = ? " \
  122. "WHERE council_reference = ? AND address = ?"
  123. )
  124. upd.execute(on_notice_date, on_notice_to_raw, title_reference, document_url, ref, address)
  125. rescue => e
  126. warnx " extras update skipped for #{ref}: #{e.class} #{e.message}"
  127. end
  128. end
  129. count += 1
  130. end
  131. say " Imported #{count} row(s)"
  132. end
  133. rescue => e
  134. warnx " ERROR importing #{File.basename(path)}: #{e.class} #{e.message}"
  135. end
  136. unless Dir.exist?(SRC_DIR)
  137. warnx "Source dir not found: #{SRC_DIR}"
  138. exit 1
  139. end
  140. files = Dir.glob(File.join(SRC_DIR, "*.sqlite")).sort
  141. if files.empty?
  142. warnx "No .sqlite files in #{SRC_DIR}"
  143. exit 0
  144. end
  145. files.each { |f| import_file(f) }
  146. say "Done."