# tools/classify_pdfs.rb
#
# LLM-based classification of downloaded DA PDFs.
#
# Iterates all da_* tables (or a single table via ONLY_TABLE) looking for rows
# where local_document_url IS NOT NULL and application_type IS NULL, extracts
# text from the PDF with pdftotext, and asks a local Ollama model to classify
# the application type.
#
# Usage:
#   ruby /app/tools/classify_pdfs.rb
#
# Environment variables:
#   ONLY_TABLE   — process a single table, e.g. da_northernmidlands
#   RECLASSIFY   — set to "1" to overwrite existing application_type values
#   LLM_MODEL    — Ollama model name (default: llama3.2)
#   LLAMA_URL    — Ollama base URL (default: http://192.168.8.73:11434)
#   DOWNLOAD_DIR — where PDFs are stored (default: /app/downloads)
#   DRY_RUN      — set to "1" to print classifications without writing to DB
#   LOG_LEVEL    — debug | info (default) | warn | error
  require "json"
  require "net/http"
  require "open3"
  require "uri"
  require_relative "../lib/db"
  require_relative "../lib/log"
  # Runtime configuration, read once from the environment when the script loads.
  # Ollama base URL; "/api/generate" is appended to it for each request.
  LLAMA_URL = ENV.fetch("LLAMA_URL", "http://192.168.8.73:11434")
  # Name of the Ollama model asked to classify each document.
  LLM_MODEL = ENV.fetch("LLM_MODEL", "llama3.2")
  # Filesystem directory that the "/files/" prefix of local_document_url maps to.
  DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")
  # When true, classifications are logged but no UPDATE is issued.
  DRY_RUN = ENV["DRY_RUN"] == "1"
  # When true, rows that already have an application_type are processed again.
  RECLASSIFY = ENV["RECLASSIFY"] == "1"
  # Optional single table to process; nil when the variable is unset.
  ONLY_TABLE = ENV["ONLY_TABLE"]
  # The closed set of labels a document may be classified as. Written as an
  # explicit array (rather than %w with escaped spaces) so multi-word labels
  # are unambiguous. The last entry, "Other", is also the fallback label used
  # when a model reply matches nothing.
  APPLICATION_TYPES = [
    "Residential",
    "Commercial",
    "Industrial",
    "Subdivision",
    "Rural/Agriculture",
    "Tourism/Visitor Accommodation",
    "Outbuilding/Shed",
    "Change of Use",
    "Demolition",
    "Signage",
    "Other"
  ].freeze
  # Prompt sent to the model. The single %s placeholder is filled with the text
  # extracted from the PDF. The list of types below is written out by hand and
  # must be kept in sync with APPLICATION_TYPES. Frozen so the shared template
  # cannot be mutated in place.
  PROMPT_TEMPLATE = <<~PROMPT.freeze
    You are classifying a Tasmanian planning development application.
    Read the following text and return ONLY the single most appropriate
    application type from this list:
    Residential, Commercial, Industrial, Subdivision, Rural/Agriculture,
    Tourism/Visitor Accommodation, Outbuilding/Shed, Change of Use,
    Demolition, Signage, Other
    Text:
    %s
    Reply with the type only. No explanation.
  PROMPT
  # ---------------------------------------------------------------------------
  # PDF text extraction
  # ---------------------------------------------------------------------------

  # Extracts whitespace-normalised text from the first pages of a downloaded PDF
  # by shelling out to pdftotext.
  #
  # @param local_url [String] value of local_document_url, e.g.
  #   "/files/northernmidlands/PLN-26-0030/doc.pdf"; a leading "/files/" is
  #   mapped onto DOWNLOAD_DIR to obtain the filesystem path.
  # @param max_chars [Integer] maximum number of characters returned.
  # @return [String, nil] the extracted text (may be empty), or nil when the
  #   file does not exist, pdftotext exits unsuccessfully, or any StandardError
  #   is raised along the way (each case is logged as a warning).
  def extract_pdf_text(local_url, max_chars: 2000)
    fs_path = local_url.to_s.sub(%r{\A/files/}, "#{DOWNLOAD_DIR}/")
    unless File.exist?(fs_path)
      Log.warn "classify", "PDF not found: #{fs_path}"
      return nil
    end

    # -l 3: only first 3 pages (sufficient for cover/description page)
    text, status = Open3.capture2("pdftotext", "-l", "3", fs_path, "-")
    unless status.success?
      Log.warn "classify", "pdftotext failed (exit #{status.exitstatus}) for #{fs_path}"
      return nil
    end

    # The captured bytes are tagged with the process's default external encoding
    # and are not guaranteed to be valid in it; a regexp gsub on an invalid
    # string raises ArgumentError, which would discard the whole document.
    # Treat the output as UTF-8 (assumed to be pdftotext's output encoding —
    # confirm if -enc is ever passed) and replace undecodable bytes.
    utf8 = text.to_s.dup.force_encoding(Encoding::UTF_8).scrub
    utf8.gsub(/\s+/, " ").strip[0, max_chars]
  rescue StandardError => e
    Log.warn "classify", "extract_pdf_text error for #{fs_path}: #{e.class} #{e.message}"
    nil
  end
  # ---------------------------------------------------------------------------
  # LLM call
  # ---------------------------------------------------------------------------

  # Sends the extracted text to the Ollama generate endpoint and returns the
  # model's raw answer.
  #
  # @param text [String] document text substituted into PROMPT_TEMPLATE.
  # @return [String, nil] the stripped "response" field of the JSON reply (an
  #   empty string when the field is absent), or nil when the server answers
  #   with a non-success status or any StandardError is raised (both logged).
  def llm_classify(text)
    prompt = PROMPT_TEMPLATE % text
    # Drop trailing slashes from the base URL so "http://host:11434/" does not
    # produce the path "//api/generate".
    uri = URI("#{LLAMA_URL.sub(%r{/+\z}, "")}/api/generate")
    body = JSON.generate(model: LLM_MODEL, prompt: prompt, stream: false)

    # use_ssl follows the URL scheme so an https base URL is spoken to over TLS.
    res = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https",
                                              open_timeout: 10, read_timeout: 120) do |http|
      http.post(uri.path, body, "Content-Type" => "application/json")
    end

    unless res.is_a?(Net::HTTPSuccess)
      Log.warn "classify", "Ollama returned #{res.code}: #{res.body.to_s[0, 200]}"
      return nil
    end

    JSON.parse(res.body)["response"].to_s.strip
  rescue StandardError => e
    Log.warn "classify", "LLM error: #{e.class} #{e.message}"
    nil
  end
  # ---------------------------------------------------------------------------
  # Normalise raw LLM response to one of the known types
  # ---------------------------------------------------------------------------

  # Maps a free-form model reply onto one of APPLICATION_TYPES.
  #
  # The reply is split into segments on newlines and sentence punctuation; the
  # first segment that names a known type wins. Looking past the first segment
  # matters for replies such as "1. Residential", whose first segment is "1".
  #
  # @param raw [String, nil] raw model reply.
  # @return [String, nil] nil when raw is nil or blank; otherwise the matching
  #   entry of APPLICATION_TYPES, or "Other" when no segment matches.
  def normalise_type(raw)
    return nil if raw.nil? || raw.strip.empty?

    # Strip any <think>...</think> tags (Qwen3 thinking mode artefact)
    cleaned = raw.gsub(%r{<think>.*?</think>}m, "").strip
    segments = cleaned.split(/[\n.!?]/).map(&:strip).reject(&:empty?)

    segments.each do |candidate|
      lowered = candidate.downcase
      # Exact (case-insensitive) match first, then "type appears somewhere in
      # the segment", trying types in APPLICATION_TYPES order.
      match = APPLICATION_TYPES.find { |t| t.casecmp?(candidate) } ||
              APPLICATION_TYPES.find { |t| lowered.include?(t.downcase) }
      return match if match
    end

    "Other"
  end
  # ---------------------------------------------------------------------------
  # Tables to process
  # ---------------------------------------------------------------------------

  # Lists the tables this run should classify.
  #
  # @return [Array<String>] the single ONLY_TABLE name (after it passes
  #   DB.validate_table_name!) when that constant is set to a non-blank value;
  #   otherwise every table name returned by SHOW TABLES LIKE 'da\_%'.
  def target_tables
    # A variable that is set but empty (e.g. ONLY_TABLE= in an env file) means
    # "no restriction", not "a table whose name is the empty string".
    if ONLY_TABLE && !ONLY_TABLE.strip.empty?
      DB.validate_table_name!(ONLY_TABLE)
      return [ONLY_TABLE]
    end

    # The backslash escapes "_" so LIKE treats it literally, not as a wildcard.
    rs = DB.client.query("SHOW TABLES LIKE 'da\\_%'")
    # Each result row is a one-entry hash; its only value is the table name.
    rs.map { |r| r.values.first }
  end
  # ---------------------------------------------------------------------------
  # Main
  # ---------------------------------------------------------------------------
  # For every target table: select the rows to classify, extract text from each
  # row's PDF, ask the model for a type, normalise the answer and (unless
  # DRY_RUN) store it. Counters feed the one-line summary printed at the end.
  Log.info "classify", "Starting PDF classification (model: #{LLM_MODEL}, reclassify: #{RECLASSIFY}, dry_run: #{DRY_RUN})"

  total_classified = 0
  total_skipped = 0 # rows whose PDF yielded no usable text
  total_errors = 0  # LLM failures, unusable answers and per-table DB errors

  target_tables.each do |table|
    esc = DB.client.escape(table)
    # Prepared lazily on the first write and reused for every row of this
    # table; closed in the ensure clause so statements do not pile up.
    stmt = nil

    # Check the table has the classification columns (may not after a fresh
    # migrate if running before migrate has been applied)
    begin
      DB.client.query("SELECT application_type FROM `#{esc}` LIMIT 0")
    rescue Mysql2::Error
      Log.warn "classify", "#{table} missing application_type column — run migrate.rb first"
      next
    end

    condition = if RECLASSIFY
                  "local_document_url IS NOT NULL"
                else
                  "local_document_url IS NOT NULL AND application_type IS NULL"
                end

    rows = DB.client.query(
      "SELECT id, council_reference, address, local_document_url FROM `#{esc}` WHERE #{condition}",
      symbolize_keys: true
    ).to_a
    next if rows.empty?

    Log.info "classify", "#{table}: #{rows.length} row(s) to classify"

    rows.each do |row|
      ref = row[:council_reference]
      local_url = row[:local_document_url]

      text = extract_pdf_text(local_url)
      if text.nil? || text.strip.empty?
        Log.warn "classify", "#{table} #{ref}: no text extracted from #{local_url}"
        # Reported under "Skipped/no-text" in the summary line.
        total_skipped += 1
        next
      end

      Log.info "classify", "#{table} #{ref}: extracted #{text.length} chars, calling LLM..."
      raw_response = llm_classify(text)
      if raw_response.nil?
        Log.warn "classify", "#{table} #{ref}: LLM returned nil"
        total_errors += 1
        next
      end

      classified = normalise_type(raw_response)
      # normalise_type returns nil for a blank reply; writing that would store
      # NULL and count the row as classified, so treat it as a failure instead.
      if classified.nil?
        Log.warn "classify", "#{table} #{ref}: LLM returned an empty response"
        total_errors += 1
        next
      end

      Log.info "classify", "#{table} #{ref}: #{classified.inspect} (raw: #{raw_response.to_s[0, 80].inspect})"

      unless DRY_RUN
        stmt ||= DB.client.prepare(<<~SQL)
          UPDATE `#{esc}`
          SET application_type = ?,
          application_type_raw = ?,
          application_type_at = NOW()
          WHERE id = ?
        SQL
        stmt.execute(classified, raw_response[0, 65_535], row[:id])
      end

      total_classified += 1
    end
  rescue Mysql2::Error => e
    # A DB error abandons the rest of this table but lets the run continue.
    Log.warn "classify", "DB error on #{table}: #{e.class} #{e.message}"
    total_errors += 1
  ensure
    stmt&.close
  end

  puts "Done. Classified: #{total_classified}, Skipped/no-text: #{total_skipped}, Errors: #{total_errors}#{DRY_RUN ? " (DRY RUN)" : ""}"