# tools/classify_pdfs.rb # # LLM-based classification of downloaded DA PDFs. # # Iterates all da_* tables (or a single table via ONLY_TABLE) looking for rows # where local_document_url IS NOT NULL and application_type IS NULL, extracts # text from the PDF with pdftotext, and asks a local Ollama model to classify # the application type. # # Usage: # ruby /app/tools/classify_pdfs.rb # # Environment variables: # ONLY_TABLE — process a single table, e.g. da_northernmidlands # RECLASSIFY — set to "1" to overwrite existing application_type values # LLM_MODEL — Ollama model name (default: llama3.2) # LLAMA_URL — Ollama base URL (default: http://192.168.8.73:11434) # DOWNLOAD_DIR — where PDFs are stored (default: /app/downloads) # DRY_RUN — set to "1" to print classifications without writing to DB # LOG_LEVEL — debug | info (default) | warn | error require "json" require "net/http" require "open3" require "uri" require_relative "../lib/db" require_relative "../lib/log" LLAMA_URL = ENV.fetch("LLAMA_URL", "http://192.168.8.73:11434") LLM_MODEL = ENV.fetch("LLM_MODEL", "llama3.2") DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads") DRY_RUN = ENV["DRY_RUN"] == "1" RECLASSIFY = ENV["RECLASSIFY"] == "1" ONLY_TABLE = ENV["ONLY_TABLE"] APPLICATION_TYPES = %w[ Residential Commercial Industrial Subdivision Rural/Agriculture Tourism/Visitor\ Accommodation Outbuilding/Shed Change\ of\ Use Demolition Signage Other ].freeze PROMPT_TEMPLATE = <<~PROMPT You are classifying a Tasmanian planning development application. Read the following text and return ONLY the single most appropriate application type from this list: Residential, Commercial, Industrial, Subdivision, Rural/Agriculture, Tourism/Visitor Accommodation, Outbuilding/Shed, Change of Use, Demolition, Signage, Other Text: %s Reply with the type only. No explanation. PROMPT # --------------------------------------------------------------------------- # PDF text extraction # --------------------------------------------------------------------------- def extract_pdf_text(local_url, max_chars: 2000) # local_url is like "/files/northernmidlands/PLN-26-0030/doc.pdf" # Map to filesystem path: /files/ → DOWNLOAD_DIR/ fs_path = local_url.to_s.sub(%r{\A/files/}, "#{DOWNLOAD_DIR}/") unless File.exist?(fs_path) Log.warn "classify", "PDF not found: #{fs_path}" return nil end # -l 3: only first 3 pages (sufficient for cover/description page) text, status = Open3.capture2("pdftotext", "-l", "3", fs_path, "-") unless status.success? Log.warn "classify", "pdftotext failed (exit #{status.exitstatus}) for #{fs_path}" return nil end text.to_s.gsub(/\s+/, " ").strip[0, max_chars] rescue StandardError => e Log.warn "classify", "extract_pdf_text error for #{fs_path}: #{e.class} #{e.message}" nil end # --------------------------------------------------------------------------- # LLM call # --------------------------------------------------------------------------- def llm_classify(text) prompt = PROMPT_TEMPLATE % text uri = URI("#{LLAMA_URL}/api/generate") body = JSON.generate(model: LLM_MODEL, prompt: prompt, stream: false) res = Net::HTTP.start(uri.host, uri.port, open_timeout: 10, read_timeout: 120) do |http| http.post(uri.path, body, "Content-Type" => "application/json") end unless res.is_a?(Net::HTTPSuccess) Log.warn "classify", "Ollama returned #{res.code}: #{res.body.to_s[0, 200]}" return nil end JSON.parse(res.body)["response"].to_s.strip rescue StandardError => e Log.warn "classify", "LLM error: #{e.class} #{e.message}" nil end # --------------------------------------------------------------------------- # Normalise raw LLM response to one of the known types # --------------------------------------------------------------------------- def normalise_type(raw) return nil if raw.nil? || raw.strip.empty? # Strip any ... tags (Qwen3 thinking mode artefact) cleaned = raw.gsub(/.*?<\/think>/m, "").strip # Strip leading/trailing punctuation and whitespace candidate = cleaned.split(/[\n.!?]/).first.to_s.strip # Case-insensitive match against known types APPLICATION_TYPES.find { |t| t.casecmp?(candidate) } || APPLICATION_TYPES.find { |t| candidate.downcase.include?(t.downcase) } || "Other" end # --------------------------------------------------------------------------- # Tables to process # --------------------------------------------------------------------------- def target_tables if ONLY_TABLE DB.validate_table_name!(ONLY_TABLE) [ONLY_TABLE] else rs = DB.client.query("SHOW TABLES LIKE 'da\\_%'") rs.map { |r| r.values.first } end end # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- Log.info "classify", "Starting PDF classification (model: #{LLM_MODEL}, reclassify: #{RECLASSIFY}, dry_run: #{DRY_RUN})" total_classified = 0 total_skipped = 0 total_errors = 0 target_tables.each do |table| esc = DB.client.escape(table) # Check the table has the classification columns (may not after a fresh # migrate if running before migrate has been applied) begin DB.client.query("SELECT application_type FROM `#{esc}` LIMIT 0") rescue Mysql2::Error Log.warn "classify", "#{table} missing application_type column — run migrate.rb first" next end condition = RECLASSIFY ? "local_document_url IS NOT NULL" \ : "local_document_url IS NOT NULL AND application_type IS NULL" rows = DB.client.query( "SELECT id, council_reference, address, local_document_url FROM `#{esc}` WHERE #{condition}", symbolize_keys: true ).to_a next if rows.empty? Log.info "classify", "#{table}: #{rows.length} row(s) to classify" rows.each do |row| ref = row[:council_reference] local_url = row[:local_document_url] text = extract_pdf_text(local_url) if text.nil? || text.strip.empty? Log.warn "classify", "#{table} #{ref}: no text extracted from #{local_url}" total_errors += 1 next end Log.info "classify", "#{table} #{ref}: extracted #{text.length} chars, calling LLM..." raw_response = llm_classify(text) if raw_response.nil? Log.warn "classify", "#{table} #{ref}: LLM returned nil" total_errors += 1 next end classified = normalise_type(raw_response) Log.info "classify", "#{table} #{ref}: #{classified.inspect} (raw: #{raw_response.to_s[0, 80].inspect})" unless DRY_RUN update_sql = <<~SQL UPDATE `#{esc}` SET application_type = ?, application_type_raw = ?, application_type_at = NOW() WHERE id = ? SQL stmt = DB.client.prepare(update_sql) stmt.execute(classified, raw_response[0, 65_535], row[:id]) end total_classified += 1 end rescue Mysql2::Error => e Log.warn "classify", "DB error on #{table}: #{e.class} #{e.message}" total_errors += 1 end puts "Done. Classified: #{total_classified}, Skipped/no-text: #{total_skipped}, Errors: #{total_errors}#{DRY_RUN ? " (DRY RUN)" : ""}"