| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
- # tools/classify_pdfs.rb
- #
- # LLM-based classification of downloaded DA PDFs.
- #
- # Iterates all da_* tables (or a single table via ONLY_TABLE) looking for rows
- # where local_document_url IS NOT NULL and application_type IS NULL, extracts
- # text from the PDF with pdftotext, and asks a local Ollama model to classify
- # the application type.
- #
- # Usage:
- # ruby /app/tools/classify_pdfs.rb
- #
- # Environment variables:
- # ONLY_TABLE — process a single table, e.g. da_northernmidlands
- # RECLASSIFY — set to "1" to overwrite existing application_type values
- # LLM_MODEL — Ollama model name (default: llama3.2)
- # LLAMA_URL — Ollama base URL (default: http://192.168.8.73:11434)
- # DOWNLOAD_DIR — where PDFs are stored (default: /app/downloads)
- # DRY_RUN — set to "1" to print classifications without writing to DB
- # LOG_LEVEL — debug | info (default) | warn | error
- require "json"
- require "net/http"
- require "open3"
- require "uri"
- require_relative "../lib/db"
- require_relative "../lib/log"
- LLAMA_URL = ENV.fetch("LLAMA_URL", "http://192.168.8.73:11434")
- LLM_MODEL = ENV.fetch("LLM_MODEL", "llama3.2")
- DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")
- DRY_RUN = ENV["DRY_RUN"] == "1"
- RECLASSIFY = ENV["RECLASSIFY"] == "1"
- ONLY_TABLE = ENV["ONLY_TABLE"]
- APPLICATION_TYPES = %w[
- Residential
- Commercial
- Industrial
- Subdivision
- Rural/Agriculture
- Tourism/Visitor\ Accommodation
- Outbuilding/Shed
- Change\ of\ Use
- Demolition
- Signage
- Other
- ].freeze
- PROMPT_TEMPLATE = <<~PROMPT
- You are classifying a Tasmanian planning development application.
- Read the following text and return ONLY the single most appropriate
- application type from this list:
- Residential, Commercial, Industrial, Subdivision, Rural/Agriculture,
- Tourism/Visitor Accommodation, Outbuilding/Shed, Change of Use,
- Demolition, Signage, Other
- Text:
- %s
- Reply with the type only. No explanation.
- PROMPT
- # ---------------------------------------------------------------------------
- # PDF text extraction
- # ---------------------------------------------------------------------------
- def extract_pdf_text(local_url, max_chars: 2000)
- # local_url is like "/files/northernmidlands/PLN-26-0030/doc.pdf"
- # Map to filesystem path: /files/ → DOWNLOAD_DIR/
- fs_path = local_url.to_s.sub(%r{\A/files/}, "#{DOWNLOAD_DIR}/")
- unless File.exist?(fs_path)
- Log.warn "classify", "PDF not found: #{fs_path}"
- return nil
- end
- # -l 3: only first 3 pages (sufficient for cover/description page)
- text, status = Open3.capture2("pdftotext", "-l", "3", fs_path, "-")
- unless status.success?
- Log.warn "classify", "pdftotext failed (exit #{status.exitstatus}) for #{fs_path}"
- return nil
- end
- text.to_s.gsub(/\s+/, " ").strip[0, max_chars]
- rescue StandardError => e
- Log.warn "classify", "extract_pdf_text error for #{fs_path}: #{e.class} #{e.message}"
- nil
- end
- # ---------------------------------------------------------------------------
- # LLM call
- # ---------------------------------------------------------------------------
- def llm_classify(text)
- prompt = PROMPT_TEMPLATE % text
- uri = URI("#{LLAMA_URL}/api/generate")
- body = JSON.generate(model: LLM_MODEL, prompt: prompt, stream: false)
- res = Net::HTTP.start(uri.host, uri.port, open_timeout: 10, read_timeout: 120) do |http|
- http.post(uri.path, body, "Content-Type" => "application/json")
- end
- unless res.is_a?(Net::HTTPSuccess)
- Log.warn "classify", "Ollama returned #{res.code}: #{res.body.to_s[0, 200]}"
- return nil
- end
- JSON.parse(res.body)["response"].to_s.strip
- rescue StandardError => e
- Log.warn "classify", "LLM error: #{e.class} #{e.message}"
- nil
- end
- # ---------------------------------------------------------------------------
- # Normalise raw LLM response to one of the known types
- # ---------------------------------------------------------------------------
- def normalise_type(raw)
- return nil if raw.nil? || raw.strip.empty?
- # Strip any <think>...</think> tags (Qwen3 thinking mode artefact)
- cleaned = raw.gsub(/<think>.*?<\/think>/m, "").strip
- # Strip leading/trailing punctuation and whitespace
- candidate = cleaned.split(/[\n.!?]/).first.to_s.strip
- # Case-insensitive match against known types
- APPLICATION_TYPES.find { |t| t.casecmp?(candidate) } ||
- APPLICATION_TYPES.find { |t| candidate.downcase.include?(t.downcase) } ||
- "Other"
- end
- # ---------------------------------------------------------------------------
- # Tables to process
- # ---------------------------------------------------------------------------
- def target_tables
- if ONLY_TABLE
- DB.validate_table_name!(ONLY_TABLE)
- [ONLY_TABLE]
- else
- rs = DB.client.query("SHOW TABLES LIKE 'da\\_%'")
- rs.map { |r| r.values.first }
- end
- end
- # ---------------------------------------------------------------------------
- # Main
- # ---------------------------------------------------------------------------
- Log.info "classify", "Starting PDF classification (model: #{LLM_MODEL}, reclassify: #{RECLASSIFY}, dry_run: #{DRY_RUN})"
- total_classified = 0
- total_skipped = 0
- total_errors = 0
- target_tables.each do |table|
- esc = DB.client.escape(table)
- # Check the table has the classification columns (may not after a fresh
- # migrate if running before migrate has been applied)
- begin
- DB.client.query("SELECT application_type FROM `#{esc}` LIMIT 0")
- rescue Mysql2::Error
- Log.warn "classify", "#{table} missing application_type column — run migrate.rb first"
- next
- end
- condition = RECLASSIFY ? "local_document_url IS NOT NULL" \
- : "local_document_url IS NOT NULL AND application_type IS NULL"
- rows = DB.client.query(
- "SELECT id, council_reference, address, local_document_url FROM `#{esc}` WHERE #{condition}",
- symbolize_keys: true
- ).to_a
- next if rows.empty?
- Log.info "classify", "#{table}: #{rows.length} row(s) to classify"
- rows.each do |row|
- ref = row[:council_reference]
- local_url = row[:local_document_url]
- text = extract_pdf_text(local_url)
- if text.nil? || text.strip.empty?
- Log.warn "classify", "#{table} #{ref}: no text extracted from #{local_url}"
- total_errors += 1
- next
- end
- Log.info "classify", "#{table} #{ref}: extracted #{text.length} chars, calling LLM..."
- raw_response = llm_classify(text)
- if raw_response.nil?
- Log.warn "classify", "#{table} #{ref}: LLM returned nil"
- total_errors += 1
- next
- end
- classified = normalise_type(raw_response)
- Log.info "classify", "#{table} #{ref}: #{classified.inspect} (raw: #{raw_response.to_s[0, 80].inspect})"
- unless DRY_RUN
- update_sql = <<~SQL
- UPDATE `#{esc}`
- SET application_type = ?,
- application_type_raw = ?,
- application_type_at = NOW()
- WHERE id = ?
- SQL
- stmt = DB.client.prepare(update_sql)
- stmt.execute(classified, raw_response[0, 65_535], row[:id])
- end
- total_classified += 1
- end
- rescue Mysql2::Error => e
- Log.warn "classify", "DB error on #{table}: #{e.class} #{e.message}"
- total_errors += 1
- end
- puts "Done. Classified: #{total_classified}, Skipped/no-text: #{total_skipped}, Errors: #{total_errors}#{DRY_RUN ? " (DRY RUN)" : ""}"
|