# tools/classify_pdfs.rb
#
# LLM-based classification of downloaded DA PDFs.
#
# Iterates all da_* tables (or a single table via ONLY_TABLE) looking for rows
# where local_document_url IS NOT NULL and application_type IS NULL, extracts
# text from the PDF with pdftotext, and asks a local Ollama model to classify
# the application type.
#
# Usage:
# ruby /app/tools/classify_pdfs.rb
#
# Environment variables:
# ONLY_TABLE — process a single table, e.g. da_northernmidlands
# RECLASSIFY — set to "1" to overwrite existing application_type values
# LLM_MODEL — Ollama model name (default: llama3.2)
# LLAMA_URL — Ollama base URL (default: http://192.168.8.73:11434)
# DOWNLOAD_DIR — where PDFs are stored (default: /app/downloads)
# DRY_RUN — set to "1" to print classifications without writing to DB
# LOG_LEVEL — debug | info (default) | warn | error
require "json"
require "net/http"
require "open3"
require "uri"
require_relative "../lib/db"
require_relative "../lib/log"
LLAMA_URL = ENV.fetch("LLAMA_URL", "http://192.168.8.73:11434")
LLM_MODEL = ENV.fetch("LLM_MODEL", "llama3.2")
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")
DRY_RUN = ENV["DRY_RUN"] == "1"
RECLASSIFY = ENV["RECLASSIFY"] == "1"
ONLY_TABLE = ENV["ONLY_TABLE"]
APPLICATION_TYPES = %w[
Residential
Commercial
Industrial
Subdivision
Rural/Agriculture
Tourism/Visitor\ Accommodation
Outbuilding/Shed
Change\ of\ Use
Demolition
Signage
Other
].freeze
PROMPT_TEMPLATE = <<~PROMPT
You are classifying a Tasmanian planning development application.
Read the following text and return ONLY the single most appropriate
application type from this list:
Residential, Commercial, Industrial, Subdivision, Rural/Agriculture,
Tourism/Visitor Accommodation, Outbuilding/Shed, Change of Use,
Demolition, Signage, Other
Text:
%s
Reply with the type only. No explanation.
PROMPT
# ---------------------------------------------------------------------------
# PDF text extraction
# ---------------------------------------------------------------------------
def extract_pdf_text(local_url, max_chars: 2000)
# local_url is like "/files/northernmidlands/PLN-26-0030/doc.pdf"
# Map to filesystem path: /files/ → DOWNLOAD_DIR/
fs_path = local_url.to_s.sub(%r{\A/files/}, "#{DOWNLOAD_DIR}/")
unless File.exist?(fs_path)
Log.warn "classify", "PDF not found: #{fs_path}"
return nil
end
# -l 3: only first 3 pages (sufficient for cover/description page)
text, status = Open3.capture2("pdftotext", "-l", "3", fs_path, "-")
unless status.success?
Log.warn "classify", "pdftotext failed (exit #{status.exitstatus}) for #{fs_path}"
return nil
end
text.to_s.gsub(/\s+/, " ").strip[0, max_chars]
rescue StandardError => e
Log.warn "classify", "extract_pdf_text error for #{fs_path}: #{e.class} #{e.message}"
nil
end
# ---------------------------------------------------------------------------
# LLM call
# ---------------------------------------------------------------------------
def llm_classify(text)
prompt = PROMPT_TEMPLATE % text
uri = URI("#{LLAMA_URL}/api/generate")
body = JSON.generate(model: LLM_MODEL, prompt: prompt, stream: false)
res = Net::HTTP.start(uri.host, uri.port, open_timeout: 10, read_timeout: 120) do |http|
http.post(uri.path, body, "Content-Type" => "application/json")
end
unless res.is_a?(Net::HTTPSuccess)
Log.warn "classify", "Ollama returned #{res.code}: #{res.body.to_s[0, 200]}"
return nil
end
JSON.parse(res.body)["response"].to_s.strip
rescue StandardError => e
Log.warn "classify", "LLM error: #{e.class} #{e.message}"
nil
end
# ---------------------------------------------------------------------------
# Normalise raw LLM response to one of the known types
# ---------------------------------------------------------------------------
def normalise_type(raw)
return nil if raw.nil? || raw.strip.empty?
# Strip any ... tags (Qwen3 thinking mode artefact)
cleaned = raw.gsub(/.*?<\/think>/m, "").strip
# Strip leading/trailing punctuation and whitespace
candidate = cleaned.split(/[\n.!?]/).first.to_s.strip
# Case-insensitive match against known types
APPLICATION_TYPES.find { |t| t.casecmp?(candidate) } ||
APPLICATION_TYPES.find { |t| candidate.downcase.include?(t.downcase) } ||
"Other"
end
# ---------------------------------------------------------------------------
# Tables to process
# ---------------------------------------------------------------------------
def target_tables
if ONLY_TABLE
DB.validate_table_name!(ONLY_TABLE)
[ONLY_TABLE]
else
rs = DB.client.query("SHOW TABLES LIKE 'da\\_%'")
rs.map { |r| r.values.first }
end
end
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
Log.info "classify", "Starting PDF classification (model: #{LLM_MODEL}, reclassify: #{RECLASSIFY}, dry_run: #{DRY_RUN})"
total_classified = 0
total_skipped = 0
total_errors = 0
target_tables.each do |table|
esc = DB.client.escape(table)
# Check the table has the classification columns (may not after a fresh
# migrate if running before migrate has been applied)
begin
DB.client.query("SELECT application_type FROM `#{esc}` LIMIT 0")
rescue Mysql2::Error
Log.warn "classify", "#{table} missing application_type column — run migrate.rb first"
next
end
condition = RECLASSIFY ? "local_document_url IS NOT NULL" \
: "local_document_url IS NOT NULL AND application_type IS NULL"
rows = DB.client.query(
"SELECT id, council_reference, address, local_document_url FROM `#{esc}` WHERE #{condition}",
symbolize_keys: true
).to_a
next if rows.empty?
Log.info "classify", "#{table}: #{rows.length} row(s) to classify"
rows.each do |row|
ref = row[:council_reference]
local_url = row[:local_document_url]
text = extract_pdf_text(local_url)
if text.nil? || text.strip.empty?
Log.warn "classify", "#{table} #{ref}: no text extracted from #{local_url}"
total_errors += 1
next
end
Log.info "classify", "#{table} #{ref}: extracted #{text.length} chars, calling LLM..."
raw_response = llm_classify(text)
if raw_response.nil?
Log.warn "classify", "#{table} #{ref}: LLM returned nil"
total_errors += 1
next
end
classified = normalise_type(raw_response)
Log.info "classify", "#{table} #{ref}: #{classified.inspect} (raw: #{raw_response.to_s[0, 80].inspect})"
unless DRY_RUN
update_sql = <<~SQL
UPDATE `#{esc}`
SET application_type = ?,
application_type_raw = ?,
application_type_at = NOW()
WHERE id = ?
SQL
stmt = DB.client.prepare(update_sql)
stmt.execute(classified, raw_response[0, 65_535], row[:id])
end
total_classified += 1
end
rescue Mysql2::Error => e
Log.warn "classify", "DB error on #{table}: #{e.class} #{e.message}"
total_errors += 1
end
puts "Done. Classified: #{total_classified}, Skipped/no-text: #{total_skipped}, Errors: #{total_errors}#{DRY_RUN ? " (DRY RUN)" : ""}"