|
|
@@ -0,0 +1,219 @@
|
|
|
+# tools/classify_pdfs.rb
|
|
|
+#
|
|
|
+# LLM-based classification of downloaded DA PDFs.
|
|
|
+#
|
|
|
+# Iterates all da_* tables (or a single table via ONLY_TABLE) looking for rows
|
|
|
+# where local_document_url IS NOT NULL and application_type IS NULL, extracts
|
|
|
+# text from the PDF with pdftotext, and asks a local Ollama model to classify
|
|
|
+# the application type.
|
|
|
+#
|
|
|
+# Usage:
|
|
|
+# ruby /app/tools/classify_pdfs.rb
|
|
|
+#
|
|
|
+# Environment variables:
|
|
|
+# ONLY_TABLE — process a single table, e.g. da_northernmidlands
|
|
|
+# RECLASSIFY — set to "1" to overwrite existing application_type values
|
|
|
+# LLM_MODEL — Ollama model name (default: llama3.2)
|
|
|
+# LLAMA_URL — Ollama base URL (default: http://192.168.8.73:11434)
|
|
|
+# DOWNLOAD_DIR — where PDFs are stored (default: /app/downloads)
|
|
|
+# DRY_RUN — set to "1" to print classifications without writing to DB
|
|
|
+# LOG_LEVEL — debug | info (default) | warn | error
|
|
|
+
|
|
|
+require "json"
|
|
|
+require "net/http"
|
|
|
+require "open3"
|
|
|
+require "uri"
|
|
|
+
|
|
|
+require_relative "../lib/db"
|
|
|
+require_relative "../lib/log"
|
|
|
+
|
|
|
+LLAMA_URL = ENV.fetch("LLAMA_URL", "http://192.168.8.73:11434")
|
|
|
+LLM_MODEL = ENV.fetch("LLM_MODEL", "llama3.2")
|
|
|
+DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")
|
|
|
+DRY_RUN = ENV["DRY_RUN"] == "1"
|
|
|
+RECLASSIFY = ENV["RECLASSIFY"] == "1"
|
|
|
+ONLY_TABLE = ENV["ONLY_TABLE"]
|
|
|
+
|
|
|
+APPLICATION_TYPES = %w[
|
|
|
+ Residential
|
|
|
+ Commercial
|
|
|
+ Industrial
|
|
|
+ Subdivision
|
|
|
+ Rural/Agriculture
|
|
|
+ Tourism/Visitor\ Accommodation
|
|
|
+ Outbuilding/Shed
|
|
|
+ Change\ of\ Use
|
|
|
+ Demolition
|
|
|
+ Signage
|
|
|
+ Other
|
|
|
+].freeze
|
|
|
+
|
|
|
+PROMPT_TEMPLATE = <<~PROMPT
|
|
|
+ You are classifying a Tasmanian planning development application.
|
|
|
+ Read the following text and return ONLY the single most appropriate
|
|
|
+ application type from this list:
|
|
|
+ Residential, Commercial, Industrial, Subdivision, Rural/Agriculture,
|
|
|
+ Tourism/Visitor Accommodation, Outbuilding/Shed, Change of Use,
|
|
|
+ Demolition, Signage, Other
|
|
|
+
|
|
|
+ Text:
|
|
|
+ %s
|
|
|
+
|
|
|
+ Reply with the type only. No explanation.
|
|
|
+PROMPT
|
|
|
+
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+# PDF text extraction
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+def extract_pdf_text(local_url, max_chars: 2000)
|
|
|
+ # local_url is like "/files/northernmidlands/PLN-26-0030/doc.pdf"
|
|
|
+ # Map to filesystem path: /files/ → DOWNLOAD_DIR/
|
|
|
+ fs_path = local_url.to_s.sub(%r{\A/files/}, "#{DOWNLOAD_DIR}/")
|
|
|
+ unless File.exist?(fs_path)
|
|
|
+ Log.warn "classify", "PDF not found: #{fs_path}"
|
|
|
+ return nil
|
|
|
+ end
|
|
|
+
|
|
|
+ # -l 3: only first 3 pages (sufficient for cover/description page)
|
|
|
+ text, status = Open3.capture2("pdftotext", "-l", "3", fs_path, "-")
|
|
|
+ unless status.success?
|
|
|
+ Log.warn "classify", "pdftotext failed (exit #{status.exitstatus}) for #{fs_path}"
|
|
|
+ return nil
|
|
|
+ end
|
|
|
+
|
|
|
+ text.to_s.gsub(/\s+/, " ").strip[0, max_chars]
|
|
|
+rescue StandardError => e
|
|
|
+ Log.warn "classify", "extract_pdf_text error for #{fs_path}: #{e.class} #{e.message}"
|
|
|
+ nil
|
|
|
+end
|
|
|
+
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+# LLM call
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+def llm_classify(text)
|
|
|
+ prompt = PROMPT_TEMPLATE % text
|
|
|
+ uri = URI("#{LLAMA_URL}/api/generate")
|
|
|
+ body = JSON.generate(model: LLM_MODEL, prompt: prompt, stream: false)
|
|
|
+
|
|
|
+ res = Net::HTTP.start(uri.host, uri.port, open_timeout: 10, read_timeout: 120) do |http|
|
|
|
+ http.post(uri.path, body, "Content-Type" => "application/json")
|
|
|
+ end
|
|
|
+
|
|
|
+ unless res.is_a?(Net::HTTPSuccess)
|
|
|
+ Log.warn "classify", "Ollama returned #{res.code}: #{res.body.to_s[0, 200]}"
|
|
|
+ return nil
|
|
|
+ end
|
|
|
+
|
|
|
+ JSON.parse(res.body)["response"].to_s.strip
|
|
|
+rescue StandardError => e
|
|
|
+ Log.warn "classify", "LLM error: #{e.class} #{e.message}"
|
|
|
+ nil
|
|
|
+end
|
|
|
+
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+# Normalise raw LLM response to one of the known types
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+def normalise_type(raw)
|
|
|
+ return nil if raw.nil? || raw.strip.empty?
|
|
|
+
|
|
|
+ # Strip any <think>...</think> tags (Qwen3 thinking mode artefact)
|
|
|
+ cleaned = raw.gsub(/<think>.*?<\/think>/m, "").strip
|
|
|
+
|
|
|
+ # Strip leading/trailing punctuation and whitespace
|
|
|
+ candidate = cleaned.split(/[\n.!?]/).first.to_s.strip
|
|
|
+
|
|
|
+ # Case-insensitive match against known types
|
|
|
+ APPLICATION_TYPES.find { |t| t.casecmp?(candidate) } ||
|
|
|
+ APPLICATION_TYPES.find { |t| candidate.downcase.include?(t.downcase) } ||
|
|
|
+ "Other"
|
|
|
+end
|
|
|
+
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+# Tables to process
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+def target_tables
|
|
|
+ if ONLY_TABLE
|
|
|
+ DB.validate_table_name!(ONLY_TABLE)
|
|
|
+ [ONLY_TABLE]
|
|
|
+ else
|
|
|
+ rs = DB.client.query("SHOW TABLES LIKE 'da\\_%'")
|
|
|
+ rs.map { |r| r.values.first }
|
|
|
+ end
|
|
|
+end
|
|
|
+
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+# Main
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
+Log.info "classify", "Starting PDF classification (model: #{LLM_MODEL}, reclassify: #{RECLASSIFY}, dry_run: #{DRY_RUN})"
|
|
|
+
|
|
|
+total_classified = 0
|
|
|
+total_skipped = 0
|
|
|
+total_errors = 0
|
|
|
+
|
|
|
+target_tables.each do |table|
|
|
|
+ esc = DB.client.escape(table)
|
|
|
+
|
|
|
+ # Check the table has the classification columns (may not after a fresh
|
|
|
+ # migrate if running before migrate has been applied)
|
|
|
+ begin
|
|
|
+ DB.client.query("SELECT application_type FROM `#{esc}` LIMIT 0")
|
|
|
+ rescue Mysql2::Error
|
|
|
+ Log.warn "classify", "#{table} missing application_type column — run migrate.rb first"
|
|
|
+ next
|
|
|
+ end
|
|
|
+
|
|
|
+ condition = RECLASSIFY ? "local_document_url IS NOT NULL" \
|
|
|
+ : "local_document_url IS NOT NULL AND application_type IS NULL"
|
|
|
+
|
|
|
+ rows = DB.client.query(
|
|
|
+ "SELECT id, council_reference, address, local_document_url FROM `#{esc}` WHERE #{condition}",
|
|
|
+ symbolize_keys: true
|
|
|
+ ).to_a
|
|
|
+
|
|
|
+ next if rows.empty?
|
|
|
+
|
|
|
+ Log.info "classify", "#{table}: #{rows.length} row(s) to classify"
|
|
|
+
|
|
|
+ rows.each do |row|
|
|
|
+ ref = row[:council_reference]
|
|
|
+ local_url = row[:local_document_url]
|
|
|
+
|
|
|
+ text = extract_pdf_text(local_url)
|
|
|
+ if text.nil? || text.strip.empty?
|
|
|
+ Log.warn "classify", "#{table} #{ref}: no text extracted from #{local_url}"
|
|
|
+ total_errors += 1
|
|
|
+ next
|
|
|
+ end
|
|
|
+
|
|
|
+ Log.info "classify", "#{table} #{ref}: extracted #{text.length} chars, calling LLM..."
|
|
|
+ raw_response = llm_classify(text)
|
|
|
+
|
|
|
+ if raw_response.nil?
|
|
|
+ Log.warn "classify", "#{table} #{ref}: LLM returned nil"
|
|
|
+ total_errors += 1
|
|
|
+ next
|
|
|
+ end
|
|
|
+
|
|
|
+ classified = normalise_type(raw_response)
|
|
|
+ Log.info "classify", "#{table} #{ref}: #{classified.inspect} (raw: #{raw_response.to_s[0, 80].inspect})"
|
|
|
+
|
|
|
+ unless DRY_RUN
|
|
|
+ update_sql = <<~SQL
|
|
|
+ UPDATE `#{esc}`
|
|
|
+ SET application_type = ?,
|
|
|
+ application_type_raw = ?,
|
|
|
+ application_type_at = NOW()
|
|
|
+ WHERE id = ?
|
|
|
+ SQL
|
|
|
+ stmt = DB.client.prepare(update_sql)
|
|
|
+ stmt.execute(classified, raw_response[0, 65_535], row[:id])
|
|
|
+ end
|
|
|
+
|
|
|
+ total_classified += 1
|
|
|
+ end
|
|
|
+rescue Mysql2::Error => e
|
|
|
+ Log.warn "classify", "DB error on #{table}: #{e.class} #{e.message}"
|
|
|
+ total_errors += 1
|
|
|
+end
|
|
|
+
|
|
|
+puts "Done. Classified: #{total_classified}, Skipped/no-text: #{total_skipped}, Errors: #{total_errors}#{DRY_RUN ? " (DRY RUN)" : ""}"
|