Parcourir la source

Initial LLM Classifier

Benjamin Harris il y a 2 mois
Parent
commit
f979d88c48
6 fichiers modifiés avec 278 ajouts et 2 suppressions
  1. 1 1
      CLAUDE.md
  2. 2 1
      Dockerfile
  3. 3 0
      lib/db.rb
  4. 21 0
      lib/migrate.rb
  5. 219 0
      tools/classify_pdfs.rb
  6. 32 0
      web/index.php

+ 1 - 1
CLAUDE.md

@@ -144,7 +144,7 @@ The shared infrastructure (`Http`, `DB`, `enrich_after_upsert!`) handles everyth
 - Schema changes go in `lib/migrate.rb` (new migration at end of `MIGRATIONS` array) or `lib/db.rb` (`ensure_table!`) for columns every new table gets
 - The `geo_cache` table stores geocoding results keyed by SHA1 of the normalised query string — avoids redundant Google API calls
 - The `UNIQUE KEY uniq_ref_addr (council_reference, address)` constraint drives the upsert behaviour
-- Current migration versions: v1 (enrichment/geocode columns), v2 (geo_cache table), v3 (documents_json), v4 (Launceston detail columns), v5 (rewrite /downloads/ → /files/ in local_document_url)
+- Current migration versions: v1 (enrichment/geocode columns), v2 (geo_cache table), v3 (documents_json), v4 (Launceston detail columns), v5 (rewrite /downloads/ → /files/ in local_document_url), v6 (LLM classification columns)
 
 ### Schema — notable columns added beyond base
 

+ 2 - 1
Dockerfile

@@ -12,7 +12,8 @@ RUN apt-get update && \
       libmariadb-dev \
       mariadb-client \
       ca-certificates \
-      curl && \
+      curl \
+      poppler-utils && \
     rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app

+ 3 - 0
lib/db.rb

@@ -64,6 +64,9 @@ module DB
             advertised_on DATE NULL,
             advertised_on_raw VARCHAR(80) NULL,
             property_legal_description TEXT NULL,
+            application_type VARCHAR(60) NULL,
+            application_type_raw TEXT NULL,
+            application_type_at DATETIME NULL,
             lat DECIMAL(10,7) NULL,
             lng DECIMAL(10,7) NULL,
             PRIMARY KEY (id),

+ 21 - 0
lib/migrate.rb

@@ -140,6 +140,27 @@ module Migrate
           Log.warn "migrate", "skipped #{table}: #{e.message}"
         end
       }
+    },
+    {
+      version: 6,
+      description: "Add LLM classification columns (application_type, application_type_raw, application_type_at)",
+      up: -> {
+        cols = {
+          "application_type"     => "VARCHAR(60) NULL",
+          "application_type_raw" => "TEXT NULL",
+          "application_type_at"  => "DATETIME NULL"
+        }
+        Migrate.da_tables.each do |table|
+          esc = DB.client.escape(table)
+          cols.each do |col, defn|
+            DB.client.query(
+              "ALTER TABLE `#{esc}` ADD COLUMN IF NOT EXISTS `#{col}` #{defn}"
+            )
+          rescue Mysql2::Error => e
+            Log.warn "migrate", "skipped #{table}.#{col}: #{e.message}"
+          end
+        end
+      }
     }
   ].freeze
 

+ 219 - 0
tools/classify_pdfs.rb

@@ -0,0 +1,219 @@
+# tools/classify_pdfs.rb
+#
+# LLM-based classification of downloaded DA PDFs.
+#
+# Iterates all da_* tables (or a single table via ONLY_TABLE) looking for rows
+# where local_document_url IS NOT NULL and application_type IS NULL, extracts
+# text from the PDF with pdftotext, and asks a local Ollama model to classify
+# the application type.
+#
+# Usage:
+#   ruby /app/tools/classify_pdfs.rb
+#
+# Environment variables:
+#   ONLY_TABLE      — process a single table, e.g. da_northernmidlands
+#   RECLASSIFY      — set to "1" to overwrite existing application_type values
+#   LLM_MODEL       — Ollama model name (default: llama3.2)
+#   LLAMA_URL       — Ollama base URL (default: http://192.168.8.73:11434)
+#   DOWNLOAD_DIR    — where PDFs are stored (default: /app/downloads)
+#   DRY_RUN         — set to "1" to print classifications without writing to DB
+#   LOG_LEVEL       — debug | info (default) | warn | error
+
+require "json"
+require "net/http"
+require "open3"
+require "uri"
+
+require_relative "../lib/db"
+require_relative "../lib/log"
+
+# Runtime configuration, read once from the environment at load time.
+# LLAMA_URL is the base URL that llm_classify appends "/api/generate" to.
+LLAMA_URL    = ENV.fetch("LLAMA_URL",    "http://192.168.8.73:11434")
+LLM_MODEL    = ENV.fetch("LLM_MODEL",    "llama3.2")
+# extract_pdf_text maps a leading "/files/" in local_document_url onto this directory.
+DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")
+# Boolean flags: only the exact string "1" enables them.
+DRY_RUN      = ENV["DRY_RUN"]    == "1"
+RECLASSIFY   = ENV["RECLASSIFY"] == "1"
+# nil when unset, in which case target_tables discovers every da_* table.
+ONLY_TABLE   = ENV["ONLY_TABLE"]
+
+# Labels that normalise_type may return ("\ " keeps a space inside a %w word).
+# The same list is spelled out in PROMPT_TEMPLATE below and in $knownAppTypes
+# in web/index.php; the three copies have to be kept in sync by hand.
+APPLICATION_TYPES = %w[
+  Residential
+  Commercial
+  Industrial
+  Subdivision
+  Rural/Agriculture
+  Tourism/Visitor\ Accommodation
+  Outbuilding/Shed
+  Change\ of\ Use
+  Demolition
+  Signage
+  Other
+].freeze
+
+# Prompt sent to the model; llm_classify substitutes the extracted PDF text
+# for the single %s placeholder.
+PROMPT_TEMPLATE = <<~PROMPT
+  You are classifying a Tasmanian planning development application.
+  Read the following text and return ONLY the single most appropriate
+  application type from this list:
+    Residential, Commercial, Industrial, Subdivision, Rural/Agriculture,
+    Tourism/Visitor Accommodation, Outbuilding/Shed, Change of Use,
+    Demolition, Signage, Other
+
+  Text:
+  %s
+
+  Reply with the type only. No explanation.
+PROMPT
+
+# ---------------------------------------------------------------------------
+# PDF text extraction
+# ---------------------------------------------------------------------------
+# Extract whitespace-normalised text from the first pages of a local PDF.
+#
+# local_url  - value of local_document_url, e.g.
+#              "/files/northernmidlands/PLN-26-0030/doc.pdf". A leading
+#              "/files/" is mapped onto DOWNLOAD_DIR.
+# max_chars: - maximum number of characters returned (default 2000).
+#
+# Returns the extracted text (possibly empty), or nil when the file is
+# missing, pdftotext exits non-zero, or any StandardError is raised.
+def extract_pdf_text(local_url, max_chars: 2000)
+  fs_path = local_url.to_s.sub(%r{\A/files/}, "#{DOWNLOAD_DIR}/")
+  # File.file? rather than File.exist? so a directory is rejected here
+  # instead of being handed to pdftotext.
+  unless File.file?(fs_path)
+    Log.warn "classify", "PDF not found: #{fs_path}"
+    return nil
+  end
+
+  # -l 3: only first 3 pages (sufficient for cover/description page)
+  text, status = Open3.capture2("pdftotext", "-l", "3", fs_path, "-")
+  unless status.success?
+    Log.warn "classify", "pdftotext failed (exit #{status.exitstatus}) for #{fs_path}"
+    return nil
+  end
+
+  # The captured string is tagged with the process's default external
+  # encoding and may contain stray bytes. Tag it as UTF-8 (pdftotext's default
+  # output encoding) and scrub invalid sequences, otherwise the regex below
+  # raises ArgumentError and the whole document is lost.
+  utf8 = text.to_s.dup.force_encoding(Encoding::UTF_8).scrub(" ")
+  utf8.gsub(/\s+/, " ").strip[0, max_chars]
+rescue StandardError => e
+  Log.warn "classify", "extract_pdf_text error for #{fs_path}: #{e.class} #{e.message}"
+  nil
+end
+
+# ---------------------------------------------------------------------------
+# LLM call
+# ---------------------------------------------------------------------------
+# Ask the Ollama model at LLAMA_URL to classify the given document text.
+#
+# text - extracted PDF text, substituted into PROMPT_TEMPLATE.
+#
+# Returns the model's reply with surrounding whitespace removed, or nil when
+# the HTTP call fails, the status is not 2xx, the reply is blank, or any
+# StandardError is raised.
+def llm_classify(text)
+  prompt = format(PROMPT_TEMPLATE, text)
+  # chomp so a base URL configured with a trailing slash does not yield "//api/generate"
+  uri    = URI("#{LLAMA_URL.chomp("/")}/api/generate")
+  body   = JSON.generate(model: LLM_MODEL, prompt: prompt, stream: false)
+
+  # use_ssl must be set explicitly; without it an https LLAMA_URL is spoken
+  # to in plain HTTP and the request fails.
+  res = Net::HTTP.start(
+    uri.host, uri.port,
+    use_ssl: uri.scheme == "https", open_timeout: 10, read_timeout: 120
+  ) do |http|
+    http.post(uri.request_uri, body, "Content-Type" => "application/json")
+  end
+
+  unless res.is_a?(Net::HTTPSuccess)
+    Log.warn "classify", "Ollama returned #{res.code}: #{res.body.to_s[0, 200]}"
+    return nil
+  end
+
+  answer = JSON.parse(res.body)["response"].to_s.strip
+  # A blank reply carries no classification; report it as a failure (nil)
+  # rather than as an empty string.
+  if answer.empty?
+    Log.warn "classify", "Ollama returned an empty response"
+    return nil
+  end
+
+  answer
+rescue StandardError => e
+  Log.warn "classify", "LLM error: #{e.class} #{e.message}"
+  nil
+end
+
+# ---------------------------------------------------------------------------
+# Normalise raw LLM response to one of the known types
+# ---------------------------------------------------------------------------
+def normalise_type(raw)
+  return nil if raw.nil? || raw.strip.empty?
+
+  # Strip any <think>...</think> tags (Qwen3 thinking mode artefact)
+  cleaned = raw.gsub(/<think>.*?<\/think>/m, "").strip
+
+  # Strip leading/trailing punctuation and whitespace
+  candidate = cleaned.split(/[\n.!?]/).first.to_s.strip
+
+  # Case-insensitive match against known types
+  APPLICATION_TYPES.find { |t| t.casecmp?(candidate) } ||
+    APPLICATION_TYPES.find { |t| candidate.downcase.include?(t.downcase) } ||
+    "Other"
+end
+
+# ---------------------------------------------------------------------------
+# Tables to process
+# ---------------------------------------------------------------------------
+def target_tables
+  if ONLY_TABLE
+    DB.validate_table_name!(ONLY_TABLE)
+    [ONLY_TABLE]
+  else
+    rs = DB.client.query("SHOW TABLES LIKE 'da\\_%'")
+    rs.map { |r| r.values.first }
+  end
+end
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+# Entry point: for each target table, select the rows to classify, extract
+# text from each row's PDF, ask the LLM for a type, and (unless DRY_RUN) store
+# the result. Prints a one-line summary at the end.
+Log.info "classify", "Starting PDF classification (model: #{LLM_MODEL}, reclassify: #{RECLASSIFY}, dry_run: #{DRY_RUN})"
+
+total_classified = 0
+total_skipped    = 0
+total_errors     = 0
+
+target_tables.each do |table|
+  esc  = DB.client.escape(table)
+  stmt = nil
+
+  # Skip tables that do not have the classification columns yet (i.e. the
+  # migration adding them has not been applied).
+  begin
+    DB.client.query("SELECT application_type FROM `#{esc}` LIMIT 0")
+  rescue Mysql2::Error
+    Log.warn "classify", "#{table} missing application_type column — run migrate.rb first"
+    next
+  end
+
+  condition = RECLASSIFY ? "local_document_url IS NOT NULL" \
+                         : "local_document_url IS NOT NULL AND application_type IS NULL"
+
+  rows = DB.client.query(
+    "SELECT id, council_reference, address, local_document_url FROM `#{esc}` WHERE #{condition}",
+    symbolize_keys: true
+  ).to_a
+
+  next if rows.empty?
+
+  Log.info "classify", "#{table}: #{rows.length} row(s) to classify"
+
+  # Prepare the UPDATE once per table instead of once per row, and close it in
+  # the ensure below, so server-side statements do not pile up on long runs.
+  unless DRY_RUN
+    stmt = DB.client.prepare(<<~SQL)
+      UPDATE `#{esc}`
+         SET application_type     = ?,
+             application_type_raw = ?,
+             application_type_at  = NOW()
+       WHERE id = ?
+    SQL
+  end
+
+  rows.each do |row|
+    ref       = row[:council_reference]
+    local_url = row[:local_document_url]
+
+    text = extract_pdf_text(local_url)
+    if text.nil? || text.strip.empty?
+      Log.warn "classify", "#{table} #{ref}: no text extracted from #{local_url}"
+      # Counted as skipped so the "Skipped/no-text" figure in the summary is real.
+      total_skipped += 1
+      next
+    end
+
+    Log.info "classify", "#{table} #{ref}: extracted #{text.length} chars, calling LLM..."
+    raw_response = llm_classify(text)
+
+    if raw_response.nil?
+      Log.warn "classify", "#{table} #{ref}: LLM returned nil"
+      total_errors += 1
+      next
+    end
+
+    classified = normalise_type(raw_response)
+    if classified.nil?
+      # normalise_type returns nil for a blank reply; do not store or count it.
+      Log.warn "classify", "#{table} #{ref}: LLM returned a blank response"
+      total_errors += 1
+      next
+    end
+
+    Log.info "classify", "#{table} #{ref}: #{classified.inspect} (raw: #{raw_response.to_s[0, 80].inspect})"
+
+    # A TEXT column holds 65,535 BYTES, so truncate by bytes; scrub removes a
+    # multi-byte character that the cut may have split.
+    stmt&.execute(classified, raw_response.byteslice(0, 65_535).scrub(""), row[:id])
+
+    total_classified += 1
+  end
+rescue Mysql2::Error => e
+  Log.warn "classify", "DB error on #{table}: #{e.class} #{e.message}"
+  total_errors += 1
+ensure
+  stmt&.close
+end
+
+puts "Done. Classified: #{total_classified}, Skipped/no-text: #{total_skipped}, Errors: #{total_errors}#{DRY_RUN ? " (DRY RUN)" : ""}"

+ 32 - 0
web/index.php

@@ -19,9 +19,16 @@ $pdo = new PDO($dsn, $DB_USER, $DB_PASS, [
 // ---- Inputs ----
 $q             = trim((string)($_GET['q'] ?? ''));
 $councilKeySel = trim((string)($_GET['council_key'] ?? '')); // table name like da_meandervalley
+$appTypeSel    = trim((string)($_GET['app_type'] ?? ''));
 $includeClosed = isset($_GET['include_closed']);
 $sort          = (string)($_GET['sort'] ?? 'close');         // close|council|address|ref
 
+$knownAppTypes = [
+    'Residential', 'Commercial', 'Industrial', 'Subdivision',
+    'Rural/Agriculture', 'Tourism/Visitor Accommodation',
+    'Outbuilding/Shed', 'Change of Use', 'Demolition', 'Signage', 'Other',
+];
+
 // ---- Discover da_* tables ----
 $allTables = [];
 $st = $pdo->query("SHOW TABLES");
@@ -193,6 +200,7 @@ foreach ($tables as $t) {
     $cols[] = tableHasColumn($pdo, $t, 'document_url')        ? "COALESCE(document_url,'') AS document_url"     : "'' AS document_url";
     $cols[] = tableHasColumn($pdo, $t, 'local_document_url')  ? "COALESCE(local_document_url,'') AS local_document_url" : "'' AS local_document_url";
     $cols[] = tableHasColumn($pdo, $t, 'documents_json')      ? "documents_json"                                : "NULL AS documents_json";
+    $cols[] = tableHasColumn($pdo, $t, 'application_type')    ? "application_type"                              : "NULL AS application_type";
 
     validate_table_name($t);
     $selects[] = "SELECT ".implode(", ", $cols)." FROM `{$t}`";
@@ -212,6 +220,10 @@ if ($q !== '') {
     $like = "%{$q}%";
     array_push($params, $like, $like, $like, $like);
 }
+if ($appTypeSel !== '' && in_array($appTypeSel, $knownAppTypes, true)) {
+    $where[] = "x.application_type = ?";
+    $params[] = $appTypeSel;
+}
 if ($where) $sql .= " WHERE ".implode(" AND ", $where);
 
 // Sort
@@ -256,6 +268,7 @@ $rows = $st->fetchAll();
             .badge-today  { background:#ffc107; }
             .badge-closed  { background:#dc3545; }
             .badge-unknown  { background:#ced4da; }
+            .badge-apptype  { background:#0d6efd; }
             .muted { color:#6c757d; }
             .accordion-button .meta { margin-left:auto; display:flex; gap:.5rem; align-items:center; }
             .nowrap { white-space:nowrap; }
@@ -279,6 +292,14 @@ $rows = $st->fetchAll();
                         <?php endforeach; ?>
                     </select>
                 </div>
+                <div class="col-auto">
+                    <select name="app_type" class="form-select">
+                        <option value="">All types</option>
+                        <?php foreach ($knownAppTypes as $at): ?>
+                        <option value="<?= h($at) ?>" <?= $at === $appTypeSel ? 'selected' : '' ?>><?= h($at) ?></option>
+                        <?php endforeach; ?>
+                    </select>
+                </div>
                 <div class="col-auto form-check">
                     <input class="form-check-input" type="checkbox" name="include_closed" value="1" id="incClosed" <?= $includeClosed ? 'checked' : '' ?>>
                     <label class="form-check-label" for="incClosed">include closed</label>
@@ -324,6 +345,8 @@ $rows = $st->fetchAll();
                                $statusBadge = '<span class="badge badge-unknown">unknown</span>';
                            }
 
+                           $appType = trim((string)($r['application_type'] ?? ''));
+
                            // prefer local file, fallback to council URL
                            $docLocal = trim((string)($r['local_document_url'] ?? ''));
                            $docWeb   = trim((string)($r['document_url'] ?? ''));
@@ -360,6 +383,9 @@ $rows = $st->fetchAll();
                                 <div class="col"><span class="badge text-bg-light border nowrap">Close <?= $closeLabel ?></span></div>
                                 <?php endif; ?>
                                 <div class="col me-1"><?= $statusBadge ?></div>
+                                <?php if ($appType !== ''): ?>
+                                <div class="col"><span class="badge badge-apptype"><?= h($appType) ?></span></div>
+                                <?php endif; ?>
                             </div>
                         </button>
                     </h2>
@@ -410,6 +436,12 @@ $rows = $st->fetchAll();
                                     <label class="form-label">Days remaining</label>
                                     <input type="text" class="form-control form-control-sm" value="<?= is_null($days) ? '' : $days ?>" disabled readonly>
                                 </div>
+                                <?php if ($appType !== ''): ?>
+                                <div class="col-md-4">
+                                    <label class="form-label">Application type</label>
+                                    <input type="text" class="form-control form-control-sm" value="<?= h($appType) ?>" disabled readonly>
+                                </div>
+                                <?php endif; ?>
                                 <div class="col-12">
                                     <?php if (!empty($docList)): ?>
                                     <div class="d-flex flex-wrap gap-2">