2 mesi fa · 1be61a9694
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -16,10 +16,14 @@ This is a scraping pipeline that collects Tasmanian planning development applica
 
				 | `lib/enrich.rb` | `enrich_after_upsert!` — geocoding + property lookup after each DB write |
			
 
				 | `lib/util.rb` | `parse_aus_date`, council-name/table-name mappings |
			
 
				 | `lib/scraper_helpers.rb` | Shared helpers: `abs_url`, `text_or`, `upsert_and_enrich!` |
			
 
				-| `run_all.sh` | Discovers `scrapers/*.rb`, filters by `ONLY`/`SKIP`, runs each with `TABLE_NAME` set |
			
 
				+| `lib/migrate.rb` | Sequential schema migration runner — add new migrations at end of `MIGRATIONS` array |
			
 
				+| `lib/llm.php` | LLM inference helper for PHP — calls llama-swap via its OpenAI-compatible API first, falling back to the native Ollama API |
			
 
				+| `run_all.sh` | Discovers `scrapers/*.rb`, filters by `ONLY`/`SKIP`, runs each with `TABLE_NAME` set; prints summary table; emails on error |
			
 
				 | `entrypoint.sh` | Docker entry; waits for DB then runs `run_all.sh` (looping if `SCRAPE_EVERY_MINUTES` is set) |
			
 
				 | `scrapers/*.rb` | One scraper per council — parses HTML, upserts rows, calls `enrich_after_upsert!` |
			
 
				 | `web/index.php` | Search portal — dynamic UNION across all `da_*` tables |
			
 
				+| `tools/send_summary_email.rb` | Sends HTML error-summary email via SMTP (called by `run_all.sh` when any scraper ERRORs) |
			
 
				+| `tools/backfill_geocode.rb` | Batch geocode backfill for existing rows (supports `ONLY_TABLE`, `DRY_RUN`) |
			
 
				 
			
 
				 ---
			
 
				 
			
@@ -36,7 +40,7 @@ docker compose run --rm scraper /app/run_all.sh
 
				 TABLE_NAME=da_brighton DEBUG=1 ruby scrapers/brighton.rb
			
 
				 
			
 
				 # Run a subset
			
 
				-ONLY=meandervalley,kent docker compose run --rm scraper /app/run_all.sh
			
 
				+ONLY=meandervalley,westtamar docker compose run --rm scraper /app/run_all.sh
			
 
				 
			
 
				 # Geocode backfill (batch, all tables)
			
 
				 docker compose run --rm \
			
@@ -68,24 +72,41 @@ docker compose run --rm \
 
				 - Some councils (Kentish, Derwent Valley via direct site) use a Cloudflare JS challenge, which cannot be solved without a real browser. These exit cleanly with a warning. Where a PlanBuild equivalent exists (council code in `COUNCIL_MAP`), data is still collected via `planbuild.rb`.
			
 
				 - The warmup pattern (custom `CookieJar` + `http_get` with redirect handling) is self-contained in scrapers that need it and does **not** depend on `lib/http.rb`.
			
 
				 
			
 
				-### Write-once fields (in `DB.upsert`):
			
 
				+### PDF Downloads
			
 
				+
			
 
				+- Only happen when `DOWNLOAD_ATTACHMENTS=1` (set in `docker-compose.yml` or at runtime)
			
 
				+- Files land in `DOWNLOAD_DIR/<councilname>/<ref>/filename.pdf` inside the container
			
 
				+- The web container mounts the same folder at `/srv/files` and Apache serves it via `Alias /files /srv/files`
			
 
				+- **`local_document_url` must be stored as `/files/<councilname>/...`** — not `/downloads/...`. The Apache alias is `/files`, not `/downloads`.
			
 
				+- The web portal prefers `local_document_url` over `document_url` when rendering the document button
			
 
				+- For multi-document DAs (e.g. Launceston), all docs are stored as JSON in `documents_json` and rendered as a list of buttons in the portal
			
 
				+
			
 
				+### Write-once fields (in `DB.upsert`)
			
 
				+
			
 
				 - `date_received` — never overwritten once set
			
 
				 - `date_received_raw` — never overwritten once non-blank
			
 
				 - `document_url` / `local_document_url` — new value only replaces if existing is NULL
			
 
				 
			
 
				-### Table names:
			
 
				+### Table names
			
 
				+
			
 
				 - Always derived from the scraper filename: `scrapers/foo.rb` → `da_foo`
			
 
				 - `run_all.sh` sets `TABLE_NAME=da_<basename>` before invoking each scraper
			
 
				 - The `COUNCIL_MAP` in `lib/util.rb` maps internal council keys to table names (used by PlanBuild integration)
			
 
				 
			
 
				+### run_all.sh summary table
			
 
				+
			
 
				+- After all scrapers finish, prints a formatted table: Council | Saved | Warns | Status
			
 
				+- Status values: `ok`, `warn`, `blocked` (Cloudflare), `ERROR` (non-zero exit)
			
 
				+- Saved count: parsed from scraper stdout — looks for `"Saved N"` (case-insensitive) first, falls back to counting `"Upserted"` lines
			
 
				+- All scrapers should end with `puts "Done #{TABLE}. Saved #{n} item(s)."` for correct summary parsing
			
 
				+- If any scraper has ERROR status and `SMTP_HOST` is set, `tools/send_summary_email.rb` sends an HTML summary email
			
 
				+
			
 
				 ---
			
 
				 
			
 
				 ## Error Handling Conventions
			
 
				 
			
 
				-After a refactor, the project follows these rules:
			
 
				-
			
 
				 - **URI building** (`URI.join`, `URI.parse`) → `rescue URI::InvalidURIError`
			
 
				-- **DB operations** (prepare/execute) → `rescue Mysql2::Error => e; warn "[scraper] ..."`
			
 
				+- **DB operations** (prepare/execute) → `rescue Mysql2::Error => e; Log.warn ...`
			
 
				 - **Zlib decompression** → `rescue Zlib::Error`
			
 
				 - **Date parsing** (`Date.strptime`, `Date.parse`) → `rescue ArgumentError, Date::Error`
			
 
				 - **JSON parsing** → `rescue JSON::ParserError`
			
@@ -104,7 +125,10 @@ When a council changes its website markup, only that scraper needs updating. The
 
				 - `date_received` all nil — Date format changed; update the format string passed to `Util.parse_aus_date` or `Date.strptime`
			
 
				 
			
 
				 **Template choice:**
			
 
				-- Simple HTML list/table → copy `glamorgan.rb`
			
 
				+
			
 
				+- Simple HTML list/table with one entry per row → copy `glamorgan.rb`
			
 
				+- Single page, entries grouped under `<h2>` headings → copy `northernmidlands.rb`
			
 
				+- Single page, entries under `<h2>` with labeled `<strong>` fields + PDF in `<ul>` → copy `westtamar.rb`
			
 
				 - Link/PDF listing → copy `centralhighlands.rb`
			
 
				 - WAF-protected site needing homepage warmup → copy `kingisland.rb` (minimal) or `burnie.rb` (full-featured with PDF download)
			
 
				 - Multi-hop redirect to detail pages → copy `derwentvalley.rb`
			
@@ -120,12 +144,40 @@ The shared infrastructure (`Http`, `DB`, `enrich_after_upsert!`) handles everyth
 
				 - Schema changes go in `lib/migrate.rb` (new migration at end of `MIGRATIONS` array) or `lib/db.rb` (`ensure_table!`) for columns every new table gets
			
 
				 - The `geo_cache` table stores geocoding results keyed by SHA1 of the normalised query string — avoids redundant Google API calls
			
 
				 - The `UNIQUE KEY uniq_ref_addr (council_reference, address)` constraint drives the upsert behaviour
			
 
				+- Current migration versions: v1 (enrichment/geocode columns), v2 (geo_cache table), v3 (documents_json), v4 (Launceston detail columns), v5 (rewrite /downloads/ → /files/ in local_document_url)
			
 
				+
			
 
				+### Schema — notable columns added beyond base
			
 
				+
			
 
				+| Column | Type | Notes |
			
 
				+| --- | --- | --- |
			
 
				+| `documents_json` | MEDIUMTEXT | JSON array of `{name, url, local_url}` — used when a DA has multiple PDFs (e.g. Launceston) |
			
 
				+| `status` | VARCHAR(100) | Application status text (Launceston eProperty) |
			
 
				+| `assigned_officer` | VARCHAR(255) | Assigned planning officer (Launceston) |
			
 
				+| `group` | VARCHAR(100) | Application group (Launceston) — reserved SQL word, always quoted |
			
 
				+| `category` | VARCHAR(100) | Application category (Launceston) |
			
 
				+| `application_valid` | DATE | Date application deemed valid (Launceston) |
			
 
				+| `advertised_on` | DATE | Date first advertised (Launceston) |
			
 
				+| `property_legal_description` | TEXT | Certificate of Title / legal description (Launceston) |
			
 
				+
			
 
				+---
			
 
				 
			
 
				 ## Web Portal Notes
			
 
				 
			
 
				 - `web/index.php` dynamically discovers all `da_*` tables and builds a UNION query
			
 
				 - It handles missing columns gracefully (not all tables have every column)
			
 
				-- `web/backfill_pid_title.php` is a legacy admin tool — it should not be publicly accessible; consider moving it out of the web root or placing it behind authentication
			
 
				+- Document display: if `documents_json` is present → renders a button per document using the name from JSON; otherwise falls back to single "Open document" button using `local_document_url` → `document_url`
			
 
				+- `web/backfill_pid_title.php` is a legacy admin tool — it should not be publicly accessible
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Email Summary
			
 
				+
			
 
				+`tools/send_summary_email.rb` is called by `run_all.sh` when any scraper exits with ERROR status. It:
			
 
				+
			
 
				+- Reads SMTP config from env vars: `SMTP_HOST`, `SMTP_PORT`, `SMTP_USERNAME`, `SMTP_PASSWORD`, `SMTP_SMTPSecure` (`tls`/`ssl`), `SMTP_SENTFROM`, `SMTP_ADDADDRESS`
			
 
				+- Uses Ruby stdlib `net/smtp` — no gems required
			
 
				+- Sends multipart (plain + HTML) email with colour-coded summary table
			
 
				+- Silently skips if `SMTP_HOST` is not set
			
 
				 
			
 
				 ---
			
 
				 
			
@@ -134,7 +186,142 @@ The shared infrastructure (`Http`, `DB`, `enrich_after_upsert!`) handles everyth
 
				 - **`TABLE` constant conflicts**: Each scraper defines `TABLE = ENV.fetch("TABLE_NAME")` at the top level. If you `require` two scrapers in the same Ruby process you'll get a constant redefinition warning. Each scraper is designed to be run as a standalone script.
			
 
				 - **`COUNCIL_FILTER` / `COUNCIL_WHITELIST`**: The `docker-compose.yml` has a `COUNCIL_WHITELIST` env var that is passed to the scraper container but is not wired into `run_all.sh`. Use `ONLY` / `SKIP` in `run_all.sh` instead.
			
 
				 - **PlanBuild scrapers**: `planbuild.rb` handles councils on the state-run PlanBuild portal. It writes to per-council tables using `Util.ref_to_table`. These run alongside the council-specific scrapers.
			
 
				-- **PDF downloads**: Only happen when `DOWNLOAD_ATTACHMENTS=1`. Files land in `DOWNLOAD_DIR/<councilname>/`. The web portal serves them from `/downloads/` via an Apache alias.
			
 
				+- **PDF download path**: `local_document_url` must begin with `/files/` (not `/downloads/`). The Apache alias in `web/000-files.conf` is `Alias /files /srv/files`. Using `/downloads/` results in 404 in the web portal.
			
 
				+- **Binary PDF downloads**: Pass `headers: { "Accept" => "application/pdf,*/*", "Referer" => URL }` to `Http.get` when downloading PDFs from CDN subdomains — some CDNs reject requests without a valid referrer.
			
 
				 - **Non-ASCII in PDF URLs**: Some council sites embed Unicode characters (e.g. en-dash `–`) directly in PDF filenames. Always percent-encode hrefs before passing to `URI.join` — see `burnie.rb` `first_pdf_on_detail` for the pattern.
			
 
				 - **Redirect loops in `Net::HTTP.start` blocks**: `next` inside a `Net::HTTP.start` block exits the block, not the enclosing `while` loop. Use a `redirect_to` variable set inside the block and call `next` on the `while` loop after the block returns — see `burnie.rb` `http_get_with_cookies`.
			
 
				 - **Cloudflare JS challenge vs IP block**: A site behind a JS challenge (`"Just a moment"`) may load from a residential IP yet always be blocked from a datacenter/Docker IP. Detect it and exit cleanly. Sites confirmed blocked in Docker: `derwentvalley.tas.gov.au`, `latrobe.tas.gov.au`.
			
 
				+- **`group` column**: This is a reserved SQL word. In `DB.upsert` it is safe because all column names are backtick-quoted. In raw SQL always write `` `group` ``.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Next Phase — LLM-Based PDF Classification
			
 
				+
			
 
				+### Goal
			
 
				+
			
 
				+Extract structured information from downloaded DA PDFs using a local LLaMA model — primarily **application type** (Residential, Commercial, Industrial, Subdivision, etc.) but potentially other fields not reliably scraped from HTML (e.g. lot size, number of dwellings, value of works).
			
 
				+
			
 
				+### LLM Infrastructure
			
 
				+
			
 
				+A local Ollama instance is running at `http://192.168.8.73:11434` (env var: `LLAMA_URL`).
			
 
				+
			
 
				+`lib/llm.php` (already in the repo) shows the integration pattern for PHP:
			
 
				+
			
 
				+- Primary backend: llama-swap via OpenAI-compatible `/v1/chat/completions`
			
 
				+- Fallback: Ollama `/api/generate`
			
 
				+- Config loaded from `config/ai.php` — `LLAMACPP_HOST`, `OLLAMA_HOST`, `LLAMACPP_MODEL`, `OLLAMA_MODEL`, etc.
			
 
				+
			
 
				+For the Ruby scraper pipeline the equivalent is a direct Ollama HTTP call (no gems needed — stdlib `net/http`):
			
 
				+
			
 
				+```ruby
			
 
				+# Minimal Ollama call — POST to /api/generate
			
 
				+require "net/http"
			
 
				+require "json"
			
 
				+
			
 
				+def llm_classify(text, model: "llama3.2")
			
 
				+  uri  = URI("#{ENV.fetch('LLAMA_URL', 'http://192.168.8.73:11434')}/api/generate")
			
 
				+  body = JSON.generate(model: model, prompt: text, stream: false)
			
 
				+  res  = Net::HTTP.post(uri, body, "Content-Type" => "application/json")
			
 
				+  JSON.parse(res.body)["response"].to_s.strip
			
 
				+rescue StandardError => e
			
 
				+  warn "[llm] #{e.class}: #{e.message}"
			
 
				+  nil
			
 
				+end
			
 
				+```
			
 
				+
			
 
				+### Proposed Pipeline
			
 
				+
			
 
				+```text
			
 
				+Downloaded PDF (local_document_url)
			
 
				+    │
			
 
				+    ▼
			
 
				+Extract text (pdftotext CLI or pdf-reader gem)
			
 
				+    │
			
 
				+    ▼
			
 
				+Prompt LLM → application_type string
			
 
				+    │
			
 
				+    ▼
			
 
				+DB.upsert / UPDATE da_* SET application_type = ?
			
 
				+```
			
 
				+
			
 
				+### Suggested Prompt
			
 
				+
			
 
				+```text
			
 
				+You are classifying a Tasmanian planning development application.
			
 
				+Read the following text and return ONLY the single most appropriate
			
 
				+application type from this list:
			
 
				+  Residential, Commercial, Industrial, Subdivision, Rural/Agriculture,
			
 
				+  Tourism/Visitor Accommodation, Outbuilding/Shed, Change of Use,
			
 
				+  Demolition, Signage, Other
			
 
				+
			
 
				+Text:
			
 
				+<first 1500 characters of PDF text>
			
 
				+
			
 
				+Reply with the type only. No explanation.
			
 
				+```
			
 
				+
			
 
				+### Schema Changes Needed
			
 
				+
			
 
				+```sql
			
 
				+-- Add to ensure_table! and as a new migration:
			
 
				+application_type VARCHAR(60) NULL   -- e.g. "Residential", "Subdivision"
			
 
				+application_type_raw TEXT NULL      -- full LLM response for debugging
			
 
				+application_type_at DATETIME NULL   -- when classification was last run
			
 
				+```
			
 
				+
			
 
				+### Implementation Options
			
 
				+
			
 
				+**Option A — Inline during scrape** (simplest):
			
 
				+
			
 
				+- Each scraper that downloads PDFs calls `llm_classify` immediately after download
			
 
				+- Adds latency to each scrape run (LLM inference per PDF)
			
 
				+- Suitable if the LLM is fast (< 5s per classification)
			
 
				+
			
 
				+**Option B — Backfill tool** (recommended):
			
 
				+
			
 
				+- New script `tools/classify_pdfs.rb` — iterates rows where `local_document_url IS NOT NULL AND application_type IS NULL`
			
 
				+- Run separately from `run_all.sh`, on demand or on a cron
			
 
				+- Supports `ONLY_TABLE` env var to process one council at a time
			
 
				+- Safer — scrape failures don't block classification; can re-run without re-scraping
			
 
				+
			
 
				+**Option C — PHP tool in web container**:
			
 
				+
			
 
				+- New `tools/classify_pdfs.php` using the existing `lib/llm.php`
			
 
				+- Reads PDFs from `/srv/files`, calls `llmGenerate`, updates DB
			
 
				+- Advantage: reuses the already-written PHP LLM helper
			
 
				+- Disadvantage: PDF text extraction harder in PHP (needs `pdftotext` shell call or a PHP PDF lib)
			
 
				+
			
 
				+### PDF Text Extraction
			
 
				+
			
 
				+`pdftotext` (part of `poppler-utils`) is the most reliable option:
			
 
				+
			
 
				+```ruby
			
 
				+def extract_pdf_text(local_path, max_chars: 2000)
			
 
				+  # local_path is relative like "/files/northernmidlands/PLN-26-0030/doc.pdf"
			
 
				+  # Map to filesystem path inside container
			
 
				+  fs_path = local_path.sub(%r{\A/files/}, "#{ENV.fetch('DOWNLOAD_DIR', '/app/downloads')}/")
			
 
				+  return nil unless File.exist?(fs_path)
			
 
				+
			
 
				+  text, = Open3.capture2("pdftotext", "-l", "3", fs_path, "-")
			
 
				+  text.to_s.gsub(/\s+/, " ").strip[0, max_chars]
			
 
				+rescue StandardError => e
			
 
				+  warn "[classify] pdftotext failed for #{fs_path}: #{e.message}"
			
 
				+  nil
			
 
				+end
			
 
				+```
			
 
				+
			
 
				+`pdftotext` may need to be installed in the scraper Dockerfile:
			
 
				+
			
 
				+```dockerfile
			
 
				+RUN apt-get update && apt-get install -y --no-install-recommends poppler-utils && rm -rf /var/lib/apt/lists/*
			
 
				+```
			
 
				+
			
 
				+### Key Decisions Before Implementation
			
 
				+
			
 
				+1. **Option A vs B vs C** — inline vs backfill tool vs PHP
			
 
				+2. **Which model** — any Ollama model on the local server (check with `curl http://192.168.8.73:11434/api/tags`)
			
 
				+3. **Prompt design** — zero-shot classification vs few-shot examples; JSON output vs plain text
			
 
				+4. **Confidence threshold** — store raw LLM response for auditing? Flag low-confidence results?
			
 
				+5. **Re-classification** — should existing `application_type` values be overwritten on re-run, or treated as write-once?
			
 
				+6. **Dockerfile change** — confirm `poppler-utils` can be added to the scraper image
			
 
				+
			
--- a/lib/llm.php
+++ b/lib/llm.php
@@ -0,0 +1,229 @@
 
				+<?php
			
 
				+/**
			
 
				+ * lib/llm.php
			
 
				+ *
			
 
				+ * Unified LLM inference helper.
			
 
				+ * Primary:  llama.cpp server (LLAMACPP_HOST) — /v1/chat/completions + /v1/embeddings
			
 
				+ * Fallback: Ollama            (OLLAMA_HOST)   — /api/generate + /api/embed
			
 
				+ *
			
 
				+ * Primary backend is llama-swap, which proxies multiple llama.cpp instances
			
 
				+ * and routes by model name via the OpenAI-compatible /v1 API.
			
 
				+ *
			
 
				+ * Public API:
			
 
				+ *   llmGenerate(string $prompt, array $options = []): string
			
 
				+ *   llmEmbed(string $text): ?array
			
 
				+ *
			
 
				+ * $options keys (all optional):
			
 
				+ *   temperature    float  default LLAMACPP_TEMPERATURE (0.3 on the Ollama fallback)
			
 
				+ *   num_predict    int    default 2048  (maps to max_tokens)
			
 
				+ *   num_ctx        int    default 6144  (Ollama only — ignored by llama-swap)
			
 
				+ *   repeat_penalty float  default 1.1  (sent as repeat_penalty to both backends)
			
 
				+ */
			
 
				+
			
 
				+require_once __DIR__ . '/../config/ai.php';
			
 
				+
			
 
				+// ── Public functions ──────────────────────────────────────────────────────────
			
 
				+
			
 
				+/**
			
 
				+ * Generate text from a prompt.
			
 
				+ * Tries llama.cpp first; falls back to Ollama on connection failure, non-200, or an empty completion.
			
 
				+ *
			
 
				+ * @throws RuntimeException when both backends fail
			
 
				+ */
			
 
				+function llmGenerate(string $prompt, array $options = []): string
			
 
				+{
			
 
				+    // Preferred backend: llama.cpp / llama-swap.
			
 
				+    $primary = _llamacppGenerate($prompt, $options);
			
 
				+    if ($primary === null) {
			
 
				+        error_log('[llm] llama.cpp unavailable — falling back to Ollama');
			
 
				+
			
 
				+        // Second and last attempt: Ollama. Nothing is left to try after it.
			
 
				+        $fallback = _ollamaGenerate($prompt, $options);
			
 
				+        if ($fallback === null) {
			
 
				+            throw new RuntimeException('All LLM backends unavailable');
			
 
				+        }
			
 
				+
			
 
				+        return $fallback;
			
 
				+    }
			
 
				+
			
 
				+    return $primary;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Embed text into a float vector.
			
 
				+ * Tries llama.cpp /v1/embeddings first; falls back to Ollama /api/embed.
			
 
				+ * Returns null only when both backends fail.
			
 
				+ */
			
 
				+function llmEmbed(string $text): ?array
			
 
				+{
			
 
				+    // Cap the input at 2000 bytes. A plain substr() can cut a multi-byte UTF-8
			
 
				+    // character in half, and json_encode() returns false for malformed UTF-8,
			
 
				+    // which would send the backends an empty request body. mb_strcut() keeps
			
 
				+    // the same byte budget but backs up to the previous character boundary.
			
 
				+    $text = function_exists('mb_strcut')
			
 
				+        ? mb_strcut($text, 0, 2000, 'UTF-8')
			
 
				+        : substr($text, 0, 2000);
			
 
				+
			
 
				+    // Only try llama-swap for embeddings if an embed model is configured
			
 
				+    if (LLAMACPP_EMBED_MODEL !== '') {
			
 
				+        $emb = _llamacppEmbed($text);
			
 
				+        if ($emb !== null) {
			
 
				+            return $emb;
			
 
				+        }
			
 
				+        error_log('[llm] llama-swap embed unavailable — falling back to Ollama');
			
 
				+    }
			
 
				+
			
 
				+    // Ollama is the last resort; its null result is passed straight through.
			
 
				+    return _ollamaEmbed($text);
			
 
				+}
			
 
				+
			
 
				+// ── llama.cpp backend ─────────────────────────────────────────────────────────
			
 
				+
			
 
				+/**
			
 
				+ * Ask llama-swap for a chat completion of $prompt.
			
 
				+ *
			
 
				+ * @return string|null trimmed completion text, or null on a cURL error,
			
 
				+ *                     a non-200 status, or an empty completion
			
 
				+ */
			
 
				+function _llamacppGenerate(string $prompt, array $options): ?string
			
 
				+{
			
 
				+    // Body for the OpenAI-style chat completions endpoint; llama-swap routes
			
 
				+    // the request by the 'model' field.
			
 
				+    $request = [
			
 
				+        'model'       => LLAMACPP_MODEL,
			
 
				+        'messages'    => [['role' => 'user', 'content' => $prompt]],
			
 
				+        'max_tokens'  => $options['num_predict']    ?? 2048,
			
 
				+        'temperature' => $options['temperature']    ?? LLAMACPP_TEMPERATURE,
			
 
				+        'top_p'       => $options['top_p']          ?? LLAMACPP_TOP_P,
			
 
				+        'top_k'       => $options['top_k']          ?? LLAMACPP_TOP_K,
			
 
				+        'repeat_penalty' => $options['repeat_penalty'] ?? 1.1,
			
 
				+        'stop'        => $options['stop']           ?? [],
			
 
				+        'stream'      => false,
			
 
				+    ];
			
 
				+
			
 
				+    $curl = curl_init(LLAMACPP_HOST . '/v1/chat/completions');
			
 
				+    curl_setopt_array($curl, [
			
 
				+        CURLOPT_POST           => true,
			
 
				+        CURLOPT_POSTFIELDS     => json_encode($request),
			
 
				+        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
			
 
				+        CURLOPT_RETURNTRANSFER => true,
			
 
				+        CURLOPT_TIMEOUT        => LLAMACPP_TIMEOUT,
			
 
				+        CURLOPT_CONNECTTIMEOUT => 3,
			
 
				+    ]);
			
 
				+
			
 
				+    // Collect everything needed from the handle before it is closed.
			
 
				+    $body   = curl_exec($curl);
			
 
				+    $status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
			
 
				+    $error  = curl_error($curl);
			
 
				+    curl_close($curl);
			
 
				+
			
 
				+    $failed = $error || $body === false || $status !== 200;
			
 
				+    if ($failed) {
			
 
				+        error_log('[llm] llama-swap generate: ' . ($error ?: "HTTP $status"));
			
 
				+        return null;
			
 
				+    }
			
 
				+
			
 
				+    $decoded = json_decode($body, true);
			
 
				+    $content = trim($decoded['choices'][0]['message']['content'] ?? '');
			
 
				+
			
 
				+    // An empty completion is reported as null so the caller can fall back.
			
 
				+    if ($content === '') {
			
 
				+        return null;
			
 
				+    }
			
 
				+
			
 
				+    return $content;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Request an embedding for $text from llama-swap's /v1/embeddings endpoint.
			
 
				+ *
			
 
				+ * @return array|null the first embedding vector, or null on a cURL error,
			
 
				+ *                    a non-200 status, or a missing/empty vector
			
 
				+ */
			
 
				+function _llamacppEmbed(string $text): ?array
			
 
				+{
			
 
				+    // llama-swap picks the embedding backend from the 'model' field
			
 
				+    $request = ['model' => LLAMACPP_EMBED_MODEL, 'input' => $text];
			
 
				+
			
 
				+    $curl = curl_init(LLAMACPP_HOST . '/v1/embeddings');
			
 
				+    curl_setopt_array($curl, [
			
 
				+        CURLOPT_POST           => true,
			
 
				+        CURLOPT_POSTFIELDS     => json_encode($request),
			
 
				+        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
			
 
				+        CURLOPT_RETURNTRANSFER => true,
			
 
				+        CURLOPT_TIMEOUT        => 15,
			
 
				+        CURLOPT_CONNECTTIMEOUT => 3,
			
 
				+    ]);
			
 
				+
			
 
				+    $body   = curl_exec($curl);
			
 
				+    $status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
			
 
				+    $error  = curl_error($curl);
			
 
				+    curl_close($curl);
			
 
				+
			
 
				+    $failed = $error || $body === false || $status !== 200;
			
 
				+    if ($failed) {
			
 
				+        error_log('[llm] llama.cpp embed: ' . ($error ?: "HTTP $status"));
			
 
				+        return null;
			
 
				+    }
			
 
				+
			
 
				+    $decoded = json_decode($body, true);
			
 
				+    $vector  = $decoded['data'][0]['embedding'] ?? null;
			
 
				+
			
 
				+    // Anything other than a non-empty array counts as "no embedding".
			
 
				+    if (!is_array($vector) || count($vector) === 0) {
			
 
				+        return null;
			
 
				+    }
			
 
				+
			
 
				+    return $vector;
			
 
				+}
			
 
				+
			
 
				+// ── Ollama backend ────────────────────────────────────────────────────────────
			
 
				+
			
 
				+/**
			
 
				+ * Generate text for $prompt through Ollama's /api/generate endpoint.
			
 
				+ *
			
 
				+ * @return string|null trimmed response text, or null on a cURL error,
			
 
				+ *                     a non-200 status, or an empty response
			
 
				+ */
			
 
				+function _ollamaGenerate(string $prompt, array $options): ?string
			
 
				+{
			
 
				+    $payload = json_encode([
			
 
				+        'model'  => OLLAMA_MODEL,
			
 
				+        'prompt' => $prompt,
			
 
				+        'stream' => false,
			
 
				+        // keep_alive is a top-level request field in the Ollama API, not a
			
 
				+        // model option; -1 asks Ollama to keep the model loaded indefinitely.
			
 
				+        'keep_alive' => -1,
			
 
				+        'options' => [
			
 
				+            'temperature'    => $options['temperature']    ?? 0.3,
			
 
				+            'num_predict'    => $options['num_predict']    ?? 2048,
			
 
				+            'num_ctx'        => $options['num_ctx']        ?? 6144,
			
 
				+            'repeat_penalty' => $options['repeat_penalty'] ?? 1.1,
			
 
				+        ],
			
 
				+    ]);
			
 
				+
			
 
				+    $ch = curl_init(OLLAMA_HOST . '/api/generate');
			
 
				+    curl_setopt_array($ch, [
			
 
				+        CURLOPT_POST           => true,
			
 
				+        CURLOPT_POSTFIELDS     => $payload,
			
 
				+        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
			
 
				+        CURLOPT_RETURNTRANSFER => true,
			
 
				+        CURLOPT_TIMEOUT        => OLLAMA_TIMEOUT,
			
 
				+        CURLOPT_CONNECTTIMEOUT => 5,
			
 
				+    ]);
			
 
				+
			
 
				+    // Read status and error from the handle before closing it.
			
 
				+    $resp = curl_exec($ch);
			
 
				+    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
			
 
				+    $err  = curl_error($ch);
			
 
				+    curl_close($ch);
			
 
				+
			
 
				+    if ($err || $resp === false || $code !== 200) {
			
 
				+        error_log('[llm] Ollama generate: ' . ($err ?: "HTTP $code"));
			
 
				+        return null;
			
 
				+    }
			
 
				+
			
 
				+    $data = json_decode($resp, true);
			
 
				+    $text = trim($data['response'] ?? '');
			
 
				+    // An empty response is reported as null, the same as a transport failure.
			
 
				+    return $text !== '' ? $text : null;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Request an embedding for $text from Ollama, trying the current endpoint
			
 
				+ * first and the legacy one second.
			
 
				+ *
			
 
				+ * @return array|null the embedding vector, or null when neither endpoint
			
 
				+ *                    returns a non-empty vector with HTTP 200
			
 
				+ */
			
 
				+function _ollamaEmbed(string $text): ?array
			
 
				+{
			
 
				+    // Endpoints in the order they are tried: [path, request field for the text].
			
 
				+    //   /api/embed      (Ollama >= 0.1.26) — vector read from 'embeddings'[0]
			
 
				+    //   /api/embeddings (legacy)           — vector read from 'embedding'
			
 
				+    $endpoints = [
			
 
				+        ['/api/embed',      'input'],
			
 
				+        ['/api/embeddings', 'prompt'],
			
 
				+    ];
			
 
				+
			
 
				+    foreach ($endpoints as [$path, $field]) {
			
 
				+        $curl = curl_init(OLLAMA_HOST . $path);
			
 
				+        curl_setopt_array($curl, [
			
 
				+            CURLOPT_POST           => true,
			
 
				+            CURLOPT_POSTFIELDS     => json_encode(['model' => EMBED_MODEL, $field => $text]),
			
 
				+            CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
			
 
				+            CURLOPT_RETURNTRANSFER => true,
			
 
				+            CURLOPT_TIMEOUT        => 15,
			
 
				+            CURLOPT_CONNECTTIMEOUT => 5,
			
 
				+        ]);
			
 
				+        $body   = curl_exec($curl);
			
 
				+        $status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
			
 
				+        curl_close($curl);
			
 
				+
			
 
				+        // Failed or empty reply: move on to the next endpoint.
			
 
				+        if (!$body || $status !== 200) {
			
 
				+            continue;
			
 
				+        }
			
 
				+
			
 
				+        $decoded = json_decode($body, true);
			
 
				+        if ($path === '/api/embed') {
			
 
				+            $vector = $decoded['embeddings'][0] ?? null;
			
 
				+        } else {
			
 
				+            $vector = $decoded['embedding'] ?? null;
			
 
				+        }
			
 
				+
			
 
				+        if (is_array($vector) && count($vector) > 0) {
			
 
				+            return $vector;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    error_log('[llm] All embed backends failed');
			
 
				+    return null;
			
 
				+}