Răsfoiți Sursa

ingest updates

Benjamin Harris 2 luni în urmă
părinte
comite
72aa22aa0d
2 fișiere modificate, cu 137 adăugiri și 33 de ștergeri
  1. 30 13
      controllers/ollamaGenerate.php
  2. 107 20
      tools/ingest_knowledge.php

+ 30 - 13
controllers/ollamaGenerate.php

@@ -415,37 +415,54 @@ function retrieveRelevantChunks(PDO $pdo, string $queryText, string $section, in
 }
 
 /**
  * Embed a query string via Ollama and return its vector.
  *
  * Tries the newer /api/embed endpoint first (request key "input", response
  * key "embeddings", first entry used), then falls back to the legacy
  * /api/embeddings endpoint (request key "prompt", response key "embedding").
  *
  * Only the first 2000 bytes of $text are sent. Because that cut is byte-wise
  * it can land in the middle of a multi-byte UTF-8 character; the payload is
  * therefore encoded with JSON_INVALID_UTF8_IGNORE so the dangling bytes are
  * dropped instead of making json_encode() fail and the request go out empty.
  *
  * @param string $text Query text to embed.
  * @return float[]|null Embedding vector, or null when both endpoints fail.
  */
 function getQueryEmbedding(string $text): ?array
 {
     $queryText = substr($text, 0, 2000);

     // Endpoints in order of preference. 'batched' marks the response shape:
     // true  -> { "embeddings": [[...]] }, false -> { "embedding": [...] }.
     $endpoints = [
         ['path' => '/api/embed',      'requestKey' => 'input',  'batched' => true],
         ['path' => '/api/embeddings', 'requestKey' => 'prompt', 'batched' => false],
     ];

     foreach ($endpoints as $endpoint) {
         $payload = json_encode(
             ['model' => EMBED_MODEL, $endpoint['requestKey'] => $queryText],
             JSON_INVALID_UTF8_IGNORE
         );
         if ($payload === false) {
             // Nothing sensible to send; try the next endpoint.
             continue;
         }

         $ch = curl_init(OLLAMA_HOST . $endpoint['path']);
         curl_setopt_array($ch, [
             CURLOPT_POST           => true,
             CURLOPT_POSTFIELDS     => $payload,
             CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
             CURLOPT_RETURNTRANSFER => true,
             CURLOPT_TIMEOUT        => 15,
             CURLOPT_CONNECTTIMEOUT => 3,
         ]);
         $resp = curl_exec($ch);
         $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
         curl_close($ch);

         if (!$resp || $code !== 200) {
             continue;
         }

         $data = json_decode($resp, true);
         $emb  = $endpoint['batched']
             ? ($data['embeddings'][0] ?? null)
             : ($data['embedding'] ?? null);

         if (is_array($emb) && count($emb) > 0) {
             return $emb;
         }
     }

     return null;
 }
 
 /**

+ 107 - 20
tools/ingest_knowledge.php

@@ -33,21 +33,22 @@ require ROOT . '/config/database.php';
 use Smalot\PdfParser\Parser;
 
 // ── Config ───────────────────────────────────────────────────────────────────
-define('OLLAMA_EMBED_URL', 'http://192.168.8.73:11434/api/embeddings');
-define('EMBED_MODEL',      'nomic-embed-text');
-define('CHUNK_WORDS',      500);   // target words per chunk
-define('OVERLAP_WORDS',    80);    // overlap between consecutive chunks
+define('OLLAMA_HOST',  'http://192.168.8.73:11434');
+define('EMBED_MODEL',  'nomic-embed-text');
+define('CHUNK_WORDS',  500);   // target words per chunk
+define('OVERLAP_WORDS', 80);   // overlap between consecutive chunks
 
 // ── Parse args ───────────────────────────────────────────────────────────────
-$opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'help']);
+$opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help']);
 
-if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']))) {
+if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']) && !isset($opts['test']))) {
     echo <<<HELP
 Usage:
   php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
   php tools/ingest_knowledge.php --dir="books/"    --author="Various"
   php tools/ingest_knowledge.php --list
   php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
+  php tools/ingest_knowledge.php --test            (verify Ollama connection + embedding)
 
 Options:
   --file    Path to a single PDF file
@@ -62,6 +63,56 @@ HELP;
 
 $pdo = getDBConnection();
 
+// ── Test mode ─────────────────────────────────────────────────────────────────
+if (isset($opts['test'])) {
+    echo "Testing Ollama connection at " . OLLAMA_HOST . " ...\n\n";
+
+    // 1. List available models
+    $ch = curl_init(OLLAMA_HOST . '/api/tags');
+    curl_setopt_array($ch, [CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 5]);
+    $resp = curl_exec($ch);
+    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+    $err  = curl_error($ch);
+    curl_close($ch);
+
+    if ($err || $code !== 200) {
+        echo "FAIL: Cannot reach Ollama — " . ($err ?: "HTTP $code") . "\n";
+        exit(1);
+    }
+    $models = json_decode($resp, true);
+    $names  = array_column($models['models'] ?? [], 'name');
+    echo "OK: Ollama reachable. Models installed:\n";
+    foreach ($names as $name) echo "  - $name\n";
+
+    $embedFound = false;
+    foreach ($names as $n) {
+        if (str_starts_with($n, EMBED_MODEL)) { $embedFound = true; break; }
+    }
+    if (!$embedFound) {
+        echo "\nWARNING: '" . EMBED_MODEL . "' not found in model list.\n";
+        echo "Run on your Ollama server:  ollama pull " . EMBED_MODEL . "\n\n";
+    }
+
+    // 2. Test embedding
+    echo "\nTesting embedding endpoint ...\n";
+    [$embedding, $apiUsed, $rawResp, $httpCode] = getEmbeddingDebug("soil calcium deficiency test sentence");
+
+    echo "HTTP code:   $httpCode\n";
+    echo "API used:    $apiUsed\n";
+    if ($embedding !== null) {
+        echo "OK: Got " . count($embedding) . "-dimensional embedding vector.\n";
+        echo "Sample: [" . implode(', ', array_map(fn($v) => round($v, 4), array_slice($embedding, 0, 5))) . " ...]\n";
+    } else {
+        echo "FAIL: No embedding returned.\n";
+        echo "Raw response: $rawResp\n";
+        echo "\nPossible fixes:\n";
+        echo "  1. Run: ollama pull " . EMBED_MODEL . "\n";
+        echo "  2. Check Ollama version: ollama --version (need >= 0.1.20)\n";
+        echo "  3. Verify host is reachable: curl http://192.168.8.73:11434/api/tags\n";
+    }
+    exit(0);
+}
+
 // ── List mode ────────────────────────────────────────────────────────────────
 if (isset($opts['list'])) {
     $stmt = $pdo->query(
@@ -259,16 +310,27 @@ function cleanText(string $text): string
 }
 
 /**
  * Produce an embedding vector for $text through Ollama.
  *
  * Thin wrapper around getEmbeddingDebug(): that helper tries the newer
  * /api/embed endpoint ("input" request key, "embeddings" response array)
  * and then the legacy /api/embeddings endpoint ("prompt" request key,
  * "embedding" response array). Only the first element of its result tuple,
  * the embedding itself, is kept here; the diagnostic fields are discarded.
  *
  * @param string $text Text to embed.
  * @return float[]|null Embedding vector, or null on failure.
  */
 function getEmbedding(string $text): ?array
 {
     return getEmbeddingDebug($text)[0];
 }
+
+/**
+ * Same as getEmbedding() but returns [embedding|null, apiUsed, rawResponse, httpCode]
+ * for diagnostic output.
+ */
+function getEmbeddingDebug(string $text): array
+{
+    // ── Try new API: POST /api/embed  {"model":..., "input":...} ────────────
+    $payload = json_encode(['model' => EMBED_MODEL, 'input' => $text]);
 
-    $ch = curl_init(OLLAMA_EMBED_URL);
+    $ch = curl_init(OLLAMA_HOST . '/api/embed');
     curl_setopt_array($ch, [
         CURLOPT_POST           => true,
         CURLOPT_POSTFIELDS     => $payload,
@@ -277,21 +339,46 @@ function getEmbedding(string $text): ?array
         CURLOPT_TIMEOUT        => 30,
         CURLOPT_CONNECTTIMEOUT => 5,
     ]);
-
     $response = curl_exec($ch);
     $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
     curl_close($ch);
 
-    if (!$response || $httpCode !== 200) {
-        return null;
+    if ($response && $httpCode === 200) {
+        $data = json_decode($response, true);
+        // New API returns { "embeddings": [[...]] }
+        $emb = $data['embeddings'][0] ?? null;
+        if (is_array($emb) && count($emb) > 0) {
+            return [$emb, '/api/embed (new)', $response, $httpCode];
+        }
     }
 
-    $data = json_decode($response, true);
-    $embedding = $data['embedding'] ?? null;
+    // ── Fallback: legacy /api/embeddings  {"model":..., "prompt":...} ───────
+    $payload = json_encode(['model' => EMBED_MODEL, 'prompt' => $text]);
 
-    if (!is_array($embedding) || count($embedding) === 0) {
-        return null;
+    $ch = curl_init(OLLAMA_HOST . '/api/embeddings');
+    curl_setopt_array($ch, [
+        CURLOPT_POST           => true,
+        CURLOPT_POSTFIELDS     => $payload,
+        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_TIMEOUT        => 30,
+        CURLOPT_CONNECTTIMEOUT => 5,
+    ]);
+    $response2 = curl_exec($ch);
+    $httpCode2 = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+    curl_close($ch);
+
+    if ($response2 && $httpCode2 === 200) {
+        $data2 = json_decode($response2, true);
+        // Legacy API returns { "embedding": [...] }
+        $emb2 = $data2['embedding'] ?? null;
+        if (is_array($emb2) && count($emb2) > 0) {
+            return [$emb2, '/api/embeddings (legacy)', $response2, $httpCode2];
+        }
     }
 
-    return $embedding;
+    // Return last response for diagnostics
+    $lastResp = $response2 ?: $response ?: '';
+    $lastCode = $httpCode2 ?: $httpCode;
+    return [null, 'both failed', $lastResp, $lastCode];
 }