|
@@ -33,21 +33,22 @@ require ROOT . '/config/database.php';
|
|
|
use Smalot\PdfParser\Parser;
|
|
use Smalot\PdfParser\Parser;
|
|
|
|
|
|
|
|
// ── Config ───────────────────────────────────────────────────────────────────
|
|
// ── Config ───────────────────────────────────────────────────────────────────
|
|
|
-define('OLLAMA_EMBED_URL', 'http://192.168.8.73:11434/api/embeddings');
|
|
|
|
|
-define('EMBED_MODEL', 'nomic-embed-text');
|
|
|
|
|
-define('CHUNK_WORDS', 500); // target words per chunk
|
|
|
|
|
-define('OVERLAP_WORDS', 80); // overlap between consecutive chunks
|
|
|
|
|
|
|
+define('OLLAMA_HOST', 'http://192.168.8.73:11434');
|
|
|
|
|
+define('EMBED_MODEL', 'nomic-embed-text');
|
|
|
|
|
+define('CHUNK_WORDS', 500); // target words per chunk
|
|
|
|
|
+define('OVERLAP_WORDS', 80); // overlap between consecutive chunks
|
|
|
|
|
|
|
|
// ── Parse args ───────────────────────────────────────────────────────────────
|
|
// ── Parse args ───────────────────────────────────────────────────────────────
|
|
|
-$opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'help']);
|
|
|
|
|
|
|
+$opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help']);
|
|
|
|
|
|
|
|
-if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']))) {
|
|
|
|
|
|
|
+if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']) && !isset($opts['test']))) {
|
|
|
echo <<<HELP
|
|
echo <<<HELP
|
|
|
Usage:
|
|
Usage:
|
|
|
php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
|
|
php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
|
|
|
php tools/ingest_knowledge.php --dir="books/" --author="Various"
|
|
php tools/ingest_knowledge.php --dir="books/" --author="Various"
|
|
|
php tools/ingest_knowledge.php --list
|
|
php tools/ingest_knowledge.php --list
|
|
|
php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
|
|
php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
|
|
|
|
|
+ php tools/ingest_knowledge.php --test (verify Ollama connection + embedding)
|
|
|
|
|
|
|
|
Options:
|
|
Options:
|
|
|
--file Path to a single PDF file
|
|
--file Path to a single PDF file
|
|
@@ -62,6 +63,56 @@ HELP;
|
|
|
|
|
|
|
|
$pdo = getDBConnection();
|
|
$pdo = getDBConnection();
|
|
|
|
|
|
|
|
|
|
+// ── Test mode ─────────────────────────────────────────────────────────────────
|
|
|
|
|
if (isset($opts['test'])) {
    // Diagnostic mode (--test): check that the Ollama server answers, print the
    // models it reports, then request a single embedding. The process exits 0
    // only when an embedding vector comes back; every failure path exits 1 so
    // shell scripts can rely on the status code.
    echo "Testing Ollama connection at " . OLLAMA_HOST . " ...\n\n";

    // 1. List available models
    $ch = curl_init(OLLAMA_HOST . '/api/tags');
    curl_setopt_array($ch, [CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 5]);
    $resp = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err = curl_error($ch);
    curl_close($ch);

    if ($err || $code !== 200) {
        echo "FAIL: Cannot reach Ollama — " . ($err ?: "HTTP $code") . "\n";
        exit(1);
    }

    // Tolerate an unexpected payload: anything that is not a list of model
    // rows with string names is treated as "no models" instead of raising a
    // TypeError inside array_column() / str_starts_with().
    $models = json_decode($resp, true);
    $modelRows = (is_array($models) && is_array($models['models'] ?? null)) ? $models['models'] : [];
    $names = array_filter(array_column($modelRows, 'name'), 'is_string');

    echo "OK: Ollama reachable. Models installed:\n";
    foreach ($names as $name) {
        echo " - $name\n";
    }

    // Prefix match rather than equality — presumably so that tagged names
    // (e.g. "<model>:latest") still count as installed; confirm against the
    // names the server actually reports.
    $embedFound = false;
    foreach ($names as $n) {
        if (str_starts_with($n, EMBED_MODEL)) {
            $embedFound = true;
            break;
        }
    }
    if (!$embedFound) {
        echo "\nWARNING: '" . EMBED_MODEL . "' not found in model list.\n";
        echo "Run on your Ollama server: ollama pull " . EMBED_MODEL . "\n\n";
    }

    // 2. Test embedding
    echo "\nTesting embedding endpoint ...\n";
    [$embedding, $apiUsed, $rawResp, $httpCode] = getEmbeddingDebug("soil calcium deficiency test sentence");

    echo "HTTP code: $httpCode\n";
    echo "API used: $apiUsed\n";

    if ($embedding === null) {
        echo "FAIL: No embedding returned.\n";
        echo "Raw response: $rawResp\n";
        echo "\nPossible fixes:\n";
        echo " 1. Run: ollama pull " . EMBED_MODEL . "\n";
        echo " 2. Check Ollama version: ollama --version (need >= 0.1.20)\n";
        // Build the hint from OLLAMA_HOST so it always names the host that was
        // actually tested.
        echo " 3. Verify host is reachable: curl " . OLLAMA_HOST . "/api/tags\n";
        // A failed self-test must not report success to the caller.
        exit(1);
    }

    echo "OK: Got " . count($embedding) . "-dimensional embedding vector.\n";
    echo "Sample: [" . implode(', ', array_map(fn($v) => round($v, 4), array_slice($embedding, 0, 5))) . " ...]\n";
    exit(0);
}
|
|
|
|
|
+
|
|
|
// ── List mode ────────────────────────────────────────────────────────────────
|
|
// ── List mode ────────────────────────────────────────────────────────────────
|
|
|
if (isset($opts['list'])) {
|
|
if (isset($opts['list'])) {
|
|
|
$stmt = $pdo->query(
|
|
$stmt = $pdo->query(
|
|
@@ -259,16 +310,27 @@ function cleanText(string $text): string
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
 * Embed a piece of text through Ollama.
 *
 * Thin wrapper around getEmbeddingDebug(): that helper first tries the newer
 * /api/embed endpoint (Ollama >= 0.1.26, "input" key, "embeddings" result)
 * and then the legacy /api/embeddings endpoint ("prompt" key, "embedding"
 * result). Only the first element of the tuple it returns — the embedding —
 * is handed back here; the diagnostic fields are discarded.
 *
 * @param string $text Text to embed.
 * @return array|null The embedding as float[], or null on failure.
 */
function getEmbedding(string $text): ?array
{
    return getEmbeddingDebug($text)[0];
}
|
|
|
|
|
+
|
|
|
|
|
+/**
|
|
|
|
|
+ * Same as getEmbedding() but returns [embedding|null, apiUsed, rawResponse, httpCode]
|
|
|
|
|
+ * for diagnostic output.
|
|
|
|
|
+ */
|
|
|
|
|
+function getEmbeddingDebug(string $text): array
|
|
|
|
|
+{
|
|
|
|
|
+ // ── Try new API: POST /api/embed {"model":..., "input":...} ────────────
|
|
|
|
|
+ $payload = json_encode(['model' => EMBED_MODEL, 'input' => $text]);
|
|
|
|
|
|
|
|
- $ch = curl_init(OLLAMA_EMBED_URL);
|
|
|
|
|
|
|
+ $ch = curl_init(OLLAMA_HOST . '/api/embed');
|
|
|
curl_setopt_array($ch, [
|
|
curl_setopt_array($ch, [
|
|
|
CURLOPT_POST => true,
|
|
CURLOPT_POST => true,
|
|
|
CURLOPT_POSTFIELDS => $payload,
|
|
CURLOPT_POSTFIELDS => $payload,
|
|
@@ -277,21 +339,46 @@ function getEmbedding(string $text): ?array
|
|
|
CURLOPT_TIMEOUT => 30,
|
|
CURLOPT_TIMEOUT => 30,
|
|
|
CURLOPT_CONNECTTIMEOUT => 5,
|
|
CURLOPT_CONNECTTIMEOUT => 5,
|
|
|
]);
|
|
]);
|
|
|
-
|
|
|
|
|
$response = curl_exec($ch);
|
|
$response = curl_exec($ch);
|
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
|
curl_close($ch);
|
|
curl_close($ch);
|
|
|
|
|
|
|
|
- if (!$response || $httpCode !== 200) {
|
|
|
|
|
- return null;
|
|
|
|
|
|
|
+ if ($response && $httpCode === 200) {
|
|
|
|
|
+ $data = json_decode($response, true);
|
|
|
|
|
+ // New API returns { "embeddings": [[...]] }
|
|
|
|
|
+ $emb = $data['embeddings'][0] ?? null;
|
|
|
|
|
+ if (is_array($emb) && count($emb) > 0) {
|
|
|
|
|
+ return [$emb, '/api/embed (new)', $response, $httpCode];
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- $data = json_decode($response, true);
|
|
|
|
|
- $embedding = $data['embedding'] ?? null;
|
|
|
|
|
|
|
+ // ── Fallback: legacy /api/embeddings {"model":..., "prompt":...} ───────
|
|
|
|
|
+ $payload = json_encode(['model' => EMBED_MODEL, 'prompt' => $text]);
|
|
|
|
|
|
|
|
- if (!is_array($embedding) || count($embedding) === 0) {
|
|
|
|
|
- return null;
|
|
|
|
|
|
|
+ $ch = curl_init(OLLAMA_HOST . '/api/embeddings');
|
|
|
|
|
+ curl_setopt_array($ch, [
|
|
|
|
|
+ CURLOPT_POST => true,
|
|
|
|
|
+ CURLOPT_POSTFIELDS => $payload,
|
|
|
|
|
+ CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
|
|
|
|
|
+ CURLOPT_RETURNTRANSFER => true,
|
|
|
|
|
+ CURLOPT_TIMEOUT => 30,
|
|
|
|
|
+ CURLOPT_CONNECTTIMEOUT => 5,
|
|
|
|
|
+ ]);
|
|
|
|
|
+ $response2 = curl_exec($ch);
|
|
|
|
|
+ $httpCode2 = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
|
|
|
|
+ curl_close($ch);
|
|
|
|
|
+
|
|
|
|
|
+ if ($response2 && $httpCode2 === 200) {
|
|
|
|
|
+ $data2 = json_decode($response2, true);
|
|
|
|
|
+ // Legacy API returns { "embedding": [...] }
|
|
|
|
|
+ $emb2 = $data2['embedding'] ?? null;
|
|
|
|
|
+ if (is_array($emb2) && count($emb2) > 0) {
|
|
|
|
|
+ return [$emb2, '/api/embeddings (legacy)', $response2, $httpCode2];
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- return $embedding;
|
|
|
|
|
|
|
+ // Return last response for diagnostics
|
|
|
|
|
+ $lastResp = $response2 ?: $response ?: '';
|
|
|
|
|
+ $lastCode = $httpCode2 ?: $httpCode;
|
|
|
|
|
+ return [null, 'both failed', $lastResp, $lastCode];
|
|
|
}
|
|
}
|