true, CURLOPT_TIMEOUT => 5]);
    $resp = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err  = curl_error($ch);
    curl_close($ch);

    if ($err || $code !== 200) {
        echo "FAIL: Cannot reach Ollama — " . ($err ?: "HTTP $code") . "\n";
        exit(1);
    }

    // A body that is not valid JSON decodes to null; `??` then yields an empty list.
    $models = json_decode($resp, true);
    $names  = array_column($models['models'] ?? [], 'name');

    echo "OK: Ollama reachable. Models installed:\n";
    foreach ($names as $name) {
        echo " - $name\n";
    }

    // Prefix match rather than equality — presumably because listed names carry a
    // ":tag" suffix (TODO confirm against the /api/tags response format).
    $embedFound = false;
    foreach ($names as $n) {
        if (str_starts_with($n, EMBED_MODEL)) {
            $embedFound = true;
            break;
        }
    }
    if (!$embedFound) {
        echo "\nWARNING: '" . EMBED_MODEL . "' not found in model list.\n";
        echo "Run on your Ollama server: ollama pull " . EMBED_MODEL . "\n\n";
    }

    // 2. Test embedding
    echo "\nTesting embedding endpoint ...\n";
    [$embedding, $apiUsed, $rawResp, $httpCode] = getEmbeddingDebug("soil calcium deficiency test sentence");
    echo "HTTP code: $httpCode\n";
    echo "API used: $apiUsed\n";

    if ($embedding !== null) {
        echo "OK: Got " . count($embedding) . "-dimensional embedding vector.\n";
        echo "Sample: [" . implode(', ', array_map(fn($v) => round($v, 4), array_slice($embedding, 0, 5))) . " ...]\n";
    } else {
        echo "FAIL: No embedding returned.\n";
        echo "Raw response: $rawResp\n";
        echo "\nPossible fixes:\n";
        echo " 1. Run: ollama pull " . EMBED_MODEL . "\n";
        echo " 2. Check Ollama version: ollama --version (need >= 0.1.20)\n";
        // Show the host this script is actually configured for, not a fixed address.
        echo " 3. Verify host is reachable: curl " . OLLAMA_HOST . "/api/tags\n";
    }

    // Non-zero status when the embedding check failed, matching the
    // connectivity failure above, so wrappers can detect it.
    exit($embedding !== null ? 0 : 1);
}

// ── List mode ────────────────────────────────────────────────────────────────
// Prints one row per (source, author) pair with its chunk count and the most
// recent created_at value, then exits.
if (isset($opts['list'])) {
    $stmt = $pdo->query(
        "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at FROM knowledge_chunks GROUP BY source, author ORDER BY source"
    );
    $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

    if (!$rows) {
        echo "No sources indexed yet.\n";
    } else {
        printf("%-55s %-25s %6s %s\n", 'Source', 'Author', 'Chunks', 'Indexed');
        echo str_repeat('-', 100) . "\n";
        foreach ($rows as $r) {
            // Coalesce to string first: substr() on a NULL column value is deprecated.
            printf(
                "%-55s %-25s %6d %s\n",
                substr((string) ($r['source'] ?? ''), 0, 54),
                substr((string) ($r['author'] ?? ''), 0, 24),
                $r['chunks'],
                $r['indexed_at']
            );
        }
    }
    exit(0);
}

// ── Clear mode ───────────────────────────────────────────────────────────────
// Deletes every chunk whose `source` equals the value given to --clear.
if (!empty($opts['clear'])) {
    $title = $opts['clear'];

    $stmt = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
    $stmt->execute([$title]);
    $count = (int) $stmt->fetchColumn();

    if ($count === 0) {
        echo "No chunks found for source: $title\n";
        exit(0);
    }

    $del = $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?');
    $del->execute([$title]);
    echo "Deleted $count chunks for: $title\n";
    exit(0);
}

// ── Collect PDF files ────────────────────────────────────────────────────────
// --file adds a single path; --dir adds every *.pdf found recursively.
// Both options may be combined. Invalid paths abort with exit status 1
// (die() with a string message would have exited with status 0).
$files  = [];
$author = trim($opts['author'] ?? 'Unknown');

if (!empty($opts['file'])) {
    $path = $opts['file'];
    if (!is_file($path)) {
        echo "File not found: $path\n";
        exit(1);
    }
    $files[] = $path;
}

if (!empty($opts['dir'])) {
    $dir = rtrim($opts['dir'], '/\\');
    if (!is_dir($dir)) {
        echo "Directory not found: $dir\n";
        exit(1);
    }

    $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));
    foreach ($it as $f) {
        if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') {
            $files[] = $f->getPathname();
        }
    }

    if (!$files) {
        echo "No PDF files found in: $dir\n";
        exit(1);
    }
}

echo "Found " . count($files) . " PDF file(s) to ingest.\n\n";

// ── Process each file ────────────────────────────────────────────────────────
$parser = new Parser();

// Loop-invariant: the INSERT is prepared once and re-executed for every chunk.
$insertStmt = $pdo->prepare(
    'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding) VALUES (?, ?, ?, ?, ?, ?)'
);

// Words the window advances after each chunk. Clamped to >= 1 so that a
// configuration with OVERLAP_WORDS >= CHUNK_WORDS cannot loop forever.
$step = max(1, CHUNK_WORDS - OVERLAP_WORDS);

foreach ($files as $filePath) {
    // The file name without extension is the key used for dedup and --clear.
    $source = pathinfo($filePath, PATHINFO_FILENAME);
    echo "Processing: $source\n";

    // Check if already indexed.
    // NOTE(review): rows are inserted chunk by chunk with no visible transaction,
    // so a run interrupted mid-file leaves partial rows that make this check skip
    // the file next time; --clear is the recovery path.
    $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
    $chk->execute([$source]);
    if ((int) $chk->fetchColumn() > 0) {
        echo " Already indexed — skipping. Use --clear=\"$source\" to re-index.\n\n";
        continue;
    }

    try {
        $pdf   = $parser->parseFile($filePath);
        $pages = $pdf->getPages();
    } catch (Exception $e) {
        echo " ERROR parsing PDF: " . $e->getMessage() . "\n\n";
        continue;
    }

    echo " Pages: " . count($pages) . "\n";

    $chunkIndex  = 0;   // next chunk_index to store; also the count stored so far
    $wordBuffer  = [];  // rolling window of words carried across page boundaries
    $bufferPages = [];  // page number of each word in $wordBuffer (parallel array)

    foreach ($pages as $pageNum => $page) {
        $pageText = cleanText($page->getText());
        if (strlen($pageText) < 50) {
            continue; // skip blank/image-only pages
        }

        foreach (explode(' ', $pageText) as $word) {
            $wordBuffer[]  = $word;
            $bufferPages[] = $pageNum + 1; // stored page numbers are 1-based
        }

        // Flush when buffer reaches chunk size
        while (count($wordBuffer) >= CHUNK_WORDS) {
            $chunkText = implode(' ', array_slice($wordBuffer, 0, CHUNK_WORDS));
            $chunkPage = $bufferPages[0]; // a chunk is attributed to the page of its first word

            if (strlen(trim($chunkText)) > 50) {
                $embedding = getEmbedding($chunkText);
                if ($embedding === null) {
                    echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
                } else {
                    $insertStmt->execute([
                        $source,
                        $author,
                        $chunkPage,
                        $chunkIndex,
                        $chunkText,
                        json_encode($embedding),
                    ]);
                    $chunkIndex++;

                    // Report progress only when the counter actually advanced, so a
                    // skipped chunk does not repeat the previous progress line.
                    if ($chunkIndex % 20 === 0) {
                        echo " ...{$chunkIndex} chunks embedded\n";
                    }
                }
            }

            // Slide window with overlap
            $wordBuffer  = array_slice($wordBuffer, $step);
            $bufferPages = array_slice($bufferPages, $step);
        }
    }

    // Flush remaining words as final chunk (tails of 30 words or fewer are dropped).
    if (count($wordBuffer) > 30) {
        $chunkText = implode(' ', $wordBuffer);
        $embedding = getEmbedding($chunkText);
        if ($embedding !== null) {
            $insertStmt->execute([
                $source,
                $author,
                $bufferPages[0] ?? 0,
                $chunkIndex,
                $chunkText,
                json_encode($embedding),
            ]);
            $chunkIndex++;
        }
    }

    echo " Done: $chunkIndex chunks stored.\n\n";
}

echo "Ingestion complete.\n";
exit(0);

// ── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Normalise extracted PDF text: expand ligatures, collapse whitespace and
 * drop characters outside the printable ASCII / Latin-1 range.
 *
 * Ligatures are written as explicit code-point escapes so the mapping does not
 * depend on how this source file is encoded or normalised.
 *
 * @param string $text Raw text of one PDF page.
 * @return string Cleaned text with single spaces between words and no
 *                leading or trailing whitespace.
 */
function cleanText(string $text): string
{
    // Common PDF ligature replacements
    $ligatures = [
        "\u{FB01}"  => 'fi',
        "\u{FB02}"  => 'fl',
        "\u{FB00}"  => 'ff',
        "\u{FB03}"  => 'ffi',
        "\u{FB04}"  => 'ffl',
        "\u{FB05}"  => 'st',
        "\u{FB06}"  => 'st',
        "\u{017F}t" => 'st', // long s + t, the decomposed form of U+FB05
    ];
    $text = strtr($text, $ligatures);

    // Collapse multiple spaces / newlines into single space. Done before the
    // filter below so separators such as form feeds become spaces, not nothing.
    $text = preg_replace('/\s+/', ' ', $text) ?? $text;

    // Keep tab/LF/CR, printable ASCII and U+00A0–U+00FF; drop everything else.
    $filtered = preg_replace('/[^\x09\x0A\x0D\x20-\x7E\xA0-\xFF]/u', '', $text);
    if ($filtered === null) {
        // The /u modifier rejects invalid UTF-8 and returns null. Fall back to a
        // byte-wise ASCII filter instead of discarding the whole page.
        $filtered = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]/', '', $text) ?? '';
    }

    // Deleting characters can leave adjacent spaces; collapse them so that
    // explode(' ') in the caller never yields empty "words".
    $filtered = preg_replace('/ {2,}/', ' ', $filtered) ?? $filtered;

    return trim($filtered);
}

/**
 * Call Ollama to embed text. Tries the newer /api/embed endpoint first
 * (Ollama >= 0.1.26, uses "input" key, returns "embeddings" array),
 * then falls back to the legacy /api/embeddings (uses "prompt" key,
 * returns "embedding" array).
 *
 * @param string $text Text to embed.
 * @return array|null The embedding vector, or null when both endpoints failed.
 */
function getEmbedding(string $text): ?array
{
    [$embedding] = getEmbeddingDebug($text);
    return $embedding;
}

/**
 * Same as getEmbedding() but also returns diagnostics.
 *
 * @param string $text Text to embed.
 * @return array [embedding|null, apiUsed, rawResponse, httpCode]. On failure
 *               apiUsed is 'both failed', rawResponse is the last non-empty
 *               body (or the cURL error text when no body was received) and
 *               httpCode is the last non-zero status.
 */
function getEmbeddingDebug(string $text): array
{
    // One POST helper shared by both endpoints. Returns [body|false, status, curlError].
    $post = static function (string $endpoint, array $body): array {
        // Substitute invalid UTF-8 instead of letting json_encode() return false,
        // which would send an empty request body.
        $payload = json_encode($body, JSON_INVALID_UTF8_SUBSTITUTE);

        $ch = curl_init(OLLAMA_HOST . $endpoint);
        curl_setopt_array($ch, [
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => $payload,
            CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => 30,
            CURLOPT_CONNECTTIMEOUT => 5,
        ]);
        $response = curl_exec($ch);
        $status   = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error    = curl_error($ch);
        curl_close($ch);

        return [$response, $status, $error];
    };

    // ── Try new API: POST /api/embed {"model":..., "input":...} ────────────
    [$response, $httpCode, $curlErr] = $post('/api/embed', ['model' => EMBED_MODEL, 'input' => $text]);
    if ($response && $httpCode === 200) {
        $data = json_decode($response, true);
        // New API returns { "embeddings": [[...]] }
        $emb = $data['embeddings'][0] ?? null;
        if (is_array($emb) && count($emb) > 0) {
            return [$emb, '/api/embed (new)', $response, $httpCode];
        }
    }

    // ── Fallback: legacy /api/embeddings {"model":..., "prompt":...} ───────
    [$response2, $httpCode2, $curlErr2] = $post('/api/embeddings', ['model' => EMBED_MODEL, 'prompt' => $text]);
    if ($response2 && $httpCode2 === 200) {
        $data2 = json_decode($response2, true);
        // Legacy API returns { "embedding": [...] }
        $emb2 = $data2['embedding'] ?? null;
        if (is_array($emb2) && count($emb2) > 0) {
            return [$emb2, '/api/embeddings (legacy)', $response2, $httpCode2];
        }
    }

    // Return last response for diagnostics
    $lastResp = $response2 ?: $response ?: '';
    if ($lastResp === '') {
        // No body at all means a transport failure; surface the cURL message.
        $lastErr = $curlErr2 ?: $curlErr;
        if ($lastErr !== '') {
            $lastResp = "cURL error: $lastErr";
        }
    }
    $lastCode = $httpCode2 ?: $httpCode;

    return [null, 'both failed', $lastResp, $lastCode];
}