true, CURLOPT_TIMEOUT => 5]); $resp = curl_exec($ch); $code = curl_getinfo($ch, CURLINFO_HTTP_CODE); $err = curl_error($ch); curl_close($ch); if ($err || $code !== 200) { echo "FAIL: Cannot reach Ollama — " . ($err ?: "HTTP $code") . "\n"; exit(1); } $models = json_decode($resp, true); $names = array_column($models['models'] ?? [], 'name'); echo "OK: Ollama reachable. Models installed:\n"; foreach ($names as $name) echo " - $name\n"; $embedFound = false; foreach ($names as $n) { if (str_starts_with($n, EMBED_MODEL)) { $embedFound = true; break; } } if (!$embedFound) { echo "\nWARNING: '" . EMBED_MODEL . "' not found in model list.\n"; echo "Run on your Ollama server: ollama pull " . EMBED_MODEL . "\n\n"; } // 2. Test embedding echo "\nTesting embedding endpoint ...\n"; [$embedding, $apiUsed, $rawResp, $httpCode] = getEmbeddingDebug("soil calcium deficiency test sentence"); echo "HTTP code: $httpCode\n"; echo "API used: $apiUsed\n"; if ($embedding !== null) { echo "OK: Got " . count($embedding) . "-dimensional embedding vector.\n"; echo "Sample: [" . implode(', ', array_map(fn($v) => round($v, 4), array_slice($embedding, 0, 5))) . " ...]\n"; } else { echo "FAIL: No embedding returned.\n"; echo "Raw response: $rawResp\n"; echo "\nPossible fixes:\n"; echo " 1. Run: ollama pull " . EMBED_MODEL . "\n"; echo " 2. Check Ollama version: ollama --version (need >= 0.1.20)\n"; echo " 3. Verify host is reachable: curl http://192.168.8.73:11434/api/tags\n"; } exit(0); } // ── List mode ──────────────────────────────────────────────────────────────── if (isset($opts['list'])) { $stmt = $pdo->query( "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at FROM knowledge_chunks GROUP BY source, author ORDER BY source" ); $rows = $stmt->fetchAll(PDO::FETCH_ASSOC); if (!$rows) { echo "No sources indexed yet.\n"; } else { printf("%-55s %-25s %6s %s\n", 'Source', 'Author', 'Chunks', 'Indexed'); echo str_repeat('-', 100) . 
"\n"; foreach ($rows as $r) { printf("%-55s %-25s %6d %s\n", substr($r['source'], 0, 54), substr($r['author'], 0, 24), $r['chunks'], $r['indexed_at'] ); } } exit(0); } // ── Clear mode ─────────────────────────────────────────────────────────────── if (!empty($opts['clear'])) { $title = $opts['clear']; $stmt = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?'); $stmt->execute([$title]); $count = (int)$stmt->fetchColumn(); if ($count === 0) { echo "No chunks found for source: $title\n"; exit(0); } $del = $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?'); $del->execute([$title]); echo "Deleted $count chunks for: $title\n"; exit(0); } // ── Collect PDF + EPUB files ────────────────────────────────────────────────── $files = []; $author = trim($opts['author'] ?? 'Unknown'); if (!empty($opts['file'])) { $path = $opts['file']; if (!is_file($path)) { die("File not found: $path\n"); } $files[] = $path; } if (!empty($opts['dir'])) { $dir = rtrim($opts['dir'], '/\\'); if (!is_dir($dir)) { die("Directory not found: $dir\n"); } $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir)); foreach ($it as $f) { if ($f->isFile() && in_array(strtolower($f->getExtension()), ['pdf', 'epub'], true)) { $files[] = $f->getPathname(); } } if (!$files) { die("No PDF or EPUB files found in: $dir\n"); } } echo "Found " . count($files) . " file(s) to ingest.\n\n"; // ── Process each file ──────────────────────────────────────────────────────── $pdfParser = new Parser(); foreach ($files as $filePath) { $source = pathinfo($filePath, PATHINFO_FILENAME); $ext = strtolower(pathinfo($filePath, PATHINFO_EXTENSION)); echo "Processing: $source ($ext)\n"; // Check if already indexed $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?'); $chk->execute([$source]); if ((int)$chk->fetchColumn() > 0) { echo " Already indexed — skipping. 
Use --clear=\"$source\" to re-index.\n\n"; continue; } // Extract pages as array of ['page' => int, 'text' => string] try { if ($ext === 'epub') { $pages = extractEpubPages($filePath); } else { $pages = extractPdfPages($pdfParser, $filePath); } } catch (Exception $e) { echo " ERROR extracting text: " . $e->getMessage() . "\n\n"; continue; } echo " Sections/pages: " . count($pages) . "\n"; $insertStmt = $pdo->prepare( 'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding) VALUES (?, ?, ?, ?, ?, ?)' ); $totalChunks = 0; $chunkIndex = 0; $wordBuffer = []; $bufferPages = []; foreach ($pages as ['page' => $pageNum, 'text' => $pageText]) { if (strlen($pageText) < 50) continue; $words = explode(' ', $pageText); foreach ($words as $word) { $wordBuffer[] = $word; $bufferPages[] = $pageNum; } // Flush when buffer reaches chunk size while (count($wordBuffer) >= CHUNK_WORDS) { $chunkWords = array_slice($wordBuffer, 0, CHUNK_WORDS); $chunkText = implode(' ', $chunkWords); $chunkPage = $bufferPages[0]; if (strlen(trim($chunkText)) > 50) { $embedding = getEmbedding($chunkText); if ($embedding === null) { echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n"; } else { $insertStmt->execute([ $source, $author, $chunkPage, $chunkIndex, $chunkText, json_encode($embedding), ]); $chunkIndex++; $totalChunks++; } } // Slide window with overlap $step = CHUNK_WORDS - OVERLAP_WORDS; $wordBuffer = array_slice($wordBuffer, $step); $bufferPages = array_slice($bufferPages, $step); if ($chunkIndex % 20 === 0 && $chunkIndex > 0) { echo " ...{$chunkIndex} chunks embedded\n"; } } } // Flush remaining words as final chunk if (count($wordBuffer) > 30) { $chunkText = implode(' ', $wordBuffer); $embedding = getEmbedding($chunkText); if ($embedding !== null) { $insertStmt->execute([ $source, $author, $bufferPages[0] ?? 
0, $chunkIndex, $chunkText, json_encode($embedding), ]); $chunkIndex++; $totalChunks++; } } echo " Done: $totalChunks chunks stored.\n\n"; } echo "Ingestion complete.\n"; exit(0);

// ── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Normalise extracted PDF/EPUB text.
 *
 * Steps, in order:
 *  1. Repair malformed UTF-8 so the Unicode-aware regexes below cannot fail.
 *  2. Expand typographic ligatures to their plain-letter spelling.
 *  3. Collapse every run of whitespace / Unicode separators to one space.
 *  4. Remove remaining control characters.
 *  5. Trim.
 *
 * Only control characters are removed; printable non-ASCII text (curly
 * quotes, dashes, non-Latin scripts) is kept.
 *
 * @param string $text Raw text as produced by the PDF parser or strip_tags().
 * @return string Cleaned single-line text (possibly empty).
 */
function cleanText(string $text): string
{
    // preg_* with the /u modifier returns null on malformed UTF-8, which would
    // discard the whole page. Replace bad byte sequences up front instead.
    if (preg_match('//u', $text) !== 1) {
        $text = function_exists('mb_scrub')
            ? mb_scrub($text, 'UTF-8')
            // Core-only fallback: ENT_SUBSTITUTE swaps invalid sequences for
            // U+FFFD, and decoding restores the escaped & < > " ' exactly.
            : htmlspecialchars_decode(
                htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'),
                ENT_QUOTES
            );
    }

    // Ligatures are written as \u{...} escapes so the keys cannot be silently
    // flattened to plain ASCII by an editor or Unicode normaliser (which would
    // turn the whole map into a no-op).
    $ligatures = [
        "\u{FB00}"  => 'ff',
        "\u{FB01}"  => 'fi',
        "\u{FB02}"  => 'fl',
        "\u{FB03}"  => 'ffi',
        "\u{FB04}"  => 'ffl',
        "\u{FB05}"  => 'st',  // long-s + t ligature
        "\u{FB06}"  => 'st',
        "\u{017F}t" => 'st',  // decomposed form: long s followed by t
    ];
    $text = strtr($text, $ligatures);

    // Collapse whitespace, including NBSP and other Unicode separators, so the
    // caller's explode(' ', ...) word splitting sees real word boundaries.
    $text = preg_replace('/[\s\p{Z}]+/u', ' ', $text) ?? $text;

    // Whitespace controls are already spaces at this point; drop the rest.
    $text = preg_replace('/\p{Cc}+/u', '', $text) ?? $text;

    return trim($text);
}

/**
 * Extract pages from a PDF.
 *
 * Each page's text is passed through cleanText(); pages whose cleaned text is
 * shorter than 50 bytes are skipped. The reported page number is the key
 * yielded by getPages() plus one (assumes the parser returns a zero-based
 * list — confirm against the PDF parser library in use).
 *
 * @param Parser $parser   PDF parser instance.
 * @param string $filePath Path of the PDF file to read.
 * @return array<int, array{page: int, text: string}>
 */
function extractPdfPages(Parser $parser, string $filePath): array
{
    $pages = [];

    foreach ($parser->parseFile($filePath)->getPages() as $index => $page) {
        $text = cleanText($page->getText());
        if (strlen($text) < 50) {
            continue;
        }
        $pages[] = ['page' => $index + 1, 'text' => $text];
    }

    return $pages;
}

/**
 * Parse an XML string from the EPUB container and return an XPath handle.
 *
 * @param string $xml XML source.
 * @return DOMXPath|null Null when the source is empty or not well-formed.
 */
function loadEpubXml(string $xml): ?DOMXPath
{
    // DOMDocument::loadXML('') throws ValueError on PHP 8, which callers that
    // catch Exception would not intercept — treat it as "unparseable" instead.
    if (trim($xml) === '') {
        return null;
    }

    // Collect libxml parse errors internally rather than emitting warnings.
    $previous = libxml_use_internal_errors(true);
    try {
        $dom = new DOMDocument();
        $loaded = $dom->loadXML($xml);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    return $loaded ? new DOMXPath($dom) : null;
}

/**
 * Convert one EPUB content document (XHTML) to cleaned plain text.
 *
 * @param string $html Raw XHTML source of a spine item.
 * @return string Text after tag stripping, entity decoding and cleanText().
 */
function epubHtmlToText(string $html): string
{
    // strip_tags() keeps the *content* of <script>/<style>, which would put
    // CSS/JS source into the embedded text — remove those elements entirely.
    $html = preg_replace('#<(script|style)\b[^>]*>.*?</\1\s*>#is', ' ', $html) ?? $html;

    // strip_tags() joins "</p><p>" into one word; add a space after tags that
    // end a block or force a line break so words stay separated.
    $html = preg_replace(
        '#</(?:p|div|h[1-6]|li|tr|td|th|dt|dd|blockquote|section|article|figcaption|caption|pre|title)\s*>'
        . '|<(?:br|hr)\b[^>]*>#i',
        '$0 ',
        $html
    ) ?? $html;

    $text = strip_tags($html);
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    return cleanText($text);
}

/**
 * Extract chapters/sections from an EPUB as pages.
 *
 * EPUBs are ZIP archives containing XHTML spine items. We:
 *   1. Parse META-INF/container.xml to find the OPF file
 *   2. Parse the OPF manifest + spine for reading order
 *   3. Convert each spine XHTML file to plain text
 *
 * Spine items that are missing from the archive, or whose cleaned text is
 * shorter than 50 bytes, are skipped. The "page" number is the item's
 * 1-based position in the spine.
 *
 * @param string $filePath Path of the EPUB file to read.
 * @return array<int, array{page: int, text: string}>
 * @throws RuntimeException If the archive cannot be opened, the container or
 *                          OPF cannot be read/parsed, or no text is found.
 */
function extractEpubPages(string $filePath): array
{
    $zip = new ZipArchive();
    if ($zip->open($filePath) !== true) {
        throw new RuntimeException("Cannot open EPUB file: $filePath");
    }

    // try/finally guarantees the archive is closed on every exit path.
    try {
        // 1. Locate OPF via container.xml
        $containerXml = $zip->getFromName('META-INF/container.xml');
        if ($containerXml === false) {
            throw new RuntimeException("No META-INF/container.xml found — may not be a valid EPUB");
        }

        $containerXp = loadEpubXml($containerXml);
        $nodes = false;
        if ($containerXp !== null) {
            $containerXp->registerNamespace('c', 'urn:oasis:names:tc:opendocument:xmlns:container');
            $nodes = $containerXp->query('//c:rootfile/@full-path');
        }
        if (!$nodes || $nodes->length === 0) {
            throw new RuntimeException("Cannot find OPF path in container.xml");
        }

        $opfPath = $nodes->item(0)->nodeValue;
        $opfDir  = dirname($opfPath);
        if ($opfDir === '.') {
            $opfDir = '';
        }

        // 2. Parse OPF for spine order
        $opfXml = $zip->getFromName($opfPath);
        if ($opfXml === false) {
            throw new RuntimeException("Cannot read OPF file: $opfPath");
        }
        $opfXp = loadEpubXml($opfXml);
        if ($opfXp === null) {
            throw new RuntimeException("Cannot parse OPF file: $opfPath");
        }

        // Build manifest: id -> href
        $manifest = [];
        foreach ($opfXp->query('//*[local-name()="item"]') as $item) {
            $manifest[$item->getAttribute('id')] = $item->getAttribute('href');
        }

        // Spine: ordered list of hrefs, resolved through the manifest
        $spineHrefs = [];
        foreach ($opfXp->query('//*[local-name()="itemref"]') as $ref) {
            $idref = $ref->getAttribute('idref');
            if (isset($manifest[$idref])) {
                $spineHrefs[] = $manifest[$idref];
            }
        }

        // 3. Extract text from each spine item
        $result = [];
        foreach ($spineHrefs as $i => $href) {
            // Cut the fragment off *before* decoding, so an encoded "%23" that
            // belongs to the file name is not mistaken for a fragment marker.
            $fragmentPos = strpos($href, '#');
            if ($fragmentPos !== false) {
                $href = substr($href, 0, $fragmentPos);
            }
            // rawurldecode(), not urldecode(): a literal "+" in a path is a
            // plus sign, not an encoded space.
            $href = rawurldecode($href);

            $fullPath = $opfDir !== '' ? $opfDir . '/' . $href : $href;
            $html = $zip->getFromName($fullPath);
            if ($html === false) {
                continue;
            }

            $text = epubHtmlToText($html);
            if (strlen($text) >= 50) {
                $result[] = ['page' => $i + 1, 'text' => $text];
            }
        }
    } finally {
        $zip->close();
    }

    if (empty($result)) {
        throw new RuntimeException("No readable text found in EPUB — may be image-only or DRM-protected");
    }

    return $result;
}

/**
 * Call Ollama to embed text. Tries the newer /api/embed endpoint first
 * (Ollama >= 0.1.26, uses "input" key, returns "embeddings" array),
 * then falls back to the legacy /api/embeddings (uses "prompt" key,
 * returns "embedding" array).
 *
 * @param string $text Text to embed.
 * @return array|null The embedding vector, or null on failure.
 */
function getEmbedding(string $text): ?array
{
    [$embedding] = getEmbeddingDebug($text);
    return $embedding;
}

/**
 * POST a JSON body to an Ollama endpoint (relative to OLLAMA_HOST).
 *
 * @param string $endpoint Path such as "/api/embed".
 * @param array  $body     Data to JSON-encode as the request body.
 * @return array{0: string|false, 1: int, 2: string} [response body or false,
 *               HTTP status code (0 if no response), error message or ''].
 */
function ollamaPostJson(string $endpoint, array $body): array
{
    // Substitute invalid UTF-8 rather than letting json_encode() return false,
    // which would otherwise be posted as the request body.
    $payload = json_encode($body, JSON_INVALID_UTF8_SUBSTITUTE);
    if ($payload === false) {
        return [false, 0, 'JSON encode failed: ' . json_last_error_msg()];
    }

    $ch = curl_init(OLLAMA_HOST . $endpoint);
    if ($ch === false) {
        return [false, 0, 'curl_init failed'];
    }

    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $payload,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 30,
        CURLOPT_CONNECTTIMEOUT => 5,
    ]);
    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $error    = curl_error($ch);
    curl_close($ch);

    return [$response, $httpCode, $error];
}

/**
 * Same as getEmbedding() but returns [embedding|null, apiUsed, rawResponse, httpCode]
 * for diagnostic output.
 *
 * On failure of both endpoints, apiUsed is 'both failed', and rawResponse /
 * httpCode come from the legacy call when it produced any, otherwise from the
 * new-API call. If neither call returned a body, rawResponse carries the last
 * transport error (prefixed "cURL error: ") so the failure is diagnosable.
 *
 * @param string $text Text to embed.
 * @return array{0: ?array, 1: string, 2: string, 3: int}
 */
function getEmbeddingDebug(string $text): array
{
    // [endpoint, request key carrying the text, label reported to the caller]
    $attempts = [
        ['/api/embed',      'input',  '/api/embed (new)'],
        ['/api/embeddings', 'prompt', '/api/embeddings (legacy)'],
    ];

    $lastResp  = '';
    $lastCode  = 0;
    $lastError = '';

    foreach ($attempts as [$endpoint, $textKey, $label]) {
        [$response, $httpCode, $error] = ollamaPostJson(
            $endpoint,
            ['model' => EMBED_MODEL, $textKey => $text]
        );

        if (is_string($response) && $response !== '' && $httpCode === 200) {
            $data = json_decode($response, true);
            if (is_array($data)) {
                // New API returns { "embeddings": [[...]] },
                // legacy API returns { "embedding": [...] }.
                $emb = $textKey === 'input'
                    ? ($data['embeddings'][0] ?? null)
                    : ($data['embedding'] ?? null);
                if (is_array($emb) && count($emb) > 0) {
                    return [$emb, $label, $response, $httpCode];
                }
            }
        }

        // Remember the most recent non-empty diagnostics (later attempt wins).
        if ($response) {
            $lastResp = $response;
        }
        if ($httpCode) {
            $lastCode = $httpCode;
        }
        if ($error !== '') {
            $lastError = $error;
        }
    }

    if ($lastResp === '' && $lastError !== '') {
        $lastResp = "cURL error: $lastError";
    }

    return [null, 'both failed', $lastResp, $lastCode];
}