query( "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at FROM knowledge_chunks GROUP BY source, author ORDER BY source" ); $rows = $stmt->fetchAll(PDO::FETCH_ASSOC); if (!$rows) { echo "No sources indexed yet.\n"; } else { printf("%-55s %-25s %6s %s\n", 'Source', 'Author', 'Chunks', 'Indexed'); echo str_repeat('-', 100) . "\n"; foreach ($rows as $r) { printf("%-55s %-25s %6d %s\n", substr($r['source'], 0, 54), substr($r['author'], 0, 24), $r['chunks'], $r['indexed_at'] ); } } exit(0); } // ── Clear mode ─────────────────────────────────────────────────────────────── if (!empty($opts['clear'])) { $title = $opts['clear']; $stmt = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?'); $stmt->execute([$title]); $count = (int)$stmt->fetchColumn(); if ($count === 0) { echo "No chunks found for source: $title\n"; exit(0); } $del = $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?'); $del->execute([$title]); echo "Deleted $count chunks for: $title\n"; exit(0); } // ── Collect PDF files ──────────────────────────────────────────────────────── $files = []; $author = trim($opts['author'] ?? 'Unknown'); if (!empty($opts['file'])) { $path = $opts['file']; if (!is_file($path)) { die("File not found: $path\n"); } $files[] = $path; } if (!empty($opts['dir'])) { $dir = rtrim($opts['dir'], '/\\'); if (!is_dir($dir)) { die("Directory not found: $dir\n"); } $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir)); foreach ($it as $f) { if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') { $files[] = $f->getPathname(); } } if (!$files) { die("No PDF files found in: $dir\n"); } } echo "Found " . count($files) . 
" PDF file(s) to ingest.\n\n"; // ── Process each file ──────────────────────────────────────────────────────── $parser = new Parser(); foreach ($files as $filePath) { $source = pathinfo($filePath, PATHINFO_FILENAME); echo "Processing: $source\n"; // Check if already indexed $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?'); $chk->execute([$source]); if ((int)$chk->fetchColumn() > 0) { echo " Already indexed — skipping. Use --clear=\"$source\" to re-index.\n\n"; continue; } try { $pdf = $parser->parseFile($filePath); $pages = $pdf->getPages(); } catch (Exception $e) { echo " ERROR parsing PDF: " . $e->getMessage() . "\n\n"; continue; } echo " Pages: " . count($pages) . "\n"; $totalChunks = 0; $totalTokens = 0; $pageBuffer = []; // accumulate pages into a rolling word buffer $insertStmt = $pdo->prepare( 'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding) VALUES (?, ?, ?, ?, ?, ?)' ); $chunkIndex = 0; $wordBuffer = []; $bufferPages = []; // page numbers corresponding to words in buffer foreach ($pages as $pageNum => $page) { $pageText = cleanText($page->getText()); if (strlen($pageText) < 50) continue; // skip blank/image-only pages $words = explode(' ', $pageText); foreach ($words as $word) { $wordBuffer[] = $word; $bufferPages[] = $pageNum + 1; } // Flush when buffer reaches chunk size while (count($wordBuffer) >= CHUNK_WORDS) { $chunkWords = array_slice($wordBuffer, 0, CHUNK_WORDS); $chunkText = implode(' ', $chunkWords); $chunkPage = $bufferPages[0]; if (strlen(trim($chunkText)) > 50) { $embedding = getEmbedding($chunkText); if ($embedding === null) { echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n"; } else { $insertStmt->execute([ $source, $author, $chunkPage, $chunkIndex, $chunkText, json_encode($embedding), ]); $chunkIndex++; $totalChunks++; } } // Slide window with overlap $step = CHUNK_WORDS - OVERLAP_WORDS; $wordBuffer = array_slice($wordBuffer, $step); $bufferPages = 
array_slice($bufferPages, $step);

            // Progress line whenever the running chunk index is a non-zero multiple of 20.
            if ($chunkIndex % 20 === 0 && $chunkIndex > 0) {
                echo " ...{$chunkIndex} chunks embedded\n";
            }
        }
    }

    // Flush remaining words as final chunk (only when more than 30 words are left over).
    if (count($wordBuffer) > 30) {
        $chunkText = implode(' ', $wordBuffer);
        $embedding = getEmbedding($chunkText);
        if ($embedding !== null) {
            $insertStmt->execute([
                $source,
                $author,
                $bufferPages[0] ?? 0, // page of the first buffered word; 0 when none was recorded
                $chunkIndex,
                $chunkText,
                json_encode($embedding),
            ]);
            $chunkIndex++;
            $totalChunks++;
        } else {
            // Report the dropped tail chunk the same way the main loop reports failures,
            // instead of discarding it silently.
            echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
        }
    }

    echo " Done: $totalChunks chunks stored.\n\n";
}

echo "Ingestion complete.\n";
exit(0);

// ── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Normalise text extracted from a PDF page so it can be split into words.
 *
 * Steps, in order:
 *  1. Expand typographic ligature code points (U+FB00–U+FB06) and the
 *     long-s + t pair into plain letters.
 *  2. Collapse every whitespace run (including newlines) into one space.
 *  3. Delete every character that is not tab, LF, CR, printable ASCII or in
 *     the range U+00A0–U+00FF. If the input is not valid UTF-8 the Unicode
 *     filter cannot run, so only tab, LF, CR and printable ASCII are kept.
 *  4. Collapse space runs left behind by step 3 and trim both ends.
 *
 * @param string $text Raw page text.
 * @return string Cleaned text; an empty string when nothing printable remains.
 */
function cleanText(string $text): string
{
    // Keys are written as \u{...} escapes so the table cannot be turned into
    // identity mappings by tooling that normalises ligature glyphs.
    $ligatures = [
        "\u{FB00}"  => 'ff',
        "\u{FB01}"  => 'fi',
        "\u{FB02}"  => 'fl',
        "\u{FB03}"  => 'ffi',
        "\u{FB04}"  => 'ffl',
        "\u{FB05}"  => 'st',
        "\u{FB06}"  => 'st',
        "\u{017F}t" => 'st',
    ];
    $text = strtr($text, $ligatures);

    // Collapse multiple spaces / newlines into a single space.
    $text = preg_replace('/\s+/', ' ', $text) ?? $text;

    // Drop characters outside the allowed ranges. With the /u modifier
    // preg_replace() returns null for malformed UTF-8 input; fall back to a
    // byte-wise ASCII filter so the page is not lost entirely.
    $filtered = preg_replace('/[^\x09\x0A\x0D\x20-\x7E\xA0-\xFF]/u', '', $text);
    if ($filtered === null) {
        $filtered = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]/', '', $text) ?? '';
    }

    // Removing a character that sat between two spaces leaves a double space,
    // which the caller's explode(' ') would turn into an empty word.
    $filtered = preg_replace('/ {2,}/', ' ', $filtered) ?? $filtered;

    return trim($filtered);
}

/**
 * Request an embedding for $text from the endpoint named by OLLAMA_EMBED_URL.
 *
 * Sends a JSON POST body with the keys 'model' (EMBED_MODEL) and 'prompt'
 * ($text) and reads the 'embedding' key of the JSON reply. Both constants are
 * defined outside this function.
 *
 * @param string $text Text to embed.
 * @return array|null The decoded 'embedding' array (element types are not
 *                    validated here), or null when the payload cannot be
 *                    encoded, the cURL handle cannot be created, the transfer
 *                    fails or returns an empty body, the HTTP status is not
 *                    200, or the reply has no non-empty 'embedding' array.
 */
function getEmbedding(string $text): ?array
{
    $payload = json_encode([
        'model'  => EMBED_MODEL,
        'prompt' => $text,
    ]);
    if ($payload === false) {
        // json_encode() fails on e.g. malformed UTF-8; do not POST an empty body.
        return null;
    }

    $ch = curl_init(OLLAMA_EMBED_URL);
    if ($ch === false) {
        return null;
    }

    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $payload,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 30,
        CURLOPT_CONNECTTIMEOUT => 5,
    ]);

    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if (!$response || $httpCode !== 200) {
        return null;
    }

    $data = json_decode($response, true);
    if (!is_array($data)) {
        // Reply was not a JSON object/array (or not JSON at all).
        return null;
    }

    $embedding = $data['embedding'] ?? null;
    if (!is_array($embedding) || count($embedding) === 0) {
        return null;
    }

    return $embedding;
}