| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297 |
- <?php
- /**
- * tools/ingest_knowledge.php
- *
- * CLI script: ingests soil science PDF books into the knowledge_chunks table.
- * Each page is split into overlapping chunks, embedded via Ollama, and stored.
- *
- * Usage:
- * php tools/ingest_knowledge.php --file="path/to/book.pdf" --author="William A. Albrecht"
- * php tools/ingest_knowledge.php --dir="path/to/books/" --author="Various"
- * php tools/ingest_knowledge.php --list (show all indexed sources)
- * php tools/ingest_knowledge.php --clear="Book Title" (remove a source)
- *
- * Requirements:
- * composer require smalot/pdfparser
- * Ollama running with nomic-embed-text pulled:
- * ollama pull nomic-embed-text
- *
- * The embedding model (nomic-embed-text) produces 768-dimensional vectors.
- * Each chunk is ~500 words with an 80-word overlap to preserve context across boundaries.
- */
// ── Must run from CLI ────────────────────────────────────────────────────────
if (PHP_SAPI !== 'cli') {
    die("This script must be run from the command line.\n");
}

define('ROOT', dirname(__DIR__));

require ROOT . '/vendor/autoload.php';
require ROOT . '/config/database.php';

use Smalot\PdfParser\Parser;

// ── Config ───────────────────────────────────────────────────────────────────
define('OLLAMA_EMBED_URL', 'http://192.168.8.73:11434/api/embeddings');
define('EMBED_MODEL', 'nomic-embed-text');
define('CHUNK_WORDS', 500);   // target words per chunk
define('OVERLAP_WORDS', 80);  // overlap between consecutive chunks

// Abort the run: the message goes to STDERR and the exit status is non-zero,
// so shell pipelines (`&&`, `set -e`, cron) can tell that the run failed.
$fail = static function (string $message): void {
    fwrite(STDERR, $message);
    exit(1);
};

// The sliding window advances by CHUNK_WORDS - OVERLAP_WORDS words per chunk;
// a non-positive step would make the flush loop below spin forever.
if (OVERLAP_WORDS < 0 || OVERLAP_WORDS >= CHUNK_WORDS) {
    $fail("OVERLAP_WORDS must be at least 0 and smaller than CHUNK_WORDS.\n");
}

// ── Parse args ───────────────────────────────────────────────────────────────
$opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'help']) ?: [];

if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']))) {
    echo <<<HELP
Usage:
php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
php tools/ingest_knowledge.php --dir="books/" --author="Various"
php tools/ingest_knowledge.php --list
php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
Options:
--file Path to a single PDF file
--dir Path to a directory of PDF files (processed recursively)
--author Author name to tag all chunks from this run
--list List all indexed sources with chunk counts
--clear Remove all chunks from a named source
HELP;
    exit(0);
}

// getopt() returns an array when an option is repeated; this flattens a value
// option to the list of its non-empty values so repeated flags are handled
// instead of raising a TypeError further down.
$optionValues = static function (array $options, string $name): array {
    $values = [];
    foreach ((array) ($options[$name] ?? []) as $value) {
        if (is_string($value) && !empty($value)) {
            $values[] = $value;
        }
    }
    return $values;
};

$pdo = getDBConnection();

// ── List mode ────────────────────────────────────────────────────────────────
if (isset($opts['list'])) {
    $stmt = $pdo->query(
        "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at
FROM knowledge_chunks GROUP BY source, author ORDER BY source"
    );
    $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

    if (!$rows) {
        echo "No sources indexed yet.\n";
    } else {
        printf("%-55s %-25s %6s %s\n", 'Source', 'Author', 'Chunks', 'Indexed');
        echo str_repeat('-', 100) . "\n";
        foreach ($rows as $r) {
            // Casts keep a NULL column from triggering a deprecation in substr().
            printf("%-55s %-25s %6d %s\n",
                substr((string) $r['source'], 0, 54),
                substr((string) $r['author'], 0, 24),
                $r['chunks'],
                (string) $r['indexed_at']
            );
        }
    }
    exit(0);
}

// ── Clear mode ───────────────────────────────────────────────────────────────
if (!empty($opts['clear'])) {
    $del = $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?');
    foreach ($optionValues($opts, 'clear') as $title) {
        // One round-trip: the DELETE's affected-row count is the chunk count.
        $del->execute([$title]);
        $count = $del->rowCount();
        if ($count === 0) {
            echo "No chunks found for source: $title\n";
        } else {
            echo "Deleted $count chunks for: $title\n";
        }
    }
    exit(0);
}

// ── Collect PDF files ────────────────────────────────────────────────────────
$files = [];

// A repeated --author yields an array; the last occurrence wins.
$rawAuthor = $opts['author'] ?? 'Unknown';
if (is_array($rawAuthor)) {
    $rawAuthor = end($rawAuthor);
}
$author = trim((string) $rawAuthor);

foreach ($optionValues($opts, 'file') as $path) {
    if (!is_file($path)) {
        $fail("File not found: $path\n");
    }
    $files[] = $path;
}

foreach ($optionValues($opts, 'dir') as $dirArg) {
    $dir = rtrim($dirArg, '/\\');
    if (!is_dir($dir)) {
        $fail("Directory not found: $dir\n");
    }

    $found = [];
    $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));
    foreach ($it as $f) {
        if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') {
            $found[] = $f->getPathname();
        }
    }
    // Directory iteration order is filesystem-dependent; sort for repeatable runs.
    sort($found);
    foreach ($found as $pdfPath) {
        $files[] = $pdfPath;
    }

    if (!$files) {
        $fail("No PDF files found in: $dir\n");
    }
}

echo "Found " . count($files) . " PDF file(s) to ingest.\n\n";

// ── Process each file ────────────────────────────────────────────────────────
$parser = new Parser();
$step   = CHUNK_WORDS - OVERLAP_WORDS;

// Prepared once and reused for every file.
$existsStmt = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
$insertStmt = $pdo->prepare(
    'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding)
VALUES (?, ?, ?, ?, ?, ?)'
);

// Embed one chunk and insert it; returns false when the embedding call failed
// (nothing is inserted in that case).
$storeChunk = static function (string $source, string $author, int $page, int $index, string $text) use ($insertStmt): bool {
    $embedding = getEmbedding($text);
    if ($embedding === null) {
        return false;
    }
    $insertStmt->execute([$source, $author, $page, $index, $text, json_encode($embedding)]);
    return true;
};

foreach ($files as $filePath) {
    $source = pathinfo($filePath, PATHINFO_FILENAME);
    echo "Processing: $source\n";

    // Check if already indexed
    $existsStmt->execute([$source]);
    $alreadyIndexed = (int) $existsStmt->fetchColumn() > 0;
    $existsStmt->closeCursor();
    if ($alreadyIndexed) {
        echo " Already indexed — skipping. Use --clear=\"$source\" to re-index.\n\n";
        continue;
    }

    // Throwable rather than Exception: a malformed PDF can also surface as an
    // Error inside the parser, and one bad file should not end the whole batch.
    try {
        $pdf   = $parser->parseFile($filePath);
        $pages = $pdf->getPages();
    } catch (Throwable $e) {
        echo " ERROR parsing PDF: " . $e->getMessage() . "\n\n";
        continue;
    }
    echo " Pages: " . count($pages) . "\n";

    $chunkIndex   = 0;
    $wordBuffer   = [];  // rolling window of words awaiting chunking
    $bufferPages  = [];  // page number of each word in $wordBuffer
    $carriedWords = 0;   // leading buffer words already stored in the previous chunk

    // All chunks of one file are written in a single transaction. If text
    // extraction or an insert fails part-way, the rollback prevents a partial
    // source that later runs would skip as "Already indexed".
    $pdo->beginTransaction();
    try {
        foreach ($pages as $pageNum => $page) {
            $pageText = cleanText($page->getText());
            if (strlen($pageText) < 50) {
                continue; // skip blank/image-only pages
            }

            // NO_EMPTY: consecutive spaces must not count as words.
            $words = preg_split('/ +/', $pageText, -1, PREG_SPLIT_NO_EMPTY) ?: [];
            foreach ($words as $word) {
                $wordBuffer[]  = $word;
                $bufferPages[] = (int) $pageNum + 1;
            }

            // Flush when buffer reaches chunk size
            while (count($wordBuffer) >= CHUNK_WORDS) {
                $chunkText = implode(' ', array_slice($wordBuffer, 0, CHUNK_WORDS));
                $chunkPage = $bufferPages[0];
                $stored    = false;

                if (strlen(trim($chunkText)) > 50) {
                    $stored = $storeChunk($source, $author, $chunkPage, $chunkIndex, $chunkText);
                    if (!$stored) {
                        echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
                    } else {
                        $chunkIndex++;
                        // Report only right after a successful store so a failed
                        // chunk cannot repeat the same progress line.
                        if ($chunkIndex % 20 === 0) {
                            echo " ...{$chunkIndex} chunks embedded\n";
                        }
                    }
                }

                // Slide window with overlap
                $wordBuffer   = array_slice($wordBuffer, $step);
                $bufferPages  = array_slice($bufferPages, $step);
                $carriedWords = $stored ? OVERLAP_WORDS : 0;
            }
        }

        // Flush remaining words as final chunk, unless the buffer holds nothing
        // but the overlap already stored at the end of the previous chunk.
        $freshWords = count($wordBuffer) - $carriedWords;
        if (count($wordBuffer) > 30 && $freshWords > 0) {
            $chunkText = implode(' ', $wordBuffer);
            if ($storeChunk($source, $author, $bufferPages[0] ?? 0, $chunkIndex, $chunkText)) {
                $chunkIndex++;
            } else {
                echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
            }
        }

        $pdo->commit();
    } catch (Throwable $e) {
        if ($pdo->inTransaction()) {
            $pdo->rollBack();
        }
        echo " ERROR ingesting $source: " . $e->getMessage() . " — nothing stored for this file.\n\n";
        continue;
    }

    echo " Done: {$chunkIndex} chunks stored.\n\n";
}

echo "Ingestion complete.\n";
exit(0);
- // ── Helpers ──────────────────────────────────────────────────────────────────
/**
 * Normalise text extracted from a PDF page.
 *
 * Steps, in order:
 *  1. expand typographic ligatures to their plain-letter spelling;
 *  2. repair malformed UTF-8 so the Unicode-aware patterns cannot fail;
 *  3. turn every run of whitespace (including newlines and no-break spaces)
 *     into a single space;
 *  4. delete control, format and private-use characters, while keeping all
 *     printable Unicode (dashes, curly quotes, Greek letters, subscripts, …);
 *  5. re-collapse spaces left behind by step 4 and trim the ends.
 *
 * @param string $text Raw page text.
 * @return string Single-line text whose words are separated by exactly one space.
 */
function cleanText(string $text): string
{
    // Written as code-point escapes so the keys stay real ligature characters
    // even if this file passes through a tool that normalises Unicode.
    $ligatures = [
        "\u{FB00}"  => 'ff',
        "\u{FB01}"  => 'fi',
        "\u{FB02}"  => 'fl',
        "\u{FB03}"  => 'ffi',
        "\u{FB04}"  => 'ffl',
        "\u{FB05}"  => 'st',
        "\u{FB06}"  => 'st',
        "\u{017F}t" => 'st', // long s + t, the decomposed form of U+FB05
    ];
    $text = strtr($text, $ligatures);

    // preg_* with the /u modifier returns null on malformed UTF-8, which would
    // silently blank the whole page; repair the input first.
    if (preg_match('//u', $text) !== 1) {
        $text = function_exists('mb_scrub')
            ? mb_scrub($text, 'UTF-8')
            : (string) preg_replace('/[\x80-\xFF]+/', '', $text);
    }

    // Whitespace goes first: newlines and tabs are control characters too and
    // must become spaces rather than be deleted by the next step.
    $text = preg_replace('/[\s\p{Z}]+/u', ' ', $text) ?? $text;

    // Remove non-printable characters: controls (Cc), invisible format marks
    // such as soft hyphens and zero-width spaces (Cf), and private-use glyphs (Co).
    $text = preg_replace('/[\p{Cc}\p{Cf}\p{Co}]+/u', '', $text) ?? $text;

    // Deleting a character that sat between two spaces leaves a double space.
    $text = preg_replace('/ {2,}/', ' ', $text) ?? $text;

    return trim($text);
}
/**
 * Request an embedding vector for $text from the Ollama embeddings endpoint.
 *
 * Sends a JSON POST ({"model": EMBED_MODEL, "prompt": $text}) to
 * OLLAMA_EMBED_URL and returns the "embedding" array from the JSON response.
 *
 * @param string $text Text to embed.
 * @return array|null The embedding, or null on any failure: text that cannot
 *                    be JSON-encoded, cURL initialisation or transport error,
 *                    non-200 status, undecodable body, or a missing or empty
 *                    "embedding" field.
 */
function getEmbedding(string $text): ?array
{
    $payload = json_encode([
        'model'  => EMBED_MODEL,
        'prompt' => $text,
    ]);
    // json_encode() returns false for e.g. malformed UTF-8; posting that would
    // send a meaningless body, so fail before touching the network.
    if ($payload === false) {
        return null;
    }

    $ch = curl_init(OLLAMA_EMBED_URL);
    if ($ch === false) {
        return null;
    }

    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $payload,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 30,
        CURLOPT_CONNECTTIMEOUT => 5,
    ]);
    $response = curl_exec($ch);
    $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // curl_exec() yields false on transport errors; an empty body is useless too.
    if (!is_string($response) || $response === '' || $httpCode !== 200) {
        return null;
    }

    $data = json_decode($response, true);
    if (!is_array($data)) {
        return null;
    }

    $embedding = $data['embedding'] ?? null;
    if (!is_array($embedding) || $embedding === []) {
        return null;
    }

    return $embedding;
}
|