ingest_knowledge.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. <?php
  2. /**
  3. * tools/ingest_knowledge.php
  4. *
  5. * CLI script: ingests soil science PDF books into the knowledge_chunks table.
  6. * Each page is split into overlapping chunks, embedded via Ollama, and stored.
  7. *
  8. * Usage:
  9. * php tools/ingest_knowledge.php --file="path/to/book.pdf" --author="William A. Albrecht"
  10. * php tools/ingest_knowledge.php --dir="path/to/books/" --author="Various"
  11. * php tools/ingest_knowledge.php --list (show all indexed sources)
  12. * php tools/ingest_knowledge.php --clear="Book Title" (remove a source)
  13. *
  14. * Requirements:
  15. * composer require smalot/pdfparser
  16. * Ollama running with nomic-embed-text pulled:
  17. * ollama pull nomic-embed-text
  18. *
  19. * The embedding model (nomic-embed-text) produces 768-dimensional vectors.
  20. * Each chunk is ~500 words with an 80-word overlap to preserve context across boundaries.
  21. */
  // ── Must run from CLI ────────────────────────────────────────────────────────
  // This tool reads local files and runs long embedding jobs; refuse any web SAPI.
  if (PHP_SAPI !== 'cli') {
      die("This script must be run from the command line.\n");
  }

  // Project root is the parent of the directory that holds this script.
  define('ROOT', dirname(__DIR__));
  require ROOT . '/vendor/autoload.php';
  require ROOT . '/config/database.php';

  use Smalot\PdfParser\Parser;

  // ── Config ───────────────────────────────────────────────────────────────────
  // Embedding endpoint. Set the OLLAMA_EMBED_URL environment variable to target a
  // different Ollama host; when it is unset or empty the original address is used.
  define('OLLAMA_EMBED_URL', getenv('OLLAMA_EMBED_URL') ?: 'http://192.168.8.73:11434/api/embeddings');
  define('EMBED_MODEL', 'nomic-embed-text');
  define('CHUNK_WORDS', 500); // target words per chunk
  define('OVERLAP_WORDS', 80); // words shared between consecutive chunks
  // ── Parse args ───────────────────────────────────────────────────────────────
  $opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'help']);

  // Help is printed on explicit request, or when no operating mode was selected.
  $helpRequested = isset($opts['help']);
  $noModeGiven = empty($opts['file'])
      && empty($opts['dir'])
      && !isset($opts['list'])
      && empty($opts['clear']);

  if ($helpRequested || $noModeGiven) {
      // Joined with "\n" and no trailing newline.
      echo implode("\n", [
          'Usage:',
          'php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"',
          'php tools/ingest_knowledge.php --dir="books/" --author="Various"',
          'php tools/ingest_knowledge.php --list',
          'php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"',
          'Options:',
          '--file Path to a single PDF file',
          '--dir Path to a directory of PDF files (processed recursively)',
          '--author Author name to tag all chunks from this run',
          '--list List all indexed sources with chunk counts',
          '--clear Remove all chunks from a named source',
      ]);
      exit(0);
  }

  // Database handle shared by every mode below.
  $pdo = getDBConnection();
  // ── List mode ────────────────────────────────────────────────────────────────
  // Prints one line per (source, author) pair with its chunk count and the most
  // recent created_at value, then exits.
  if (isset($opts['list'])) {
      $sql = "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at\n"
          . "FROM knowledge_chunks GROUP BY source, author ORDER BY source";
      $sources = $pdo->query($sql)->fetchAll(PDO::FETCH_ASSOC);

      if (!$sources) {
          echo "No sources indexed yet.\n";
          exit(0);
      }

      printf("%-55s %-25s %6s %s\n", 'Source', 'Author', 'Chunks', 'Indexed');
      echo str_repeat('-', 100) . "\n";

      foreach ($sources as $row) {
          // Truncate long values so the columns stay aligned.
          $name = substr($row['source'], 0, 54);
          $by = substr($row['author'], 0, 24);
          printf("%-55s %-25s %6d %s\n", $name, $by, $row['chunks'], $row['indexed_at']);
      }
      exit(0);
  }
  // ── Clear mode ───────────────────────────────────────────────────────────────
  // Deletes every chunk whose source equals the given title, reporting how many
  // rows existed beforehand, then exits.
  if (!empty($opts['clear'])) {
      $sourceName = $opts['clear'];

      $countStmt = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
      $countStmt->execute([$sourceName]);
      $existing = (int) $countStmt->fetchColumn();

      if ($existing !== 0) {
          $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?')
              ->execute([$sourceName]);
          echo "Deleted $existing chunks for: $sourceName\n";
      } else {
          echo "No chunks found for source: $sourceName\n";
      }
      exit(0);
  }
  // ── Collect PDF files ────────────────────────────────────────────────────────
  // Builds $files (paths to ingest) from --file and/or --dir, and $author (the
  // tag stored with every chunk from this run). Fatal input errors go to STDERR
  // with exit status 1 so calling shells and cron jobs can detect the failure.
  $files = [];
  $author = trim($opts['author'] ?? 'Unknown');
  if ($author === '') {
      // --author="" would otherwise store a blank author on every chunk.
      $author = 'Unknown';
  }

  if (!empty($opts['file'])) {
      $path = $opts['file'];
      if (!is_file($path)) {
          fwrite(STDERR, "File not found: $path\n");
          exit(1);
      }
      $files[] = $path;
  }

  if (!empty($opts['dir'])) {
      $dir = rtrim($opts['dir'], '/\\');
      if (!is_dir($dir)) {
          fwrite(STDERR, "Directory not found: $dir\n");
          exit(1);
      }

      // CATCH_GET_CHILD: skip sub-directories that cannot be opened instead of
      // aborting the whole scan with an uncaught exception.
      $it = new RecursiveIteratorIterator(
          new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS),
          RecursiveIteratorIterator::LEAVES_ONLY,
          RecursiveIteratorIterator::CATCH_GET_CHILD
      );

      $found = [];
      foreach ($it as $f) {
          if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') {
              $found[] = $f->getPathname();
          }
      }

      // Directory iteration order is filesystem-dependent; sort for a stable run order.
      sort($found, SORT_STRING);
      $files = array_merge($files, $found);

      if (!$files) {
          fwrite(STDERR, "No PDF files found in: $dir\n");
          exit(1);
      }
  }

  echo "Found " . count($files) . " PDF file(s) to ingest.\n\n";
  // ── Process each file ────────────────────────────────────────────────────────
  // For every PDF: skip it if its source name is already indexed, otherwise
  // stream its pages into a rolling word buffer, cut CHUNK_WORDS-sized windows
  // that overlap by OVERLAP_WORDS, embed each window and insert it.
  //
  // Each file is written inside one transaction. A file that fails part-way
  // leaves nothing behind, so a later run is not fooled by the "Already indexed"
  // check into skipping a half-ingested source.
  $parser = new Parser();

  // Statements are prepared once and reused for every file.
  $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
  $insertStmt = $pdo->prepare(
      'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding) '
      . 'VALUES (?, ?, ?, ?, ?, ?)'
  );

  // Window advance per chunk; clamped so a misconfigured overlap cannot stall the loop.
  $step = max(1, CHUNK_WORDS - OVERLAP_WORDS);

  foreach ($files as $filePath) {
      // The file name without extension is the source key used by --list and --clear.
      $source = pathinfo($filePath, PATHINFO_FILENAME);
      echo "Processing: $source\n";

      // Check if already indexed
      $chk->execute([$source]);
      $alreadyIndexed = (int) $chk->fetchColumn() > 0;
      $chk->closeCursor();
      if ($alreadyIndexed) {
          echo " Already indexed — skipping. Use --clear=\"$source\" to re-index.\n\n";
          continue;
      }

      try {
          $pdf = $parser->parseFile($filePath);
          $pages = $pdf->getPages();
      } catch (Throwable $e) {
          // Throwable, not Exception: a malformed PDF may also raise an Error.
          echo " ERROR parsing PDF: " . $e->getMessage() . "\n\n";
          continue;
      }
      echo " Pages: " . count($pages) . "\n";

      $totalChunks = 0;
      $chunkIndex = 0;
      $wordBuffer = [];   // words not yet consumed by the sliding window
      $bufferPages = [];  // page number of each word in $wordBuffer (parallel array)
      $hasSlid = false;   // true once the head of the buffer is overlap from a previous window

      $pdo->beginTransaction();
      try {
          foreach ($pages as $pageNum => $page) {
              $pageText = cleanText($page->getText());
              if (strlen($pageText) < 50) {
                  continue; // skip blank/image-only pages
              }

              // Split on runs of spaces and drop empty tokens so blank "words"
              // never count toward the chunk size.
              $words = preg_split('/ +/', $pageText, -1, PREG_SPLIT_NO_EMPTY) ?: [];
              foreach ($words as $word) {
                  $wordBuffer[] = $word;
                  $bufferPages[] = $pageNum + 1;
              }

              // Flush when buffer reaches chunk size
              while (count($wordBuffer) >= CHUNK_WORDS) {
                  $chunkText = implode(' ', array_slice($wordBuffer, 0, CHUNK_WORDS));
                  $chunkPage = $bufferPages[0]; // page on which the chunk starts

                  if (strlen(trim($chunkText)) > 50) {
                      $embedding = getEmbedding($chunkText);
                      if ($embedding === null) {
                          echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
                      } else {
                          $insertStmt->execute([
                              $source,
                              $author,
                              $chunkPage,
                              $chunkIndex,
                              $chunkText,
                              json_encode($embedding),
                          ]);
                          $chunkIndex++;
                          $totalChunks++;

                          // Report only after a successful insert so the same
                          // milestone is not printed again while chunks fail.
                          if ($chunkIndex % 20 === 0) {
                              echo " ...{$chunkIndex} chunks embedded\n";
                          }
                      }
                  }

                  // Slide window with overlap
                  $wordBuffer = array_slice($wordBuffer, $step);
                  $bufferPages = array_slice($bufferPages, $step);
                  $hasSlid = true;
              }
          }

          // Flush remaining words as final chunk — but only when the tail holds
          // more than 30 words beyond the overlap carried over from the previous
          // window; otherwise it would just repeat text that was already handled.
          $carried = $hasSlid ? CHUNK_WORDS - $step : 0;
          if (count($wordBuffer) - $carried > 30) {
              $chunkText = implode(' ', $wordBuffer);
              $embedding = getEmbedding($chunkText);
              if ($embedding === null) {
                  echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
              } else {
                  $insertStmt->execute([
                      $source, $author, $bufferPages[0] ?? 0, $chunkIndex,
                      $chunkText, json_encode($embedding),
                  ]);
                  $chunkIndex++;
                  $totalChunks++;
              }
          }

          $pdo->commit();
      } catch (Throwable $e) {
          // Text extraction or a database write failed: undo this file and move on.
          if ($pdo->inTransaction()) {
              $pdo->rollBack();
          }
          echo " ERROR ingesting file: " . $e->getMessage() . " — nothing stored for this file.\n\n";
          continue;
      }

      echo " Done: $totalChunks chunks stored.\n\n";
  }

  echo "Ingestion complete.\n";
  exit(0);
  // ── Helpers ──────────────────────────────────────────────────────────────────
  /**
   * Normalise text extracted from a PDF page.
   *
   * Steps, in order:
   *  1. drop bytes that are not part of a well-formed UTF-8 sequence, so the
   *     Unicode-aware patterns below cannot fail and blank the whole page;
   *  2. expand typographic ligatures (U+FB00..U+FB06) to plain letters;
   *  3. turn every run of whitespace into a single space;
   *  4. remove control, format and private-use characters;
   *  5. collapse spaces left adjacent by step 4 and trim.
   *
   * Printable characters outside Latin-1 (dashes, curly quotes, Greek letters,
   * ...) are kept; deleting them would glue neighbouring words together.
   *
   * @param string $text Raw page text; may contain invalid UTF-8.
   * @return string Single-line text with words separated by exactly one space.
   */
  function cleanText(string $text): string
  {
      // preg_match with an empty /u pattern is a cheap UTF-8 validity check.
      if (preg_match('//u', $text) !== 1) {
          // Byte-level grammar of one well-formed UTF-8 character.
          $validChar = '[\x00-\x7F]'
              . '|[\xC2-\xDF][\x80-\xBF]'
              . '|\xE0[\xA0-\xBF][\x80-\xBF]'
              . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
              . '|\xED[\x80-\x9F][\x80-\xBF]'
              . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
              . '|[\xF1-\xF3][\x80-\xBF]{3}'
              . '|\xF4[\x80-\x8F][\x80-\xBF]{2}';
          // Keep each valid character ($1); any other single byte is dropped.
          $text = (string) preg_replace('/(' . $validChar . ')|./s', '$1', $text);
      }

      // Common PDF ligature replacements, keyed by code point so the table
      // survives editors and tools that normalise the glyphs to ASCII.
      $ligatures = [
          "\u{FB00}" => 'ff',
          "\u{FB01}" => 'fi',
          "\u{FB02}" => 'fl',
          "\u{FB03}" => 'ffi',
          "\u{FB04}" => 'ffl',
          "\u{FB05}" => 'st',
          "\u{FB06}" => 'st',
          "\u{017F}t" => 'st', // decomposed form of U+FB05 (long s + t)
      ];
      $text = strtr($text, $ligatures);

      // Whitespace first, so tabs and newlines become spaces before the
      // control-character pass below would delete them.
      $text = (string) preg_replace('/\s+/u', ' ', $text);

      // Remove control (Cc), format (Cf) and private-use (Co) code points.
      $text = (string) preg_replace('/[\p{Cc}\p{Cf}\p{Co}]+/u', '', $text);

      // Removing a character that sat between two spaces leaves a double space.
      $text = (string) preg_replace('/ {2,}/', ' ', $text);

      return trim($text);
  }
  /**
   * Request an embedding for $text from Ollama's /api/embeddings endpoint.
   *
   * Sends {"model": EMBED_MODEL, "prompt": $text} to OLLAMA_EMBED_URL and returns
   * the "embedding" array from the JSON reply. Every failure path returns null
   * and logs its reason through error_log(), so the caller can skip the chunk
   * while the operator can still see what went wrong.
   *
   * @param string $text Chunk text to embed.
   * @return float[]|null Embedding vector, or null on any failure (payload
   *                      cannot be encoded, transport error, non-200 status,
   *                      malformed or empty reply).
   */
  function getEmbedding(string $text): ?array
  {
      // Replace invalid UTF-8 bytes rather than letting json_encode() fail on
      // them; the flag exists from PHP 7.2, older runtimes fall back to none.
      $jsonFlags = defined('JSON_INVALID_UTF8_SUBSTITUTE') ? constant('JSON_INVALID_UTF8_SUBSTITUTE') : 0;
      $payload = json_encode(
          [
              'model' => EMBED_MODEL,
              'prompt' => $text,
          ],
          $jsonFlags
      );
      if ($payload === false) {
          error_log('getEmbedding: cannot encode request: ' . json_last_error_msg());
          return null;
      }

      $ch = curl_init(OLLAMA_EMBED_URL);
      if ($ch === false) {
          error_log('getEmbedding: curl_init failed');
          return null;
      }
      curl_setopt_array($ch, [
          CURLOPT_POST => true,
          CURLOPT_POSTFIELDS => $payload,
          CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
          CURLOPT_RETURNTRANSFER => true,
          CURLOPT_TIMEOUT => 30,
          CURLOPT_CONNECTTIMEOUT => 5,
      ]);

      $response = curl_exec($ch);
      $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
      $transportError = curl_errno($ch) !== 0 ? curl_error($ch) : '';
      // No explicit curl_close(): the handle is released when $ch goes out of
      // scope, and the call has had no effect since PHP 8.0.

      if (!$response || $httpCode !== 200) {
          error_log(sprintf(
              'getEmbedding: request failed (HTTP %d%s)',
              $httpCode,
              $transportError !== '' ? ', ' . $transportError : ''
          ));
          return null;
      }

      $data = json_decode($response, true);
      $embedding = is_array($data) ? ($data['embedding'] ?? null) : null;
      if (!is_array($embedding) || count($embedding) === 0) {
          error_log('getEmbedding: response has no usable "embedding" array');
          return null;
      }

      // Reject a vector with non-numeric entries instead of storing it.
      foreach ($embedding as $component) {
          if (!is_int($component) && !is_float($component)) {
              error_log('getEmbedding: embedding contains a non-numeric value');
              return null;
          }
      }

      return $embedding;
  }