ingest_knowledge.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. <?php
  2. /**
  3. * tools/ingest_knowledge.php
  4. *
  5. * CLI script: ingests soil science PDF books into the knowledge_chunks table.
  6. * Each page is split into overlapping chunks, embedded via Ollama, and stored.
  7. *
  8. * Usage:
  9. * php tools/ingest_knowledge.php --file="path/to/book.pdf" --author="William A. Albrecht"
  10. * php tools/ingest_knowledge.php --dir="path/to/books/" --author="Various"
  11. * php tools/ingest_knowledge.php --list (show all indexed sources)
  12. * php tools/ingest_knowledge.php --clear="Book Title" (remove a source)
  13. *
  14. * Requirements:
  15. * composer require smalot/pdfparser
  16. * Ollama running with nomic-embed-text pulled:
  17. * ollama pull nomic-embed-text
  18. *
  19. * The embedding model (nomic-embed-text) produces 768-dimensional vectors.
  20. * Each chunk is ~500 words with an 80-word overlap to preserve context across boundaries.
  21. */
  // ── Must run from CLI ────────────────────────────────────────────────────────
  // Checked first: everything below relies on getopt() and plain-text echo output.
  if (PHP_SAPI !== 'cli') {
      die("This script must be run from the command line.\n");
  }

  // Project root is the parent of the directory holding this script.
  define('ROOT', dirname(__DIR__));
  require ROOT . '/vendor/autoload.php';
  require ROOT . '/config/database.php';

  use Smalot\PdfParser\Parser;

  // ── Config ───────────────────────────────────────────────────────────────────
  // The endpoint and model keep their original defaults but can be overridden
  // per run through the environment, e.g.
  //   OLLAMA_BASE_URL=http://localhost:11434 php tools/ingest_knowledge.php --test
  // OLLAMA_BASE_URL must include the scheme; a trailing slash is removed because
  // request paths such as '/api/embed' are appended directly to this constant.
  define('OLLAMA_HOST', rtrim(getenv('OLLAMA_BASE_URL') ?: 'http://192.168.8.73:11434', '/'));
  define('EMBED_MODEL', getenv('OLLAMA_EMBED_MODEL') ?: 'nomic-embed-text');
  define('CHUNK_WORDS', 500);   // target words per chunk
  define('OVERLAP_WORDS', 80);  // words shared between consecutive chunks
  // ── Parse args ───────────────────────────────────────────────────────────────
  // Long options only; a trailing ':' marks an option that requires a value.
  $opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help']);

  // At least one action must be requested; --author alone does nothing.
  $hasAction = !empty($opts['file'])
      || !empty($opts['dir'])
      || isset($opts['list'])
      || !empty($opts['clear'])
      || isset($opts['test']);

  if (isset($opts['help']) || !$hasAction) {
      // Every option accepted by getopt() above is listed here, including
      // --test and --help.
      echo <<<HELP
      Usage:
        php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
        php tools/ingest_knowledge.php --dir="books/" --author="Various"
        php tools/ingest_knowledge.php --list
        php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
        php tools/ingest_knowledge.php --test (verify Ollama connection + embedding)
      Options:
        --file    Path to a single PDF file
        --dir     Path to a directory of PDF files (processed recursively)
        --author  Author name to tag all chunks from this run
        --list    List all indexed sources with chunk counts
        --clear   Remove all chunks from a named source
        --test    Verify the Ollama connection and the embedding endpoint
        --help    Show this message
      HELP;
      // A heredoc drops the newline before its closing marker; emit it so the
      // shell prompt does not end up glued to the last help line.
      echo "\n";
      exit(0);
  }
  // ── Test mode ─────────────────────────────────────────────────────────────────
  // Runs before the database connection is opened: this check only talks to
  // Ollama, so it must stay usable while the database is unavailable.
  if (isset($opts['test'])) {
      echo "Testing Ollama connection at " . OLLAMA_HOST . " ...\n\n";

      // 1. List available models
      $ch = curl_init(OLLAMA_HOST . '/api/tags');
      curl_setopt_array($ch, [CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 5]);
      $resp = curl_exec($ch);
      $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
      $err = curl_error($ch);
      curl_close($ch);

      if ($err || $code !== 200) {
          echo "FAIL: Cannot reach Ollama — " . ($err ?: "HTTP $code") . "\n";
          exit(1);
      }

      // Accept only the expected {"models": [...]} shape; anything else is
      // treated as an empty model list instead of being handed to array_column().
      $models = json_decode((string) $resp, true);
      $modelList = (is_array($models) && is_array($models['models'] ?? null)) ? $models['models'] : [];
      $names = array_column($modelList, 'name');

      echo "OK: Ollama reachable. Models installed:\n";
      foreach ($names as $name) {
          echo " - $name\n";
      }

      // Prefix match, so a tagged name such as "<model>:latest" counts as installed.
      $embedFound = false;
      foreach ($names as $n) {
          if (str_starts_with($n, EMBED_MODEL)) {
              $embedFound = true;
              break;
          }
      }
      if (!$embedFound) {
          echo "\nWARNING: '" . EMBED_MODEL . "' not found in model list.\n";
          echo "Run on your Ollama server: ollama pull " . EMBED_MODEL . "\n\n";
      }

      // 2. Test embedding
      echo "\nTesting embedding endpoint ...\n";
      [$embedding, $apiUsed, $rawResp, $httpCode] = getEmbeddingDebug("soil calcium deficiency test sentence");
      echo "HTTP code: $httpCode\n";
      echo "API used: $apiUsed\n";

      if ($embedding !== null) {
          echo "OK: Got " . count($embedding) . "-dimensional embedding vector.\n";
          echo "Sample: [" . implode(', ', array_map(fn($v) => round($v, 4), array_slice($embedding, 0, 5))) . " ...]\n";
      } else {
          echo "FAIL: No embedding returned.\n";
          echo "Raw response: $rawResp\n";
          echo "\nPossible fixes:\n";
          echo " 1. Run: ollama pull " . EMBED_MODEL . "\n";
          echo " 2. Check Ollama version: ollama --version (need >= 0.1.20)\n";
          // Print the host that was actually tested, not a fixed address.
          echo " 3. Verify host is reachable: curl " . OLLAMA_HOST . "/api/tags\n";
      }

      // Non-zero status on failure, matching the reachability check above.
      exit($embedding !== null ? 0 : 1);
  }

  // Every remaining mode reads or writes knowledge_chunks.
  // getDBConnection() is not defined in this file; presumably it comes from
  // config/database.php, which is required during bootstrap.
  $pdo = getDBConnection();
  // ── List mode ────────────────────────────────────────────────────────────────
  // Prints one row per (source, author) pair with its chunk count and the most
  // recent created_at value, then exits.
  if (isset($opts['list'])) {
      $sources = $pdo->query(
          "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at
  FROM knowledge_chunks GROUP BY source, author ORDER BY source"
      )->fetchAll(PDO::FETCH_ASSOC);

      if (!$sources) {
          echo "No sources indexed yet.\n";
          exit(0);
      }

      // Header row followed by a 100-character rule.
      echo sprintf("%-55s %-25s %6s %s\n", 'Source', 'Author', 'Chunks', 'Indexed');
      echo str_repeat('-', 100), "\n";

      foreach ($sources as $entry) {
          // Truncate one character short of the column width so a separator remains.
          $sourceCol = substr($entry['source'], 0, 54);
          $authorCol = substr($entry['author'], 0, 24);
          echo sprintf("%-55s %-25s %6d %s\n", $sourceCol, $authorCol, $entry['chunks'], $entry['indexed_at']);
      }
      exit(0);
  }
  // ── Clear mode ───────────────────────────────────────────────────────────────
  // Deletes every chunk whose source equals the given title, reports how many
  // rows were removed, then exits.
  if (!empty($opts['clear'])) {
      $title = $opts['clear'];

      // A single DELETE is issued and its affected-row count reported, so the
      // number printed is exactly what was removed (no separate COUNT query
      // whose result could be stale by the time the DELETE runs).
      $del = $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?');
      $del->execute([$title]);
      $count = $del->rowCount();

      if ($count === 0) {
          echo "No chunks found for source: $title\n";
      } else {
          echo "Deleted $count chunks for: $title\n";
      }
      exit(0);
  }
  // ── Collect PDF files ────────────────────────────────────────────────────────
  // Builds $files from --file and/or --dir. Failures are written to STDERR and
  // end the script with status 1 so calling scripts can detect them.
  $files = [];
  $author = trim($opts['author'] ?? 'Unknown');
  if ($author === '') {
      // An empty or whitespace-only --author gets the same default as a missing one.
      $author = 'Unknown';
  }

  if (!empty($opts['file'])) {
      $path = $opts['file'];
      if (!is_file($path)) {
          fwrite(STDERR, "File not found: $path\n");
          exit(1);
      }
      $files[] = $path;
  }

  if (!empty($opts['dir'])) {
      $dir = rtrim($opts['dir'], '/\\');
      if ($dir === '') {
          // The argument consisted only of separators (e.g. "/"); keep it as given.
          $dir = $opts['dir'];
      }
      if (!is_dir($dir)) {
          fwrite(STDERR, "Directory not found: $dir\n");
          exit(1);
      }

      // SKIP_DOTS drops the '.' and '..' entries; CATCH_GET_CHILD lets the walk
      // continue past a subdirectory that cannot be opened.
      $it = new RecursiveIteratorIterator(
          new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS),
          RecursiveIteratorIterator::LEAVES_ONLY,
          RecursiveIteratorIterator::CATCH_GET_CHILD
      );

      $found = [];
      foreach ($it as $f) {
          if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') {
              $found[] = $f->getPathname();
          }
      }

      // Directory iteration order is filesystem-dependent; sort for repeatable runs.
      sort($found, SORT_STRING);
      $files = array_merge($files, $found);

      if (!$files) {
          fwrite(STDERR, "No PDF files found in: $dir\n");
          exit(1);
      }
  }

  echo "Found " . count($files) . " PDF file(s) to ingest.\n\n";
  // ── Process each file ────────────────────────────────────────────────────────
  // For every collected PDF: skip it if its source name is already present in
  // knowledge_chunks, otherwise feed the cleaned page text through a sliding
  // window of CHUNK_WORDS words (OVERLAP_WORDS shared between neighbours),
  // embed each window and insert it.
  $parser = new Parser();

  // Words the window advances per chunk. A step below 1 would never shrink the
  // buffer and the flush loop would not terminate.
  $step = CHUNK_WORDS - OVERLAP_WORDS;
  if ($step < 1) {
      fwrite(STDERR, "OVERLAP_WORDS must be smaller than CHUNK_WORDS.\n");
      exit(1);
  }

  // Prepared once and re-executed for every file.
  $existsStmt = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
  $insertStmt = $pdo->prepare(
      'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding)
       VALUES (?, ?, ?, ?, ?, ?)'
  );

  foreach ($files as $filePath) {
      // The source name is the file name without directory or extension.
      $source = pathinfo($filePath, PATHINFO_FILENAME);
      echo "Processing: $source\n";

      // Check if already indexed
      $existsStmt->execute([$source]);
      $alreadyIndexed = (int) $existsStmt->fetchColumn() > 0;
      $existsStmt->closeCursor();
      if ($alreadyIndexed) {
          echo " Already indexed — skipping. Use --clear=\"$source\" to re-index.\n\n";
          continue;
      }

      try {
          $pdf = $parser->parseFile($filePath);
          $pages = $pdf->getPages();
      } catch (Exception $e) {
          echo " ERROR parsing PDF: " . $e->getMessage() . "\n\n";
          continue;
      }
      echo " Pages: " . count($pages) . "\n";

      $chunkIndex = 0;      // index of the next chunk to store == chunks stored so far
      $wordBuffer = [];     // words not yet flushed, plus the overlap carried forward
      $bufferPages = [];    // 1-based page number of each word in $wordBuffer
      $windowSlid = false;  // true once the buffer starts with already-stored overlap

      // Embeds and stores one chunk. Progress is reported only after a
      // successful insert, so a failed embedding cannot repeat a progress line.
      $storeChunk = static function (array $words, int $page) use ($insertStmt, $source, $author, &$chunkIndex): void {
          $chunkText = implode(' ', $words);
          if (strlen(trim($chunkText)) <= 50) {
              return; // too little text to be worth embedding
          }
          $embedding = getEmbedding($chunkText);
          if ($embedding === null) {
              echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
              return;
          }
          $insertStmt->execute([
              $source,
              $author,
              $page,
              $chunkIndex,
              $chunkText,
              json_encode($embedding),
          ]);
          $chunkIndex++;
          if ($chunkIndex % 20 === 0) {
              echo " ...{$chunkIndex} chunks embedded\n";
          }
      };

      foreach ($pages as $pageNum => $page) {
          try {
              $pageText = cleanText($page->getText());
          } catch (Exception $e) {
              // One unreadable page should not abort the whole run.
              echo " WARNING: could not read page " . ($pageNum + 1) . ": " . $e->getMessage() . "\n";
              continue;
          }
          if (strlen($pageText) < 50) {
              continue; // skip blank/image-only pages
          }

          // Split on runs of spaces so no empty "words" enter the buffer.
          $words = preg_split('/ +/', $pageText, -1, PREG_SPLIT_NO_EMPTY) ?: [];
          foreach ($words as $word) {
              $wordBuffer[] = $word;
              $bufferPages[] = $pageNum + 1;
          }

          // Flush when buffer reaches chunk size
          while (count($wordBuffer) >= CHUNK_WORDS) {
              // A chunk is attributed to the page of its first word.
              $storeChunk(array_slice($wordBuffer, 0, CHUNK_WORDS), $bufferPages[0]);

              // Slide window with overlap
              $wordBuffer = array_slice($wordBuffer, $step);
              $bufferPages = array_slice($bufferPages, $step);
              $windowSlid = true;
          }
      }

      // Flush remaining words as final chunk. After a slide the buffer begins
      // with OVERLAP_WORDS words that the previous chunk already contains, so the
      // tail is stored only when it also holds words no chunk has covered yet.
      $freshWords = count($wordBuffer) - ($windowSlid ? OVERLAP_WORDS : 0);
      if (count($wordBuffer) > 30 && $freshWords > 0) {
          $storeChunk($wordBuffer, $bufferPages[0] ?? 0);
      }

      echo " Done: $chunkIndex chunks stored.\n\n";
  }

  echo "Ingestion complete.\n";
  exit(0);
  249. // ── Helpers ──────────────────────────────────────────────────────────────────
  /**
   * Normalise text extracted from a PDF page.
   *
   * Steps, in order:
   *  1. Replace typographic ligatures and common typographic punctuation with
   *     plain equivalents.
   *  2. Remove every character outside tab, LF, CR, printable ASCII and
   *     U+00A0–U+00FF.
   *  3. Collapse each whitespace run into a single space and trim the ends.
   *
   * Characters are removed before whitespace is collapsed, so deleting a symbol
   * that stood between two spaces cannot leave a double space behind.
   *
   * @param string $text Raw page text.
   * @return string Cleaned single-line text; may be empty.
   */
  function cleanText(string $text): string
  {
      // Keys are written as code-point escapes so the map still works if this
      // file's bytes are ever Unicode-normalised by an editor or a copy/paste.
      $replacements = [
          // Ligatures
          "\u{FB00}" => 'ff',
          "\u{FB01}" => 'fi',
          "\u{FB02}" => 'fl',
          "\u{FB03}" => 'ffi',
          "\u{FB04}" => 'ffl',
          "\u{FB05}" => 'st',
          "\u{FB06}" => 'st',
          // Typographic punctuation that the filter below would otherwise delete
          "\u{2018}" => "'",
          "\u{2019}" => "'",
          "\u{201C}" => '"',
          "\u{201D}" => '"',
          "\u{2013}" => '-',
          "\u{2014}" => '-',
          "\u{2026}" => '...',
      ];
      $text = strtr($text, $replacements);

      // With /u the \xA0-\xFF range means code points U+00A0–U+00FF.
      $printable = preg_replace('/[^\x09\x0A\x0D\x20-\x7E\xA0-\xFF]+/u', '', $text);
      if ($printable === null) {
          // preg_replace() returns null when the subject is not valid UTF-8.
          // Fall back to a byte-wise filter that keeps ASCII only, rather than
          // losing the whole page.
          $printable = (string) preg_replace('/[^\x09\x0A\x0D\x20-\x7E]+/', '', $text);
      }

      // $printable is valid UTF-8 (or pure ASCII) at this point, so /u is safe.
      $collapsed = preg_replace('/\s+/u', ' ', $printable);

      return trim($collapsed ?? $printable);
  }
  /**
   * Embed a piece of text with Ollama.
   *
   * Thin wrapper around getEmbeddingDebug(): that function tries the
   * /api/embed endpoint first and the legacy /api/embeddings endpoint second;
   * this one keeps only the vector and drops the diagnostic fields.
   *
   * @param string $text Text to embed.
   * @return array|null The embedding vector, or null when no endpoint returned one.
   */
  function getEmbedding(string $text): ?array
  {
      // Element 0 of the result tuple is the embedding (or null on failure).
      return getEmbeddingDebug($text)[0];
  }
  /**
   * Embed text with Ollama and return diagnostics alongside the vector.
   *
   * Endpoints are tried in order:
   *  1. POST /api/embed       {"model", "input"}  → {"embeddings": [[...]]}
   *  2. POST /api/embeddings  {"model", "prompt"} → {"embedding": [...]}
   * The first one that answers HTTP 200 with a non-empty vector wins.
   *
   * @param string $text Text to embed.
   * @return array [embedding|null, apiUsed, rawResponse, httpCode]. On failure
   *               apiUsed is 'both failed'; rawResponse and httpCode come from
   *               the most recent attempt that produced one. When a request got
   *               no response body at all, rawResponse carries the cURL error
   *               message instead.
   */
  function getEmbeddingDebug(string $text): array
  {
      // [path, request key for the text, label reported to the caller, response extractor]
      $endpoints = [
          ['/api/embed', 'input', '/api/embed (new)', static fn (array $d) => $d['embeddings'][0] ?? null],
          ['/api/embeddings', 'prompt', '/api/embeddings (legacy)', static fn (array $d) => $d['embedding'] ?? null],
      ];

      $lastResp = '';
      $lastCode = 0;

      foreach ($endpoints as [$path, $textKey, $label, $extract]) {
          [$body, $code, $error] = ollamaPostJson($path, ['model' => EMBED_MODEL, $textKey => $text]);

          if ($body !== '' && $code === 200) {
              $data = json_decode($body, true);
              $vector = is_array($data) ? $extract($data) : null;
              if (is_array($vector) && count($vector) > 0) {
                  return [$vector, $label, $body, $code];
              }
          }

          // Keep the latest non-empty diagnostics; a later attempt overrides an
          // earlier one only when it actually produced something.
          if ($body !== '') {
              $lastResp = $body;
          } elseif ($error !== '') {
              $lastResp = "cURL error: $error";
          }
          if ($code !== 0) {
              $lastCode = $code;
          }
      }

      return [null, 'both failed', $lastResp, $lastCode];
  }

  /**
   * POST a JSON document to the Ollama server at OLLAMA_HOST.
   *
   * @param string $path    Request path beginning with '/', e.g. '/api/embed'.
   * @param array  $payload Data to send as the JSON request body.
   * @return array [body, httpCode, curlError]: body is '' when nothing was
   *               received, httpCode is 0 when no HTTP response arrived,
   *               curlError is '' when the transfer itself succeeded.
   */
  function ollamaPostJson(string $path, array $payload): array
  {
      // Substitute invalid UTF-8 instead of letting json_encode() return false.
      $json = json_encode($payload, JSON_INVALID_UTF8_SUBSTITUTE);
      if ($json === false) {
          return ['', 0, 'could not encode request: ' . json_last_error_msg()];
      }

      $ch = curl_init(OLLAMA_HOST . $path);
      if ($ch === false) {
          return ['', 0, 'curl_init() failed'];
      }
      curl_setopt_array($ch, [
          CURLOPT_POST => true,
          CURLOPT_POSTFIELDS => $json,
          CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
          CURLOPT_RETURNTRANSFER => true,
          CURLOPT_TIMEOUT => 30,
          CURLOPT_CONNECTTIMEOUT => 5,
      ]);

      $body = curl_exec($ch);
      $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
      $error = curl_error($ch);
      curl_close($ch);

      // curl_exec() returns false on transport failure; normalise that to ''.
      return [is_string($body) ? $body : '', $code, $error];
  }