ingest_knowledge.php 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  1. <?php
  2. /**
  3. * tools/ingest_knowledge.php
  4. *
  5. * CLI script: ingests soil science PDF and EPUB books into the knowledge_chunks table.
  6. * Text is extracted page by page (per spine section for EPUB), split into overlapping word chunks that may span pages, embedded via Ollama, and stored.
  7. *
  8. * Usage:
  9. * php tools/ingest_knowledge.php --file="path/to/book.pdf" --author="William A. Albrecht"
  10. * php tools/ingest_knowledge.php --dir="path/to/books/" --author="Various"
  11. * php tools/ingest_knowledge.php --list (show all indexed sources)
  12. * php tools/ingest_knowledge.php --clear="Book Title" (remove a source)
  13. *
  14. * Requirements:
  15. * composer require smalot/pdfparser
  16. * Ollama running with nomic-embed-text pulled:
  17. * ollama pull nomic-embed-text
  18. *
  19. * The embedding model (nomic-embed-text) produces 768-dimensional vectors.
  20. * Each chunk is ~500 words with an 80-word overlap (see CHUNK_WORDS / OVERLAP_WORDS) to preserve context across boundaries.
  21. */
  // ── CLI only ─────────────────────────────────────────────────────────────────
  // Abort immediately under any non-CLI SAPI (web server, FPM, ...): this tool
  // reads command-line options and writes progress to the terminal.
  PHP_SAPI === 'cli' or exit("This script must be run from the command line.\n");
  26. define('ROOT', dirname(__DIR__));
  27. require ROOT . '/vendor/autoload.php';
  28. require ROOT . '/config/database.php';
  29. use Smalot\PdfParser\Parser;
  // ── Config ───────────────────────────────────────────────────────────────────
  // The Ollama endpoint and the embedding model keep their historical defaults
  // but can be overridden per run through the environment, e.g.:
  //   INGEST_OLLAMA_HOST=http://localhost:11434 php tools/ingest_knowledge.php --test
  // A trailing slash on the host is dropped because endpoint paths are appended
  // as '/api/...'. Switching the model changes the embedding dimensionality, so
  // do not mix models within one knowledge_chunks table.
  define('OLLAMA_HOST', rtrim(getenv('INGEST_OLLAMA_HOST') ?: 'http://192.168.8.73:11434', '/'));
  define('EMBED_MODEL', getenv('INGEST_EMBED_MODEL') ?: 'nomic-embed-text');
  define('CHUNK_WORDS', 500);  // target words per chunk
  define('OVERLAP_WORDS', 80); // words shared by consecutive chunks; must stay below CHUNK_WORDS
  // ── Parse args ───────────────────────────────────────────────────────────────
  $opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help']);

  // An "action" is any option that selects work to do; --author on its own is
  // only a modifier, so it does not count.
  $hasAction = !empty($opts['file'])
      || !empty($opts['dir'])
      || isset($opts['list'])
      || !empty($opts['clear'])
      || isset($opts['test']);

  // Print usage when explicitly requested or when there is nothing to do.
  // Every option accepted by getopt() above is listed under "Options".
  if (isset($opts['help']) || !$hasAction) {
      echo <<<HELP
      Usage:
        php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
        php tools/ingest_knowledge.php --file="book.epub" --author="William A. Albrecht"
        php tools/ingest_knowledge.php --dir="books/" --author="Various"
        php tools/ingest_knowledge.php --list
        php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
        php tools/ingest_knowledge.php --test (verify Ollama connection + embedding)

      Options:
        --file    Path to a single PDF or EPUB file
        --dir     Path to a directory of PDF/EPUB files (processed recursively)
        --author  Author name to tag all chunks from this run
        --list    List all indexed sources with chunk counts
        --clear   Remove all chunks from a named source
        --test    Check that Ollama is reachable and can produce an embedding
        --help    Show this help text

      HELP;
      exit(0);
  }
  // ── Test mode ─────────────────────────────────────────────────────────────────
  // Handled before the database connection is opened: --test only talks to
  // Ollama, so it has to work even when the database is down or not configured.
  // Exit status is 0 when an embedding was produced and 1 on any failure.
  if (isset($opts['test'])) {
      echo "Testing Ollama connection at " . OLLAMA_HOST . " ...\n\n";

      // 1. List available models
      $ch = curl_init(OLLAMA_HOST . '/api/tags');
      curl_setopt_array($ch, [CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 5]);
      $resp = curl_exec($ch);
      $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
      $err  = curl_error($ch);
      curl_close($ch);

      if ($err || $code !== 200) {
          echo "FAIL: Cannot reach Ollama — " . ($err ?: "HTTP $code") . "\n";
          exit(1);
      }

      $models = json_decode($resp, true);
      $names  = array_column($models['models'] ?? [], 'name');
      echo "OK: Ollama reachable. Models installed:\n";
      foreach ($names as $name) {
          echo " - $name\n";
      }

      // Installed names carry a tag suffix (e.g. "model:latest"), hence the prefix match.
      $embedFound = false;
      foreach ($names as $n) {
          if (str_starts_with($n, EMBED_MODEL)) {
              $embedFound = true;
              break;
          }
      }
      if (!$embedFound) {
          echo "\nWARNING: '" . EMBED_MODEL . "' not found in model list.\n";
          echo "Run on your Ollama server: ollama pull " . EMBED_MODEL . "\n\n";
      }

      // 2. Test embedding
      echo "\nTesting embedding endpoint ...\n";
      [$embedding, $apiUsed, $rawResp, $httpCode] = getEmbeddingDebug("soil calcium deficiency test sentence");
      echo "HTTP code: $httpCode\n";
      echo "API used: $apiUsed\n";

      if ($embedding !== null) {
          echo "OK: Got " . count($embedding) . "-dimensional embedding vector.\n";
          echo "Sample: [" . implode(', ', array_map(fn($v) => round($v, 4), array_slice($embedding, 0, 5))) . " ...]\n";
          exit(0);
      }

      echo "FAIL: No embedding returned.\n";
      echo "Raw response: $rawResp\n";
      echo "\nPossible fixes:\n";
      echo " 1. Run: ollama pull " . EMBED_MODEL . "\n";
      echo " 2. Check Ollama version: ollama --version (need >= 0.1.20)\n";
      // Print the host that is actually configured rather than a fixed address.
      echo " 3. Verify host is reachable: curl " . OLLAMA_HOST . "/api/tags\n";
      exit(1);
  }

  // ── Database ─────────────────────────────────────────────────────────────────
  // Every remaining mode (list, clear, ingest) reads or writes knowledge_chunks.
  $pdo = getDBConnection();
  // ── List mode ────────────────────────────────────────────────────────────────
  // Prints one table row per (source, author) pair with its chunk count and the
  // most recent created_at value, then exits.
  if (isset($opts['list'])) {
      $summary = $pdo
          ->query(
              "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at
  FROM knowledge_chunks GROUP BY source, author ORDER BY source"
          )
          ->fetchAll(PDO::FETCH_ASSOC);

      if (!$summary) {
          echo "No sources indexed yet.\n";
          exit(0);
      }

      // Header row followed by a ruler.
      echo sprintf("%-55s %-25s %6s %s\n", 'Source', 'Author', 'Chunks', 'Indexed');
      echo str_repeat('-', 100), "\n";

      foreach ($summary as $entry) {
          // Long names are cut so they cannot push the later columns out of line.
          $sourceCol = substr($entry['source'], 0, 54);
          $authorCol = substr($entry['author'], 0, 24);
          echo sprintf("%-55s %-25s %6d %s\n", $sourceCol, $authorCol, $entry['chunks'], $entry['indexed_at']);
      }
      exit(0);
  }
  // ── Clear mode ───────────────────────────────────────────────────────────────
  // Deletes every chunk whose source equals the given title and reports how many
  // rows matched. Always exits with status 0, including when nothing matched.
  if (!empty($opts['clear'])) {
      $sourceName = $opts['clear'];

      // Count first so the message can say how many chunks were removed.
      $counter = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
      $counter->execute([$sourceName]);
      $matching = (int)$counter->fetchColumn();

      if ($matching !== 0) {
          $remover = $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?');
          $remover->execute([$sourceName]);
          echo "Deleted $matching chunks for: $sourceName\n";
      } else {
          echo "No chunks found for source: $sourceName\n";
      }
      exit(0);
  }
  // ── Collect PDF + EPUB files ──────────────────────────────────────────────────
  // getopt() returns an array when an option is given more than once, so each
  // option is normalised to a list first: several --file / --dir arguments are
  // all honoured instead of raising a TypeError. For --author the last value wins.
  $authorOpt = $opts['author'] ?? 'Unknown';
  $author    = trim((string) (is_array($authorOpt) ? end($authorOpt) : $authorOpt));

  $nonEmpty = static fn ($value): bool => !empty($value);
  $fileArgs = array_filter((array) ($opts['file'] ?? []), $nonEmpty);
  $dirArgs  = array_filter((array) ($opts['dir'] ?? []), $nonEmpty);

  $files = [];

  foreach ($fileArgs as $path) {
      if (!is_file($path)) {
          // Errors go to STDERR with a non-zero status so shell scripts can detect them.
          fwrite(STDERR, "File not found: $path\n");
          exit(1);
      }
      $files[] = $path;
  }

  foreach ($dirArgs as $dirArg) {
      $dir = rtrim($dirArg, '/\\');
      if (!is_dir($dir)) {
          fwrite(STDERR, "Directory not found: $dir\n");
          exit(1);
      }

      $found = [];
      $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));
      foreach ($it as $f) {
          if ($f->isFile() && in_array(strtolower($f->getExtension()), ['pdf', 'epub'], true)) {
              $found[] = $f->getPathname();
          }
      }

      // Directory iteration order depends on the filesystem; sort for repeatable runs.
      sort($found);
      $files = array_merge($files, $found);

      if (!$files) {
          fwrite(STDERR, "No PDF or EPUB files found in: $dir\n");
          exit(1);
      }
  }

  echo "Found " . count($files) . " file(s) to ingest.\n\n";
  // ── Process each file ────────────────────────────────────────────────────────
  // For every collected file: skip it if its source name is already present,
  // extract its text, slide a CHUNK_WORDS-wide window over the concatenated
  // words (advancing by CHUNK_WORDS - OVERLAP_WORDS), embed each window and
  // store it. All rows of one file are written inside a single transaction so
  // an aborted run cannot leave a half-indexed source behind that would then be
  // reported as "Already indexed" on the next attempt.
  $pdfParser = new Parser();

  // The statement is identical for every chunk of every file; prepare it once.
  $insertStmt = $pdo->prepare(
      'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding)
       VALUES (?, ?, ?, ?, ?, ?)'
  );

  // Words the window advances per chunk. Clamped to at least 1 so a bad
  // CHUNK_WORDS / OVERLAP_WORDS combination cannot stall the flush loop.
  $step = max(1, CHUNK_WORDS - OVERLAP_WORDS);

  foreach ($files as $filePath) {
      $source = pathinfo($filePath, PATHINFO_FILENAME);
      $ext    = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
      echo "Processing: $source ($ext)\n";

      // Check if already indexed
      $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
      $chk->execute([$source]);
      if ((int)$chk->fetchColumn() > 0) {
          echo " Already indexed — skipping. Use --clear=\"$source\" to re-index.\n\n";
          continue;
      }

      // Extract pages as array of ['page' => int, 'text' => string]
      try {
          $pages = $ext === 'epub'
              ? extractEpubPages($filePath)
              : extractPdfPages($pdfParser, $filePath);
      } catch (Exception $e) {
          echo " ERROR extracting text: " . $e->getMessage() . "\n\n";
          continue;
      }
      echo " Sections/pages: " . count($pages) . "\n";

      // Embeds one chunk and inserts it; returns false (after printing a
      // warning) when no embedding could be obtained so the caller can skip it.
      $embedAndStore = static function (string $chunkText, int $page, int $index) use ($insertStmt, $source, $author): bool {
          $embedding = getEmbedding($chunkText);
          if ($embedding === null) {
              echo " WARNING: embedding failed for chunk $index — skipping.\n";
              return false;
          }
          $insertStmt->execute([$source, $author, $page, $index, $chunkText, json_encode($embedding)]);
          return true;
      };

      $chunkIndex  = 0;  // index of the next chunk; equals the number stored so far
      $wordBuffer  = []; // words not yet fully consumed by the sliding window
      $bufferPages = []; // page number of each word in $wordBuffer (parallel array)
      $carried     = 0;  // leading buffer words already covered by the previous chunk

      try {
          $pdo->beginTransaction();

          foreach ($pages as ['page' => $pageNum, 'text' => $pageText]) {
              if (strlen($pageText) < 50) {
                  continue;
              }
              foreach (explode(' ', $pageText) as $word) {
                  $wordBuffer[]  = $word;
                  $bufferPages[] = $pageNum;
              }

              // Flush when buffer reaches chunk size
              while (count($wordBuffer) >= CHUNK_WORDS) {
                  $chunkText = implode(' ', array_slice($wordBuffer, 0, CHUNK_WORDS));

                  // A chunk is attributed to the page of its first word.
                  if (strlen(trim($chunkText)) > 50
                      && $embedAndStore($chunkText, $bufferPages[0], $chunkIndex)
                  ) {
                      $chunkIndex++;
                      // Report progress only right after a successful store, so a
                      // failing chunk cannot repeat the same progress line.
                      if ($chunkIndex % 20 === 0) {
                          echo " ...{$chunkIndex} chunks embedded\n";
                      }
                  }

                  // Slide window with overlap
                  $wordBuffer  = array_slice($wordBuffer, $step);
                  $bufferPages = array_slice($bufferPages, $step);
                  $carried     = CHUNK_WORDS - $step;
              }
          }

          // Flush remaining words as final chunk — unless the buffer holds nothing
          // but the overlap tail that was already part of the previous chunk.
          $freshWords = count($wordBuffer) - $carried;
          if ($freshWords > 0 && count($wordBuffer) > 30) {
              $chunkText = implode(' ', $wordBuffer);
              if ($embedAndStore($chunkText, $bufferPages[0] ?? 0, $chunkIndex)) {
                  $chunkIndex++;
              }
          }

          $pdo->commit();
      } catch (Throwable $e) {
          // Undo this file's rows so a later run can retry it from scratch.
          if ($pdo->inTransaction()) {
              $pdo->rollBack();
          }
          echo " ERROR storing chunks (nothing saved for this file): " . $e->getMessage() . "\n\n";
          continue;
      }

      echo " Done: $chunkIndex chunks stored.\n\n";
  }

  echo "Ingestion complete.\n";
  exit(0);
  248. // ── Helpers ──────────────────────────────────────────────────────────────────
  /**
   * Normalise extracted PDF/EPUB text into a single line of plain words.
   *
   * Steps, in order:
   *  1. Expand typographic ligatures and map curly quotes, dashes and the
   *     ellipsis to ASCII, so step 2 does not delete them and fuse or mangle
   *     words ("deficiency", "don't").
   *  2. Drop every character outside tab/LF/CR, printable ASCII and
   *     U+00A0–U+00FF.
   *  3. Collapse each whitespace run to one space and trim. Done last so that
   *     characters removed in step 2 cannot leave double spaces behind.
   *
   * @param string $text Raw extracted text; may contain invalid UTF-8.
   * @return string Cleaned text, possibly empty.
   */
  function cleanText(string $text): string
  {
      // Keys are \u{...} escapes on purpose: literal ligature glyphs in source
      // code are easily normalised to plain letters by editors and copy/paste,
      // which silently turns the map into a no-op.
      $replacements = [
          "\u{FB00}"  => 'ff',
          "\u{FB01}"  => 'fi',
          "\u{FB02}"  => 'fl',
          "\u{FB03}"  => 'ffi',
          "\u{FB04}"  => 'ffl',
          "\u{FB05}"  => 'st',
          "\u{FB06}"  => 'st',
          "\u{017F}t" => 'st',  // long s + t
          "\u{2018}"  => "'",   // curly single quotes
          "\u{2019}"  => "'",
          "\u{201C}"  => '"',   // curly double quotes
          "\u{201D}"  => '"',
          "\u{2013}"  => '-',   // en dash
          "\u{2014}"  => '-',   // em dash
          "\u{2026}"  => '...', // ellipsis
      ];
      $text = strtr($text, $replacements);

      // Remove characters outside the allowed set. With the /u modifier
      // preg_replace() returns null for invalid UTF-8; in that case fall back
      // to a byte-wise filter that keeps printable ASCII instead of losing the
      // whole text.
      $filtered = preg_replace('/[^\x09\x0A\x0D\x20-\x7E\xA0-\xFF]/u', '', $text);
      if ($filtered === null) {
          $filtered = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]/', '', $text) ?? '';
      }

      // Collapse multiple spaces / newlines into single space
      $collapsed = preg_replace('/\s+/', ' ', $filtered) ?? $filtered;

      return trim($collapsed);
  }
  /**
   * Read a PDF and return its text page by page.
   *
   * Each page's text is passed through cleanText(); pages whose cleaned text is
   * shorter than 50 bytes are left out. Page numbers are the parser's page keys
   * plus one.
   *
   * @return array List of ['page' => int, 'text' => string] entries.
   */
  function extractPdfPages(Parser $parser, string $filePath): array
  {
      $document = $parser->parseFile($filePath);

      $pages = [];
      foreach ($document->getPages() as $index => $pdfPage) {
          $cleaned = cleanText($pdfPage->getText());
          if (strlen($cleaned) < 50) {
              continue;
          }
          $pages[] = ['page' => $index + 1, 'text' => $cleaned];
      }

      return $pages;
  }
  /**
   * Extract chapters/sections from an EPUB as pages.
   *
   * EPUBs are ZIP archives containing XHTML spine items. We:
   *  1. Parse META-INF/container.xml to find the OPF file
   *  2. Parse the OPF manifest + spine for reading order
   *  3. Reduce each spine XHTML file to plain text via cleanText()
   *
   * The "page" number is the item's 1-based position in the spine. Items that
   * cannot be read or whose cleaned text is shorter than 50 bytes are skipped.
   *
   * @param string $filePath Path to the .epub file.
   * @return array List of ['page' => int, 'text' => string] entries.
   * @throws RuntimeException If the archive cannot be opened, lacks
   *                          container.xml or the OPF file, or yields no text.
   */
  function extractEpubPages(string $filePath): array
  {
      $zip = new ZipArchive();
      if ($zip->open($filePath) !== true) {
          throw new RuntimeException("Cannot open EPUB file: $filePath");
      }

      // Lenient XML loader: parse problems are collected by libxml instead of
      // being emitted as warnings, and an empty document is returned for empty
      // input (DOMDocument::loadXML('') would throw a ValueError on PHP 8).
      $loadXml = static function (string $xml): DOMDocument {
          $dom = new DOMDocument();
          if ($xml !== '') {
              $previous = libxml_use_internal_errors(true);
              $dom->loadXML($xml);
              libxml_clear_errors();
              libxml_use_internal_errors($previous);
          }
          return $dom;
      };

      // try/finally guarantees the archive is closed on every exit path.
      try {
          // 1. Locate OPF via container.xml
          $containerXml = $zip->getFromName('META-INF/container.xml');
          if ($containerXml === false) {
              throw new RuntimeException("No META-INF/container.xml found — may not be a valid EPUB");
          }
          $xp = new DOMXPath($loadXml($containerXml));
          $xp->registerNamespace('c', 'urn:oasis:names:tc:opendocument:xmlns:container');
          $nodes = $xp->query('//c:rootfile/@full-path');
          if (!$nodes || $nodes->length === 0) {
              throw new RuntimeException("Cannot find OPF path in container.xml");
          }
          $opfPath = $nodes->item(0)->nodeValue;
          $opfDir  = dirname($opfPath);
          if ($opfDir === '.') {
              $opfDir = '';
          }

          // 2. Parse OPF for spine order
          $opfXml = $zip->getFromName($opfPath);
          if ($opfXml === false) {
              throw new RuntimeException("Cannot read OPF file: $opfPath");
          }
          $opfXp = new DOMXPath($loadXml($opfXml));

          // Build manifest: id -> href
          $manifest = [];
          foreach ($opfXp->query('//*[local-name()="item"]') as $item) {
              $manifest[$item->getAttribute('id')] = $item->getAttribute('href');
          }

          // Spine: ordered list of idrefs
          $spineHrefs = [];
          foreach ($opfXp->query('//*[local-name()="itemref"]') as $ref) {
              $idref = $ref->getAttribute('idref');
              if (isset($manifest[$idref])) {
                  $spineHrefs[] = $manifest[$idref];
              }
          }

          // 3. Extract text from each spine item
          $result = [];
          foreach ($spineHrefs as $i => $href) {
              // Cut the fragment before decoding so an encoded '%23' in a file
              // name is not mistaken for a fragment separator. rawurldecode()
              // leaves '+' alone, whereas urldecode() would turn it into a space.
              $href = rawurldecode(explode('#', $href, 2)[0]);

              // Resolve the href against the OPF directory, folding '.' and '..'
              // segments because ZIP entry names are matched literally.
              $segments = [];
              foreach (explode('/', ($opfDir !== '' ? $opfDir . '/' : '') . $href) as $segment) {
                  if ($segment === '' || $segment === '.') {
                      continue;
                  }
                  if ($segment === '..') {
                      array_pop($segments);
                      continue;
                  }
                  $segments[] = $segment;
              }
              $fullPath = implode('/', $segments);

              $html = $zip->getFromName($fullPath);
              if ($html === false) {
                  continue;
              }

              // strip_tags() keeps the text inside <head>, <script> and <style>;
              // remove those elements entirely (self-closing forms are left alone).
              $html = preg_replace('#<(head|script|style)\b[^>]*(?<!/)>.*?</\1\s*>#is', ' ', $html) ?? $html;

              // strip_tags() also joins the text of adjacent elements, so
              // "<p>one</p><p>two</p>" would become "onetwo". Put a space in
              // front of block-level tags to keep words apart.
              $html = preg_replace(
                  '#</?(?:p|div|br|hr|li|ul|ol|dl|dt|dd|table|tr|td|th|h[1-6]|blockquote|section|article|aside|header|footer|figure|figcaption|pre)\b[^>]*>#i',
                  ' $0',
                  $html
              ) ?? $html;

              // Strip tags and decode HTML entities
              $text = strip_tags($html);
              $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
              $text = cleanText($text);

              if (strlen($text) >= 50) {
                  $result[] = ['page' => $i + 1, 'text' => $text];
              }
          }

          if (empty($result)) {
              throw new RuntimeException("No readable text found in EPUB — may be image-only or DRM-protected");
          }
          return $result;
      } finally {
          $zip->close();
      }
  }
  /**
   * Embed a piece of text through Ollama.
   *
   * Thin convenience wrapper around getEmbeddingDebug(): it keeps only the
   * first element of that function's result (the vector) and drops the
   * diagnostic fields.
   *
   * @return array|null The embedding vector, or null when none was obtained.
   */
  function getEmbedding(string $text): ?array
  {
      return getEmbeddingDebug($text)[0];
  }
  /**
   * Embed text through Ollama and return diagnostics alongside the vector.
   *
   * Two endpoints are tried in order:
   *  - POST /api/embed      {"model":..., "input":...}  -> {"embeddings": [[...]]}
   *  - POST /api/embeddings {"model":..., "prompt":...} -> {"embedding": [...]}
   * The first one that answers HTTP 200 with a non-empty vector wins.
   *
   * @return array [embedding|null, apiUsed, rawResponse, httpCode]. On failure
   *               the embedding is null, apiUsed is 'both failed', and the
   *               response/code come from the latest attempt that produced one.
   */
  function getEmbeddingDebug(string $text): array
  {
      // [endpoint, request key carrying the text, label for diagnostics, vector picker]
      $attempts = [
          ['/api/embed', 'input', '/api/embed (new)', static fn (array $d) => $d['embeddings'][0] ?? null],
          ['/api/embeddings', 'prompt', '/api/embeddings (legacy)', static fn (array $d) => $d['embedding'] ?? null],
      ];

      $lastResp = '';
      $lastCode = 0;

      foreach ($attempts as [$endpoint, $textKey, $label, $pickVector]) {
          [$response, $httpCode] = ollamaPostJson($endpoint, ['model' => EMBED_MODEL, $textKey => $text]);

          if ($response && $httpCode === 200) {
              $data   = json_decode($response, true);
              $vector = is_array($data) ? $pickVector($data) : null;
              if (is_array($vector) && count($vector) > 0) {
                  return [$vector, $label, $response, $httpCode];
              }
          }

          // Keep the latest non-empty response and non-zero status for diagnostics.
          if ($response) {
              $lastResp = $response;
          }
          if ($httpCode) {
              $lastCode = $httpCode;
          }
      }

      return [null, 'both failed', $lastResp, $lastCode];
  }

  /**
   * POST a JSON payload to an Ollama endpoint.
   *
   * @param string $endpoint Path appended to OLLAMA_HOST, e.g. '/api/embed'.
   * @param array  $payload  Data to send as the JSON request body.
   * @return array [responseBody|false, httpCode]; [false, 0] when the request
   *               could not even be set up.
   */
  function ollamaPostJson(string $endpoint, array $payload): array
  {
      // Substitute invalid UTF-8 bytes instead of letting json_encode() fail
      // and return false, which would send an empty request body.
      $body = json_encode($payload, JSON_INVALID_UTF8_SUBSTITUTE);
      $ch   = curl_init(OLLAMA_HOST . $endpoint);
      if ($body === false || $ch === false) {
          return [false, 0];
      }

      curl_setopt_array($ch, [
          CURLOPT_POST           => true,
          CURLOPT_POSTFIELDS     => $body,
          CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
          CURLOPT_RETURNTRANSFER => true,
          CURLOPT_TIMEOUT        => 30,
          CURLOPT_CONNECTTIMEOUT => 5,
      ]);
      $response = curl_exec($ch);
      $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
      curl_close($ch);

      return [$response, $httpCode];
  }