|
|
@@ -44,15 +44,16 @@ $opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help'
|
|
|
if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']) && !isset($opts['test']))) {
|
|
|
echo <<<HELP
|
|
|
Usage:
|
|
|
- php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
|
|
|
- php tools/ingest_knowledge.php --dir="books/" --author="Various"
|
|
|
+ php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
|
|
|
+ php tools/ingest_knowledge.php --file="book.epub" --author="William A. Albrecht"
|
|
|
+ php tools/ingest_knowledge.php --dir="books/" --author="Various"
|
|
|
php tools/ingest_knowledge.php --list
|
|
|
php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
|
|
|
- php tools/ingest_knowledge.php --test (verify Ollama connection + embedding)
|
|
|
+ php tools/ingest_knowledge.php --test (verify Ollama connection + embedding)
|
|
|
|
|
|
Options:
|
|
|
- --file Path to a single PDF file
|
|
|
- --dir Path to a directory of PDF files (processed recursively)
|
|
|
+ --file Path to a single PDF or EPUB file
|
|
|
+ --dir Path to a directory of PDF/EPUB files (processed recursively)
|
|
|
--author Author name to tag all chunks from this run
|
|
|
--list List all indexed sources with chunk counts
|
|
|
--clear Remove all chunks from a named source
|
|
|
@@ -153,7 +154,7 @@ if (!empty($opts['clear'])) {
|
|
|
exit(0);
|
|
|
}
|
|
|
|
|
|
-// ── Collect PDF files ────────────────────────────────────────────────────────
|
|
|
+// ── Collect PDF + EPUB files ──────────────────────────────────────────────────
|
|
|
$files = [];
|
|
|
$author = trim($opts['author'] ?? 'Unknown');
|
|
|
|
|
|
@@ -172,23 +173,24 @@ if (!empty($opts['dir'])) {
|
|
|
}
|
|
|
$it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));
|
|
|
foreach ($it as $f) {
|
|
|
- if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') {
|
|
|
+ if ($f->isFile() && in_array(strtolower($f->getExtension()), ['pdf', 'epub'], true)) {
|
|
|
$files[] = $f->getPathname();
|
|
|
}
|
|
|
}
|
|
|
if (!$files) {
|
|
|
- die("No PDF files found in: $dir\n");
|
|
|
+ die("No PDF or EPUB files found in: $dir\n");
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-echo "Found " . count($files) . " PDF file(s) to ingest.\n\n";
|
|
|
+echo "Found " . count($files) . " file(s) to ingest.\n\n";
|
|
|
|
|
|
// ── Process each file ────────────────────────────────────────────────────────
|
|
|
-$parser = new Parser();
|
|
|
+$pdfParser = new Parser();
|
|
|
|
|
|
foreach ($files as $filePath) {
|
|
|
$source = pathinfo($filePath, PATHINFO_FILENAME);
|
|
|
- echo "Processing: $source\n";
|
|
|
+ $ext = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
|
|
|
+ echo "Processing: $source ($ext)\n";
|
|
|
|
|
|
// Check if already indexed
|
|
|
$chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
|
|
|
@@ -198,44 +200,44 @@ foreach ($files as $filePath) {
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
+ // Extract pages as array of ['page' => int, 'text' => string]
|
|
|
try {
|
|
|
- $pdf = $parser->parseFile($filePath);
|
|
|
- $pages = $pdf->getPages();
|
|
|
+ if ($ext === 'epub') {
|
|
|
+ $pages = extractEpubPages($filePath);
|
|
|
+ } else {
|
|
|
+ $pages = extractPdfPages($pdfParser, $filePath);
|
|
|
+ }
|
|
|
} catch (Exception $e) {
|
|
|
- echo " ERROR parsing PDF: " . $e->getMessage() . "\n\n";
|
|
|
+ echo " ERROR extracting text: " . $e->getMessage() . "\n\n";
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
- echo " Pages: " . count($pages) . "\n";
|
|
|
-
|
|
|
- $totalChunks = 0;
|
|
|
- $totalTokens = 0;
|
|
|
- $pageBuffer = []; // accumulate pages into a rolling word buffer
|
|
|
+ echo " Sections/pages: " . count($pages) . "\n";
|
|
|
|
|
|
$insertStmt = $pdo->prepare(
|
|
|
'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding)
|
|
|
VALUES (?, ?, ?, ?, ?, ?)'
|
|
|
);
|
|
|
|
|
|
+ $totalChunks = 0;
|
|
|
$chunkIndex = 0;
|
|
|
$wordBuffer = [];
|
|
|
- $bufferPages = []; // page numbers corresponding to words in buffer
|
|
|
+ $bufferPages = [];
|
|
|
|
|
|
- foreach ($pages as $pageNum => $page) {
|
|
|
- $pageText = cleanText($page->getText());
|
|
|
- if (strlen($pageText) < 50) continue; // skip blank/image-only pages
|
|
|
+ foreach ($pages as ['page' => $pageNum, 'text' => $pageText]) {
|
|
|
+ if (strlen($pageText) < 50) continue;
|
|
|
|
|
|
$words = explode(' ', $pageText);
|
|
|
foreach ($words as $word) {
|
|
|
$wordBuffer[] = $word;
|
|
|
- $bufferPages[] = $pageNum + 1;
|
|
|
+ $bufferPages[] = $pageNum;
|
|
|
}
|
|
|
|
|
|
// Flush when buffer reaches chunk size
|
|
|
while (count($wordBuffer) >= CHUNK_WORDS) {
|
|
|
- $chunkWords = array_slice($wordBuffer, 0, CHUNK_WORDS);
|
|
|
- $chunkText = implode(' ', $chunkWords);
|
|
|
- $chunkPage = $bufferPages[0];
|
|
|
+ $chunkWords = array_slice($wordBuffer, 0, CHUNK_WORDS);
|
|
|
+ $chunkText = implode(' ', $chunkWords);
|
|
|
+ $chunkPage = $bufferPages[0];
|
|
|
|
|
|
if (strlen(trim($chunkText)) > 50) {
|
|
|
$embedding = getEmbedding($chunkText);
|
|
|
@@ -243,12 +245,8 @@ foreach ($files as $filePath) {
|
|
|
echo " WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
|
|
|
} else {
|
|
|
$insertStmt->execute([
|
|
|
- $source,
|
|
|
- $author,
|
|
|
- $chunkPage,
|
|
|
- $chunkIndex,
|
|
|
- $chunkText,
|
|
|
- json_encode($embedding),
|
|
|
+ $source, $author, $chunkPage, $chunkIndex,
|
|
|
+ $chunkText, json_encode($embedding),
|
|
|
]);
|
|
|
$chunkIndex++;
|
|
|
$totalChunks++;
|
|
|
@@ -309,6 +307,114 @@ function cleanText(string $text): string
|
|
|
return trim($text);
|
|
|
}
|
|
|
|
|
|
/**
 * Read a PDF through the supplied parser and collect its usable pages.
 *
 * Each page's text is passed through cleanText(); pages whose cleaned text is
 * shorter than 50 bytes are left out of the result. Page numbers are 1-based
 * (the parser's page key plus one).
 *
 * @param Parser $parser   Parser instance exposing parseFile().
 * @param string $filePath Path of the PDF to read.
 *
 * @return array List of ['page' => int, 'text' => string] entries.
 */
function extractPdfPages(Parser $parser, string $filePath): array
{
    $collected = [];
    $document  = $parser->parseFile($filePath);

    foreach ($document->getPages() as $index => $pdfPage) {
        $cleaned = cleanText($pdfPage->getText());

        // Too little text to be worth indexing.
        if (strlen($cleaned) < 50) {
            continue;
        }

        $collected[] = [
            'page' => $index + 1,
            'text' => $cleaned,
        ];
    }

    return $collected;
}
|
|
|
+
|
|
|
/**
 * Extract chapters/sections from an EPUB as pages.
 *
 * EPUBs are ZIP archives containing XHTML spine items. We:
 *   1. Parse META-INF/container.xml to find the OPF file
 *   2. Parse the OPF manifest + spine for reading order
 *   3. Reduce each spine XHTML file to plain text
 *
 * The "page" number of an entry is the 1-based position of the item in the
 * spine. Spine items that are missing from the archive, or whose cleaned text
 * is shorter than 50 bytes, are skipped.
 *
 * @param string $filePath Path of the EPUB file to read.
 *
 * @return array List of ['page' => int, 'text' => string] entries.
 *
 * @throws RuntimeException If the archive cannot be opened, container.xml or
 *                          the OPF is missing/unparseable, or no spine item
 *                          yields readable text.
 */
function extractEpubPages(string $filePath): array
{
    $zip = new ZipArchive();
    if ($zip->open($filePath) !== true) {
        throw new RuntimeException("Cannot open EPUB file: $filePath");
    }

    $result = [];

    // try/finally guarantees the archive is closed on every exit path.
    try {
        // 1. Locate OPF via container.xml
        $containerXml = $zip->getFromName('META-INF/container.xml');
        // An empty string is rejected too: DOMDocument::loadXML('') raises a
        // ValueError on PHP 8, which callers catching Exception would miss.
        if ($containerXml === false || $containerXml === '') {
            throw new RuntimeException("No META-INF/container.xml found — may not be a valid EPUB");
        }

        $dom = new DOMDocument();
        // A parse failure leaves an empty document, which the XPath check
        // below reports as a missing OPF path.
        @$dom->loadXML($containerXml);
        $xp = new DOMXPath($dom);
        $xp->registerNamespace('c', 'urn:oasis:names:tc:opendocument:xmlns:container');
        $nodes = $xp->query('//c:rootfile/@full-path');
        if (!$nodes || $nodes->length === 0) {
            throw new RuntimeException("Cannot find OPF path in container.xml");
        }
        $opfPath = $nodes->item(0)->nodeValue;
        $opfDir  = dirname($opfPath);
        if ($opfDir === '.') {
            $opfDir = '';
        }

        // 2. Parse OPF for spine order
        $opfXml = $zip->getFromName($opfPath);
        if ($opfXml === false || $opfXml === '') {
            throw new RuntimeException("Cannot read OPF file: $opfPath");
        }

        $opfDom = new DOMDocument();
        // Report a broken OPF as such instead of letting it surface later as
        // "No readable text found".
        if (!@$opfDom->loadXML($opfXml)) {
            throw new RuntimeException("Cannot parse OPF file: $opfPath");
        }
        $opfXp = new DOMXPath($opfDom);

        // Build manifest: id -> href
        $manifest = [];
        foreach ($opfXp->query('//*[local-name()="item"]') as $item) {
            $manifest[$item->getAttribute('id')] = $item->getAttribute('href');
        }

        // Spine: ordered list of idrefs
        $spineHrefs = [];
        foreach ($opfXp->query('//*[local-name()="itemref"]') as $ref) {
            $idref = $ref->getAttribute('idref');
            if (isset($manifest[$idref])) {
                $spineHrefs[] = $manifest[$idref];
            }
        }

        // 3. Extract text from each spine item
        foreach ($spineHrefs as $i => $href) {
            // Cut the fragment BEFORE decoding, so an encoded "%23" inside a
            // file name is not mistaken for a fragment separator.
            $hashPos = strpos($href, '#');
            if ($hashPos !== false) {
                $href = substr($href, 0, $hashPos);
            }
            // rawurldecode(), not urldecode(): in a URL path "+" is a literal
            // plus sign, not an encoded space.
            $href = rawurldecode($href);

            // Resolve the href against the OPF directory and collapse "." and
            // ".." segments; ZIP entry names are stored in normalised form.
            $segments = [];
            $joined   = $opfDir !== '' ? $opfDir . '/' . $href : $href;
            foreach (explode('/', $joined) as $segment) {
                if ($segment === '' || $segment === '.') {
                    continue;
                }
                if ($segment === '..') {
                    array_pop($segments);
                    continue;
                }
                $segments[] = $segment;
            }
            $fullPath = implode('/', $segments);

            $html = $zip->getFromName($fullPath);
            if ($html === false) {
                continue;
            }

            // strip_tags() keeps the text inside <script>/<style>, so drop
            // those elements wholesale or CSS/JS source ends up in the index.
            $html = preg_replace('~<(script|style)\b[^>]*>.*?</\1\s*>~is', ' ', $html) ?? $html;

            // Put a space in front of block-level tags so that
            // "</p><p>" does not glue the neighbouring words together.
            // NOTE(review): relies on cleanText() to tidy the extra
            // whitespace — confirm against its implementation.
            $html = preg_replace(
                '~</?(?:p|div|br|hr|h[1-6]|li|ul|ol|dl|dt|dd|tr|td|th|table|blockquote|pre|section|article)\b[^>]*>~i',
                ' $0',
                $html
            ) ?? $html;

            // Strip tags and decode HTML entities
            $text = strip_tags($html);
            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $text = cleanText($text);

            if (strlen($text) >= 50) {
                $result[] = ['page' => $i + 1, 'text' => $text];
            }
        }
    } finally {
        $zip->close();
    }

    if (empty($result)) {
        throw new RuntimeException("No readable text found in EPUB — may be image-only or DRM-protected");
    }

    return $result;
}
|
|
|
+
|
|
|
/**
|
|
|
* Call Ollama to embed text. Tries the newer /api/embed endpoint first
|
|
|
* (Ollama >= 0.1.26, uses "input" key, returns "embeddings" array),
|