vor 2 Monaten · ae3970d7ca
--- a/books/010146.albrecht.animal.health.pdf
+++ b/books/010146.albrecht.animal.health.pdf
--- a/books/Albrecht
+++ b/books/Albrecht
--- a/books/albrecht-soil-fertility-and-animal-health_vol-II.pdf
+++ b/books/albrecht-soil-fertility-and-animal-health_vol-II.pdf
--- a/books/science-in-agriculture-advanced-methods-for-sustainable-andersen-arden-b-2nd-ed-austin-tx-2000-acres-u-s-a-inc-9780911311358-14b7831ccc09e53fbbe693bf7f94e23d-annas-archive_compress.pdf
+++ b/books/science-in-agriculture-advanced-methods-for-sustainable-andersen-arden-b-2nd-ed-austin-tx-2000-acres-u-s-a-inc-9780911311358-14b7831ccc09e53fbbe693bf7f94e23d-annas-archive_compress.pdf
--- a/books/complete-guide-to-the-sustainable-and-profitable-biological-system-of-farming-acres-gary-f-zimmer_compress.pdf
+++ b/books/complete-guide-to-the-sustainable-and-profitable-biological-system-of-farming-acres-gary-f-zimmer_compress.pdf
--- a/dashboard/crop-analysis/updatecomment.php
+++ b/dashboard/crop-analysis/updatecomment.php
@@ -13,8 +13,8 @@ if (session_status() === PHP_SESSION_NONE) {
 
				     session_start();
			
 
				 }
			
 
				 
			
 
				-require_once __DIR__ . '/../../../config/database.php';
			
 
				-require_once __DIR__ . '/../../../lib/auth.php';
			
 
				+require_once __DIR__ . '/../../config/database.php';
			
 
				+require_once __DIR__ . '/../../lib/auth.php';
			
 
				 
			
 
				 if (!isLoggedIn()) {
			
 
				     http_response_code(403);
			
--- a/tools/ingest_knowledge.php
+++ b/tools/ingest_knowledge.php
@@ -44,15 +44,16 @@ $opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help'
 
				 if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']) && !isset($opts['test']))) {
			
 
				     echo <<<HELP
			
 
				 Usage:
			
 
				-  php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
			
 
				-  php tools/ingest_knowledge.php --dir="books/"    --author="Various"
			
 
				+  php tools/ingest_knowledge.php --file="book.pdf"  --author="William A. Albrecht"
			
 
				+  php tools/ingest_knowledge.php --file="book.epub" --author="William A. Albrecht"
			
 
				+  php tools/ingest_knowledge.php --dir="books/"     --author="Various"
			
 
				   php tools/ingest_knowledge.php --list
			
 
				   php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
			
 
				-  php tools/ingest_knowledge.php --test            (verify Ollama connection + embedding)
			
 
				+  php tools/ingest_knowledge.php --test             (verify Ollama connection + embedding)
			
 
				 
			
 
				 Options:
			
 
				-  --file    Path to a single PDF file
			
 
				-  --dir     Path to a directory of PDF files (processed recursively)
			
 
				+  --file    Path to a single PDF or EPUB file
			
 
				+  --dir     Path to a directory of PDF/EPUB files (processed recursively)
			
 
				   --author  Author name to tag all chunks from this run
			
 
				   --list    List all indexed sources with chunk counts
			
 
				   --clear   Remove all chunks from a named source
			
@@ -153,7 +154,7 @@ if (!empty($opts['clear'])) {
 
				     exit(0);
			
 
				 }
			
 
				 
			
 
				-// ── Collect PDF files ────────────────────────────────────────────────────────
			
 
				+// ── Collect PDF + EPUB files ──────────────────────────────────────────────────
			
 
				 $files  = [];
			
 
				 $author = trim($opts['author'] ?? 'Unknown');
			
 
				 
			
@@ -172,23 +173,24 @@ if (!empty($opts['dir'])) {
 
				     }
			
 
				     $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));
			
 
				     foreach ($it as $f) {
			
 
				-        if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') {
			
 
				+        if ($f->isFile() && in_array(strtolower($f->getExtension()), ['pdf', 'epub'], true)) {
			
 
				             $files[] = $f->getPathname();
			
 
				         }
			
 
				     }
			
 
				     if (!$files) {
			
 
				-        die("No PDF files found in: $dir\n");
			
 
				+        die("No PDF or EPUB files found in: $dir\n");
			
 
				     }
			
 
				 }
			
 
				 
			
 
				-echo "Found " . count($files) . " PDF file(s) to ingest.\n\n";
			
 
				+echo "Found " . count($files) . " file(s) to ingest.\n\n";
			
 
				 
			
 
				 // ── Process each file ────────────────────────────────────────────────────────
			
 
				-$parser = new Parser();
			
 
				+$pdfParser = new Parser();
			
 
				 
			
 
				 foreach ($files as $filePath) {
			
 
				     $source = pathinfo($filePath, PATHINFO_FILENAME);
			
 
				-    echo "Processing: $source\n";
			
 
				+    $ext    = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
			
 
				+    echo "Processing: $source ($ext)\n";
			
 
				 
			
 
				     // Check if already indexed
			
 
				     $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
			
@@ -198,44 +200,44 @@ foreach ($files as $filePath) {
 
				         continue;
			
 
				     }
			
 
				 
			
 
				+    // Extract pages as array of ['page' => int, 'text' => string]
			
 
				     try {
			
 
				-        $pdf   = $parser->parseFile($filePath);
			
 
				-        $pages = $pdf->getPages();
			
 
				+        if ($ext === 'epub') {
			
 
				+            $pages = extractEpubPages($filePath);
			
 
				+        } else {
			
 
				+            $pages = extractPdfPages($pdfParser, $filePath);
			
 
				+        }
			
 
				     } catch (Exception $e) {
			
 
				-        echo "  ERROR parsing PDF: " . $e->getMessage() . "\n\n";
			
 
				+        echo "  ERROR extracting text: " . $e->getMessage() . "\n\n";
			
 
				         continue;
			
 
				     }
			
 
				 
			
 
				-    echo "  Pages: " . count($pages) . "\n";
			
 
				-
			
 
				-    $totalChunks  = 0;
			
 
				-    $totalTokens  = 0;
			
 
				-    $pageBuffer   = [];  // accumulate pages into a rolling word buffer
			
 
				+    echo "  Sections/pages: " . count($pages) . "\n";
			
 
				 
			
 
				     $insertStmt = $pdo->prepare(
			
 
				         'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding)
			
 
				          VALUES (?, ?, ?, ?, ?, ?)'
			
 
				     );
			
 
				 
			
 
				+    $totalChunks = 0;
			
 
				     $chunkIndex  = 0;
			
 
				     $wordBuffer  = [];
			
 
				-    $bufferPages = [];  // page numbers corresponding to words in buffer
			
 
				+    $bufferPages = [];
			
 
				 
			
 
				-    foreach ($pages as $pageNum => $page) {
			
 
				-        $pageText = cleanText($page->getText());
			
 
				-        if (strlen($pageText) < 50) continue;  // skip blank/image-only pages
			
 
				+    foreach ($pages as ['page' => $pageNum, 'text' => $pageText]) {
			
 
				+        if (strlen($pageText) < 50) continue;
			
 
				 
			
 
				         $words = explode(' ', $pageText);
			
 
				         foreach ($words as $word) {
			
 
				             $wordBuffer[]  = $word;
			
 
				-            $bufferPages[] = $pageNum + 1;
			
 
				+            $bufferPages[] = $pageNum;
			
 
				         }
			
 
				 
			
 
				         // Flush when buffer reaches chunk size
			
 
				         while (count($wordBuffer) >= CHUNK_WORDS) {
			
 
				-            $chunkWords  = array_slice($wordBuffer, 0, CHUNK_WORDS);
			
 
				-            $chunkText   = implode(' ', $chunkWords);
			
 
				-            $chunkPage   = $bufferPages[0];
			
 
				+            $chunkWords = array_slice($wordBuffer, 0, CHUNK_WORDS);
			
 
				+            $chunkText  = implode(' ', $chunkWords);
			
 
				+            $chunkPage  = $bufferPages[0];
			
 
				 
			
 
				             if (strlen(trim($chunkText)) > 50) {
			
 
				                 $embedding = getEmbedding($chunkText);
			
@@ -243,12 +245,8 @@ foreach ($files as $filePath) {
 
				                     echo "  WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
			
 
				                 } else {
			
 
				                     $insertStmt->execute([
			
 
				-                        $source,
			
 
				-                        $author,
			
 
				-                        $chunkPage,
			
 
				-                        $chunkIndex,
			
 
				-                        $chunkText,
			
 
				-                        json_encode($embedding),
			
 
				+                        $source, $author, $chunkPage, $chunkIndex,
			
 
				+                        $chunkText, json_encode($embedding),
			
 
				                     ]);
			
 
				                     $chunkIndex++;
			
 
				                     $totalChunks++;
			
@@ -309,6 +307,114 @@ function cleanText(string $text): string
 
				     return trim($text);
			
 
				 }
			
 
				 
			
 
				/**
				 * Read a PDF page by page and keep the pages that carry usable text.
				 *
				 * Every page's text is passed through cleanText(); a page whose cleaned text
				 * is shorter than 50 bytes is left out of the result. The reported page
				 * number is the key delivered by getPages() plus one.
				 *
				 * @param Parser $parser   Parser instance used to open the file.
				 * @param string $filePath Path of the PDF to read.
				 *
				 * @return array List of ['page' => int, 'text' => string] entries.
				 */
				function extractPdfPages(Parser $parser, string $filePath): array
				{
				    $pagesWithText = [];
				    $document      = $parser->parseFile($filePath);

				    foreach ($document->getPages() as $index => $pdfPage) {
				        $cleaned = cleanText($pdfPage->getText());

				        // Too little text to be worth indexing.
				        if (strlen($cleaned) < 50) {
				            continue;
				        }

				        $pagesWithText[] = [
				            'page' => $index + 1,
				            'text' => $cleaned,
				        ];
				    }

				    return $pagesWithText;
				}
			
 
				+
			
 
				/**
				 * Extract the chapters/sections of an EPUB as pseudo-pages.
				 *
				 * An EPUB is a ZIP archive of XHTML documents. The reading order is found by:
				 *  1. reading META-INF/container.xml to locate the OPF package file,
				 *  2. reading the OPF manifest (id -> href) and the spine (ordered idrefs),
				 *  3. reducing every spine document to plain text.
				 *
				 * The "page" number of an entry is the 1-based position of its document among
				 * the spine items that could be matched to a manifest entry. Spine documents
				 * that are missing from the archive, or whose cleaned text is shorter than
				 * 50 bytes, are left out.
				 *
				 * @param string $filePath Path of the EPUB file.
				 *
				 * @return array List of ['page' => int, 'text' => string] entries.
				 *
				 * @throws RuntimeException If the archive cannot be opened, container.xml or
				 *                          the OPF file is missing, empty or malformed, or no
				 *                          spine document yields usable text.
				 */
				function extractEpubPages(string $filePath): array
				{
				    $zip = new ZipArchive();
				    if ($zip->open($filePath) !== true) {
				        throw new RuntimeException("Cannot open EPUB file: $filePath");
				    }

				    // A single finally block closes the archive on every exit path.
				    try {
				        // 1. Locate OPF via container.xml
				        $containerXml = $zip->getFromName('META-INF/container.xml');
				        if ($containerXml === false) {
				            throw new RuntimeException("No META-INF/container.xml found — may not be a valid EPUB");
				        }

				        $containerXp = new DOMXPath(epubLoadXml($containerXml, 'META-INF/container.xml'));
				        $containerXp->registerNamespace('c', 'urn:oasis:names:tc:opendocument:xmlns:container');
				        $nodes = $containerXp->query('//c:rootfile/@full-path');
				        if (!$nodes || $nodes->length === 0) {
				            throw new RuntimeException("Cannot find OPF path in container.xml");
				        }
				        $opfPath = $nodes->item(0)->nodeValue;
				        $opfDir  = dirname($opfPath);
				        if ($opfDir === '.') {
				            $opfDir = '';
				        }

				        // 2. Parse OPF for spine order
				        $opfXml = $zip->getFromName($opfPath);
				        if ($opfXml === false) {
				            throw new RuntimeException("Cannot read OPF file: $opfPath");
				        }
				        $opfXp = new DOMXPath(epubLoadXml($opfXml, $opfPath));

				        // Manifest: id -> href. local-name() is used so the lookup works
				        // whatever namespace prefix the package document declares.
				        $manifest = [];
				        foreach ($opfXp->query('//*[local-name()="item"]') as $item) {
				            $manifest[$item->getAttribute('id')] = $item->getAttribute('href');
				        }

				        // Spine: ordered list of idrefs, mapped to their manifest hrefs.
				        $spineHrefs = [];
				        foreach ($opfXp->query('//*[local-name()="itemref"]') as $ref) {
				            $idref = $ref->getAttribute('idref');
				            if (isset($manifest[$idref])) {
				                $spineHrefs[] = $manifest[$idref];
				            }
				        }

				        // 3. Extract text from each spine item
				        $result = [];
				        foreach ($spineHrefs as $i => $href) {
				            $html = $zip->getFromName(epubResolvePath($opfDir, $href));
				            if ($html === false) {
				                continue;
				            }

				            $text = cleanText(epubHtmlToText($html));
				            if (strlen($text) >= 50) {
				                $result[] = ['page' => $i + 1, 'text' => $text];
				            }
				        }

				        if (empty($result)) {
				            throw new RuntimeException("No readable text found in EPUB — may be image-only or DRM-protected");
				        }

				        return $result;
				    } finally {
				        $zip->close();
				    }
				}

				/**
				 * Parse an XML document taken from an EPUB archive.
				 *
				 * Empty input is rejected up front: on PHP 8 DOMDocument::loadXML('') throws
				 * a ValueError, which is not an Exception and would escape callers that only
				 * catch Exception. Parser diagnostics are collected through libxml's internal
				 * error buffer rather than silenced with "@", and a failed parse is reported.
				 *
				 * @param string $xml   Raw XML text.
				 * @param string $label Name of the archive entry, used in error messages.
				 *
				 * @throws RuntimeException If the document is empty or not well-formed.
				 */
				function epubLoadXml(string $xml, string $label): DOMDocument
				{
				    if (trim($xml) === '') {
				        throw new RuntimeException("Empty XML document in EPUB: $label");
				    }

				    $previous = libxml_use_internal_errors(true);
				    try {
				        $dom    = new DOMDocument();
				        $loaded = $dom->loadXML($xml);
				    } finally {
				        // Restore the caller's libxml error mode whatever happened.
				        libxml_clear_errors();
				        libxml_use_internal_errors($previous);
				    }

				    if ($loaded === false) {
				        throw new RuntimeException("Malformed XML in EPUB: $label");
				    }

				    return $dom;
				}

				/**
				 * Turn a manifest href into the name of the matching ZIP entry.
				 *
				 * The fragment is removed before percent-decoding so an encoded "%23" inside
				 * a file name is not mistaken for a fragment separator. rawurldecode() is
				 * used because urldecode() would turn a literal "+" into a space. "." and
				 * ".." segments are collapsed because ZIP entry names are matched literally.
				 *
				 * @param string $baseDir Directory of the OPF file inside the archive ('' for the root).
				 * @param string $href    href attribute of a manifest item.
				 */
				function epubResolvePath(string $baseDir, string $href): string
				{
				    $hashPos = strpos($href, '#');
				    if ($hashPos !== false) {
				        $href = substr($href, 0, $hashPos);
				    }
				    $href = rawurldecode($href);

				    $joined   = $baseDir !== '' ? $baseDir . '/' . $href : $href;
				    $segments = [];
				    foreach (explode('/', $joined) as $segment) {
				        if ($segment === '' || $segment === '.') {
				            continue;
				        }
				        if ($segment === '..') {
				            array_pop($segments);
				            continue;
				        }
				        $segments[] = $segment;
				    }

				    return implode('/', $segments);
				}

				/**
				 * Reduce one XHTML spine document to plain text.
				 *
				 * strip_tags() keeps the contents of <style>, <script> and <title> and puts
				 * nothing between adjacent elements, so those elements are dropped first and
				 * block-level boundaries are replaced by a space before the tags are stripped.
				 *
				 * @param string $html Raw XHTML of a spine document.
				 */
				function epubHtmlToText(string $html): string
				{
				    // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
				    // in that case the input is kept unchanged instead of being lost.
				    $stripped = preg_replace('~<(head|script|style)\b[^>]*>.*?</\1\s*>~is', ' ', $html);
				    if ($stripped !== null) {
				        $html = $stripped;
				    }

				    $spaced = preg_replace(
				        '~</(?:p|div|h[1-6]|li|tr|td|th|dt|dd|blockquote|section|article|pre|table|ul|ol|figcaption)\s*>|<(?:br|hr)\b[^>]*>~i',
				        ' ',
				        $html
				    );
				    if ($spaced !== null) {
				        $html = $spaced;
				    }

				    return html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, 'UTF-8');
				}
			
 
				+
			
 
				 /**
			
 
				  * Call Ollama to embed text. Tries the newer /api/embed endpoint first
			
 
				  * (Ollama >= 0.1.26, uses "input" key, returns "embeddings" array),