Benjamin Harris vor 2 Monaten
Ursprung
Commit
ae3970d7ca

BIN
books/010146.albrecht.animal.health.pdf


BIN
books/Albrecht on Calcium.pdf


BIN
books/albrecht-soil-fertility-and-animal-health_vol-II.pdf


+ 0 - 0
books/science-in-agriculture-advanced-methods-for-sustainable-andersen-arden-b-2nd-ed-austin-tx-2000-acres-u-s-a-inc-9780911311358-14b7831ccc09e53fbbe693bf7f94e23d-annas-archive_compress.pdf → books/science-in-agriculture-advanced-methods-for-sustainable-farming-arden-andersen.pdf


+ 0 - 0
books/complete-guide-to-the-sustainable-and-profitable-biological-system-of-farming-acres-gary-f-zimmer_compress.pdf → books/the-biological-farmer-gary-f-zimmer.pdf


+ 2 - 2
dashboard/crop-analysis/updatecomment.php

@@ -13,8 +13,8 @@ if (session_status() === PHP_SESSION_NONE) {
     session_start();
 }
 
-require_once __DIR__ . '/../../../config/database.php';
-require_once __DIR__ . '/../../../lib/auth.php';
+require_once __DIR__ . '/../../config/database.php';
+require_once __DIR__ . '/../../lib/auth.php';
 
 if (!isLoggedIn()) {
     http_response_code(403);

+ 139 - 33
tools/ingest_knowledge.php

@@ -44,15 +44,16 @@ $opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help'
 if (isset($opts['help']) || (empty($opts['file']) && empty($opts['dir']) && !isset($opts['list']) && empty($opts['clear']) && !isset($opts['test']))) {
     echo <<<HELP
 Usage:
-  php tools/ingest_knowledge.php --file="book.pdf" --author="William A. Albrecht"
-  php tools/ingest_knowledge.php --dir="books/"    --author="Various"
+  php tools/ingest_knowledge.php --file="book.pdf"  --author="William A. Albrecht"
+  php tools/ingest_knowledge.php --file="book.epub" --author="William A. Albrecht"
+  php tools/ingest_knowledge.php --dir="books/"     --author="Various"
   php tools/ingest_knowledge.php --list
   php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
-  php tools/ingest_knowledge.php --test            (verify Ollama connection + embedding)
+  php tools/ingest_knowledge.php --test             (verify Ollama connection + embedding)
 
 Options:
-  --file    Path to a single PDF file
-  --dir     Path to a directory of PDF files (processed recursively)
+  --file    Path to a single PDF or EPUB file
+  --dir     Path to a directory of PDF/EPUB files (processed recursively)
   --author  Author name to tag all chunks from this run
   --list    List all indexed sources with chunk counts
   --clear   Remove all chunks from a named source
@@ -153,7 +154,7 @@ if (!empty($opts['clear'])) {
     exit(0);
 }
 
-// ── Collect PDF files ────────────────────────────────────────────────────────
+// ── Collect PDF + EPUB files ──────────────────────────────────────────────────
 $files  = [];
 $author = trim($opts['author'] ?? 'Unknown');
 
@@ -172,23 +173,24 @@ if (!empty($opts['dir'])) {
     }
     $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));
     foreach ($it as $f) {
-        if ($f->isFile() && strtolower($f->getExtension()) === 'pdf') {
+        if ($f->isFile() && in_array(strtolower($f->getExtension()), ['pdf', 'epub'], true)) {
             $files[] = $f->getPathname();
         }
     }
     if (!$files) {
-        die("No PDF files found in: $dir\n");
+        die("No PDF or EPUB files found in: $dir\n");
     }
 }
 
-echo "Found " . count($files) . " PDF file(s) to ingest.\n\n";
+echo "Found " . count($files) . " file(s) to ingest.\n\n";
 
 // ── Process each file ────────────────────────────────────────────────────────
-$parser = new Parser();
+$pdfParser = new Parser();
 
 foreach ($files as $filePath) {
     $source = pathinfo($filePath, PATHINFO_FILENAME);
-    echo "Processing: $source\n";
+    $ext    = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
+    echo "Processing: $source ($ext)\n";
 
     // Check if already indexed
     $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
@@ -198,44 +200,44 @@ foreach ($files as $filePath) {
         continue;
     }
 
+    // Extract pages as array of ['page' => int, 'text' => string]
     try {
-        $pdf   = $parser->parseFile($filePath);
-        $pages = $pdf->getPages();
+        if ($ext === 'epub') {
+            $pages = extractEpubPages($filePath);
+        } else {
+            $pages = extractPdfPages($pdfParser, $filePath);
+        }
     } catch (Exception $e) {
-        echo "  ERROR parsing PDF: " . $e->getMessage() . "\n\n";
+        echo "  ERROR extracting text: " . $e->getMessage() . "\n\n";
         continue;
     }
 
-    echo "  Pages: " . count($pages) . "\n";
-
-    $totalChunks  = 0;
-    $totalTokens  = 0;
-    $pageBuffer   = [];  // accumulate pages into a rolling word buffer
+    echo "  Sections/pages: " . count($pages) . "\n";
 
     $insertStmt = $pdo->prepare(
         'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding)
          VALUES (?, ?, ?, ?, ?, ?)'
     );
 
+    $totalChunks = 0;
     $chunkIndex  = 0;
     $wordBuffer  = [];
-    $bufferPages = [];  // page numbers corresponding to words in buffer
+    $bufferPages = [];
 
-    foreach ($pages as $pageNum => $page) {
-        $pageText = cleanText($page->getText());
-        if (strlen($pageText) < 50) continue;  // skip blank/image-only pages
+    foreach ($pages as ['page' => $pageNum, 'text' => $pageText]) {
+        if (strlen($pageText) < 50) continue;
 
         $words = explode(' ', $pageText);
         foreach ($words as $word) {
             $wordBuffer[]  = $word;
-            $bufferPages[] = $pageNum + 1;
+            $bufferPages[] = $pageNum;
         }
 
         // Flush when buffer reaches chunk size
         while (count($wordBuffer) >= CHUNK_WORDS) {
-            $chunkWords  = array_slice($wordBuffer, 0, CHUNK_WORDS);
-            $chunkText   = implode(' ', $chunkWords);
-            $chunkPage   = $bufferPages[0];
+            $chunkWords = array_slice($wordBuffer, 0, CHUNK_WORDS);
+            $chunkText  = implode(' ', $chunkWords);
+            $chunkPage  = $bufferPages[0];
 
             if (strlen(trim($chunkText)) > 50) {
                 $embedding = getEmbedding($chunkText);
@@ -243,12 +245,8 @@ foreach ($files as $filePath) {
                     echo "  WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
                 } else {
                     $insertStmt->execute([
-                        $source,
-                        $author,
-                        $chunkPage,
-                        $chunkIndex,
-                        $chunkText,
-                        json_encode($embedding),
+                        $source, $author, $chunkPage, $chunkIndex,
+                        $chunkText, json_encode($embedding),
                     ]);
                     $chunkIndex++;
                     $totalChunks++;
@@ -309,6 +307,114 @@ function cleanText(string $text): string
     return trim($text);
 }
 
+/**
+ * Read a PDF through the supplied parser and collect its text page by page.
+ *
+ * Every page's text is passed through cleanText(); pages whose cleaned text
+ * is shorter than 50 bytes are left out. The reported page number is the
+ * parser's page key plus one.
+ *
+ * @return array List of ['page' => int, 'text' => string] entries.
+ */
+function extractPdfPages(Parser $parser, string $filePath): array
+{
+    $collected = [];
+    $document  = $parser->parseFile($filePath);
+
+    foreach ($document->getPages() as $index => $pdfPage) {
+        $cleaned = cleanText($pdfPage->getText());
+        if (strlen($cleaned) < 50) {
+            continue;
+        }
+        $collected[] = ['page' => $index + 1, 'text' => $cleaned];
+    }
+
+    return $collected;
+}
+
+/**
+ * Extract chapters/sections from an EPUB as pages.
+ * Returns array of ['page' => int, 'text' => string].
+ *
+ * EPUBs are ZIP archives containing XHTML spine items. We:
+ *  1. Parse META-INF/container.xml to find the OPF file
+ *  2. Parse the OPF manifest + spine for reading order
+ *  3. Strip HTML tags from each spine XHTML file
+ *
+ * The "page" number is the 1-based position of the item in the spine.
+ * Spine items whose cleaned text is shorter than 50 bytes are omitted.
+ *
+ * @param string $filePath Path to the .epub file.
+ * @return array List of ['page' => int, 'text' => string] entries.
+ * @throws RuntimeException When the archive cannot be opened, the
+ *                          container/OPF is missing or unparsable, or no
+ *                          readable text is found.
+ */
+function extractEpubPages(string $filePath): array
+{
+    $zip = new ZipArchive();
+    if ($zip->open($filePath) !== true) {
+        throw new RuntimeException("Cannot open EPUB file: $filePath");
+    }
+
+    $result = [];
+
+    // try/finally guarantees the archive is closed on every exit path.
+    try {
+        // 1. Locate OPF via container.xml
+        // An empty string must be rejected here: DOMDocument::loadXML('')
+        // throws ValueError on PHP 8, which is not an Exception.
+        $containerXml = $zip->getFromName('META-INF/container.xml');
+        if ($containerXml === false || trim($containerXml) === '') {
+            throw new RuntimeException("No META-INF/container.xml found — may not be a valid EPUB");
+        }
+
+        $dom = new DOMDocument();
+        if (!@$dom->loadXML($containerXml)) {
+            throw new RuntimeException("Cannot find OPF path in container.xml");
+        }
+        $xp = new DOMXPath($dom);
+        $xp->registerNamespace('c', 'urn:oasis:names:tc:opendocument:xmlns:container');
+        $nodes = $xp->query('//c:rootfile/@full-path');
+        if (!$nodes || $nodes->length === 0) {
+            throw new RuntimeException("Cannot find OPF path in container.xml");
+        }
+        $opfPath = $nodes->item(0)->nodeValue;
+        $opfDir  = dirname($opfPath);
+        if ($opfDir === '.') $opfDir = '';
+
+        // 2. Parse OPF for spine order
+        $opfXml = $zip->getFromName($opfPath);
+        if ($opfXml === false || trim($opfXml) === '') {
+            throw new RuntimeException("Cannot read OPF file: $opfPath");
+        }
+
+        $opfDom = new DOMDocument();
+        if (!@$opfDom->loadXML($opfXml)) {
+            throw new RuntimeException("Cannot read OPF file: $opfPath");
+        }
+        $opfXp = new DOMXPath($opfDom);
+
+        // Build manifest: id -> href
+        $manifest = [];
+        foreach ($opfXp->query('//*[local-name()="item"]') as $item) {
+            $manifest[$item->getAttribute('id')] = $item->getAttribute('href');
+        }
+
+        // Spine: ordered list of idrefs
+        $spineHrefs = [];
+        foreach ($opfXp->query('//*[local-name()="itemref"]') as $ref) {
+            $idref = $ref->getAttribute('idref');
+            if (isset($manifest[$idref])) {
+                $spineHrefs[] = $manifest[$idref];
+            }
+        }
+
+        // 3. Extract text from each spine item
+        foreach ($spineHrefs as $i => $href) {
+            // Drop the fragment BEFORE decoding so an encoded '#' (%23) in a
+            // file name is not mistaken for a fragment separator.
+            if (($pos = strpos($href, '#')) !== false) {
+                $href = substr($href, 0, $pos);
+            }
+            // rawurldecode(), not urldecode(): '+' is a literal character in
+            // a URL path and must not be turned into a space.
+            $href = rawurldecode($href);
+
+            // Resolve the href against the OPF directory and collapse
+            // '.' / '..' segments; ZIP entry names are stored normalised.
+            $segments = [];
+            $joined   = $opfDir !== '' ? $opfDir . '/' . $href : $href;
+            foreach (explode('/', $joined) as $segment) {
+                if ($segment === '' || $segment === '.') {
+                    continue;
+                }
+                if ($segment === '..') {
+                    array_pop($segments);
+                    continue;
+                }
+                $segments[] = $segment;
+            }
+            $fullPath = implode('/', $segments);
+
+            $html = $zip->getFromName($fullPath);
+            if ($html === false) continue;
+
+            // strip_tags() keeps the *contents* of <script>/<style>, so
+            // remove those elements entirely first.
+            $html = preg_replace('~<(script|style)\b[^>]*>.*?</\1\s*>~is', ' ', $html) ?? $html;
+
+            // strip_tags() inserts no separator, so "<p>foo</p><p>bar</p>"
+            // would become "foobar". Put a space in front of block-level
+            // tags only, leaving inline markup such as H<sub>2</sub>O intact.
+            // NOTE(review): assumes cleanText() collapses runs of whitespace — confirm.
+            $html = preg_replace(
+                '~<(?:/?(?:p|div|h[1-6]|li|ul|ol|tr|td|th|table|blockquote|section|article|dt|dd|pre|figcaption)|br|hr)\b~i',
+                ' $0',
+                $html
+            ) ?? $html;
+
+            // Strip tags and decode HTML entities
+            $text = strip_tags($html);
+            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+            $text = cleanText($text);
+
+            if (strlen($text) >= 50) {
+                $result[] = ['page' => $i + 1, 'text' => $text];
+            }
+        }
+    } finally {
+        $zip->close();
+    }
+
+    if (empty($result)) {
+        throw new RuntimeException("No readable text found in EPUB — may be image-only or DRM-protected");
+    }
+
+    return $result;
+}
+
 /**
  * Call Ollama to embed text. Tries the newer /api/embed endpoint first
  * (Ollama >= 0.1.26, uses "input" key, returns "embeddings" array),