benjamin.harris
/
soil-report-ai


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
							<?php
/**
 * tools/ingest_knowledge.php
 *
 * CLI script: ingests soil science PDF and EPUB books into the knowledge_chunks table.
 * Page text is pooled into overlapping word-window chunks, embedded via Ollama, and stored.
 *
 * Usage:
 *   php tools/ingest_knowledge.php --file="path/to/book.pdf" --author="William A. Albrecht"
 *   php tools/ingest_knowledge.php --dir="path/to/books/" --author="Various"
 *   php tools/ingest_knowledge.php --list          (show all indexed sources)
 *   php tools/ingest_knowledge.php --clear="Book Title"   (remove a source)
 *
 * Requirements:
 *   composer require smalot/pdfparser
 *   Ollama running with nomic-embed-text pulled:
 *     ollama pull nomic-embed-text
 *
 * The embedding model (nomic-embed-text) produces 768-dimensional vectors.
 * Each chunk is ~500 words with an 80-word overlap to preserve context across boundaries.
 */

// ── CLI only ─────────────────────────────────────────────────────────────────
// Refuse to continue under any non-CLI SAPI: print the notice and stop.
if ('cli' !== PHP_SAPI) {
    exit("This script must be run from the command line.\n");
}

define('ROOT', dirname(__DIR__));

require ROOT . '/vendor/autoload.php';
require ROOT . '/config/database.php';

use Smalot\PdfParser\Parser;

// ── Config ───────────────────────────────────────────────────────────────────
// Base URL of the Ollama server that produces the embeddings.
const OLLAMA_HOST = 'http://192.168.8.73:11434';
// Name of the embedding model requested from Ollama.
const EMBED_MODEL = 'nomic-embed-text';
// Target number of words in each stored chunk.
const CHUNK_WORDS = 500;
// Number of words shared between one chunk and the next.
const OVERLAP_WORDS = 80;

// ── Parse args ───────────────────────────────────────────────────────────────
$opts = getopt('', ['file:', 'dir:', 'author:', 'list', 'clear:', 'test', 'help']);

// An invocation is actionable when it names at least one mode: ingest
// (--file / --dir), --list, --clear or --test. Anything else — or an explicit
// --help — prints the usage text and stops.
$helpRequested = isset($opts['help']);
$modeSelected  = !empty($opts['file'])
    || !empty($opts['dir'])
    || isset($opts['list'])
    || !empty($opts['clear'])
    || isset($opts['test']);

if ($helpRequested || !$modeSelected) {
    echo <<<HELP
Usage:
  php tools/ingest_knowledge.php --file="book.pdf"  --author="William A. Albrecht"
  php tools/ingest_knowledge.php --file="book.epub" --author="William A. Albrecht"
  php tools/ingest_knowledge.php --dir="books/"     --author="Various"
  php tools/ingest_knowledge.php --list
  php tools/ingest_knowledge.php --clear="Soil Fertility and Animal Health"
  php tools/ingest_knowledge.php --test             (verify Ollama connection + embedding)

Options:
  --file    Path to a single PDF or EPUB file
  --dir     Path to a directory of PDF/EPUB files (processed recursively)
  --author  Author name to tag all chunks from this run
  --list    List all indexed sources with chunk counts
  --clear   Remove all chunks from a named source

HELP;
    exit(0);
}

// ── Test mode ─────────────────────────────────────────────────────────────────
// Handled BEFORE the database connection is opened: --test only talks to
// Ollama, so it must keep working when the database is unavailable.
// Exit status: 0 when both checks pass, 1 when Ollama is unreachable or no
// embedding could be obtained.
if (isset($opts['test'])) {
    echo "Testing Ollama connection at " . OLLAMA_HOST . " ...\n\n";

    // 1. List available models
    $ch = curl_init(OLLAMA_HOST . '/api/tags');
    curl_setopt_array($ch, [CURLOPT_RETURNTRANSFER => true, CURLOPT_TIMEOUT => 5]);
    $resp = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err  = curl_error($ch);
    curl_close($ch);

    if ($err || $code !== 200) {
        echo "FAIL: Cannot reach Ollama — " . ($err ?: "HTTP $code") . "\n";
        exit(1);
    }

    // Tolerate a body that is not the expected {"models": [...]} document.
    $models    = json_decode($resp, true);
    $modelList = is_array($models) && is_array($models['models'] ?? null) ? $models['models'] : [];
    $names     = array_column($modelList, 'name');
    echo "OK: Ollama reachable. Models installed:\n";
    foreach ($names as $name) echo "  - $name\n";

    // Installed names may carry a suffix after the model name, hence the prefix match.
    $embedFound = false;
    foreach ($names as $n) {
        if (str_starts_with($n, EMBED_MODEL)) { $embedFound = true; break; }
    }
    if (!$embedFound) {
        echo "\nWARNING: '" . EMBED_MODEL . "' not found in model list.\n";
        echo "Run on your Ollama server:  ollama pull " . EMBED_MODEL . "\n\n";
    }

    // 2. Test embedding
    echo "\nTesting embedding endpoint ...\n";
    [$embedding, $apiUsed, $rawResp, $httpCode] = getEmbeddingDebug("soil calcium deficiency test sentence");

    echo "HTTP code:   $httpCode\n";
    echo "API used:    $apiUsed\n";
    if ($embedding === null) {
        echo "FAIL: No embedding returned.\n";
        echo "Raw response: $rawResp\n";
        echo "\nPossible fixes:\n";
        echo "  1. Run: ollama pull " . EMBED_MODEL . "\n";
        echo "  2. Check Ollama version: ollama --version (need >= 0.1.20)\n";
        // Use the configured host so the hint stays correct if OLLAMA_HOST changes.
        echo "  3. Verify host is reachable: curl " . OLLAMA_HOST . "/api/tags\n";
        // A failed self-test must be visible to shells and CI through the exit status.
        exit(1);
    }

    echo "OK: Got " . count($embedding) . "-dimensional embedding vector.\n";
    echo "Sample: [" . implode(', ', array_map(fn($v) => round($v, 4), array_slice($embedding, 0, 5))) . " ...]\n";
    exit(0);
}

// Every remaining mode (list, clear, ingest) needs the database.
$pdo = getDBConnection();

// ── List mode ────────────────────────────────────────────────────────────────
// Print one row per (source, author) pair with its chunk count and the most
// recent created_at value, then stop.
if (isset($opts['list'])) {
    $sql = "SELECT source, author, COUNT(*) AS chunks, MAX(created_at) AS indexed_at
         FROM knowledge_chunks GROUP BY source, author ORDER BY source";
    $sources = $pdo->query($sql)->fetchAll(PDO::FETCH_ASSOC);

    if ($sources) {
        printf("%-55s %-25s %6s  %s\n", 'Source', 'Author', 'Chunks', 'Indexed');
        echo str_repeat('-', 100) . "\n";
        foreach ($sources as $row) {
            // Truncate the text columns so long values do not push the table out of alignment.
            $sourceCol = substr($row['source'], 0, 54);
            $authorCol = substr($row['author'], 0, 24);
            printf("%-55s %-25s %6d  %s\n", $sourceCol, $authorCol, $row['chunks'], $row['indexed_at']);
        }
    } else {
        echo "No sources indexed yet.\n";
    }
    exit(0);
}

// ── Clear mode ───────────────────────────────────────────────────────────────
// Delete every chunk whose source equals the given title, reporting how many
// rows matched. Always exits with status 0.
if (!empty($opts['clear'])) {
    $sourceName = $opts['clear'];

    // Count first so the message can say how many chunks were removed.
    $counter = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
    $counter->execute([$sourceName]);
    $existing = (int)$counter->fetchColumn();

    if ($existing > 0) {
        $pdo->prepare('DELETE FROM knowledge_chunks WHERE source = ?')->execute([$sourceName]);
        echo "Deleted $existing chunks for: $sourceName\n";
    } else {
        echo "No chunks found for source: $sourceName\n";
    }
    exit(0);
}

// ── Collect PDF + EPUB files ──────────────────────────────────────────────────
// Builds $files (paths to ingest) and $author (tag applied to every chunk).
// getopt() returns an array when an option is repeated, so --file and --dir
// accept several values; for --author the last value wins.
// Problems are reported on STDERR with exit status 1 so callers can detect them.
$files = [];

$authorOpt = $opts['author'] ?? 'Unknown';
if (is_array($authorOpt)) {
    $authorOpt = end($authorOpt);
}
$author = trim((string)$authorOpt);

$notEmpty = static fn($value): bool => is_string($value) && $value !== '';

foreach (array_filter((array)($opts['file'] ?? []), $notEmpty) as $path) {
    if (!is_file($path)) {
        fwrite(STDERR, "File not found: $path\n");
        exit(1);
    }
    $files[] = $path;
}

$dirs = array_filter((array)($opts['dir'] ?? []), $notEmpty);
foreach ($dirs as $rawDir) {
    $dir = rtrim($rawDir, '/\\');
    if (!is_dir($dir)) {
        fwrite(STDERR, "Directory not found: $dir\n");
        exit(1);
    }

    $found = [];
    $it = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));
    foreach ($it as $f) {
        if ($f->isFile() && in_array(strtolower($f->getExtension()), ['pdf', 'epub'], true)) {
            $found[] = $f->getPathname();
        }
    }

    // Directory iteration order is filesystem-dependent; sort for a reproducible run.
    sort($found, SORT_STRING);
    array_push($files, ...$found);
}

if ($dirs && !$files) {
    fwrite(STDERR, "No PDF or EPUB files found in: " . implode(', ', $dirs) . "\n");
    exit(1);
}

echo "Found " . count($files) . " file(s) to ingest.\n\n";

// ── Process each file ────────────────────────────────────────────────────────
// For every collected file: skip it if its source name is already present,
// extract its text, slide a CHUNK_WORDS-wide window (advancing by
// CHUNK_WORDS - OVERLAP_WORDS) over the pooled words, embed each window and
// insert it. All inserts for one file run inside a single transaction, so an
// aborted run cannot leave a half-indexed source that later runs would skip as
// "already indexed".
$pdfParser = new Parser();

// The INSERT is identical for every chunk of every file, so prepare it once.
$insertStmt = $pdo->prepare(
    'INSERT INTO knowledge_chunks (source, author, page, chunk_index, chunk_text, embedding)
     VALUES (?, ?, ?, ?, ?, ?)'
);

// Number of words the window advances after each full chunk.
$step = CHUNK_WORDS - OVERLAP_WORDS;

foreach ($files as $filePath) {
    $source = pathinfo($filePath, PATHINFO_FILENAME);
    $ext    = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
    echo "Processing: $source ($ext)\n";

    // Check if already indexed
    $chk = $pdo->prepare('SELECT COUNT(*) FROM knowledge_chunks WHERE source = ?');
    $chk->execute([$source]);
    if ((int)$chk->fetchColumn() > 0) {
        echo "  Already indexed — skipping. Use --clear=\"$source\" to re-index.\n\n";
        continue;
    }

    // Extract pages as array of ['page' => int, 'text' => string]
    try {
        if ($ext === 'epub') {
            $pages = extractEpubPages($filePath);
        } else {
            $pages = extractPdfPages($pdfParser, $filePath);
        }
    } catch (Exception $e) {
        echo "  ERROR extracting text: " . $e->getMessage() . "\n\n";
        continue;
    }

    echo "  Sections/pages: " . count($pages) . "\n";

    $totalChunks = 0;
    $chunkIndex  = 0;
    $wordBuffer  = [];   // words waiting to be chunked
    $bufferPages = [];   // page number of each buffered word (parallel to $wordBuffer)
    $carried     = 0;    // leading buffer words that were already part of the previous window

    $pdo->beginTransaction();
    try {
        foreach ($pages as ['page' => $pageNum, 'text' => $pageText]) {
            if (strlen($pageText) < 50) continue;

            foreach (explode(' ', $pageText) as $word) {
                $wordBuffer[]  = $word;
                $bufferPages[] = $pageNum;
            }

            // Flush when buffer reaches chunk size
            while (count($wordBuffer) >= CHUNK_WORDS) {
                $chunkText = implode(' ', array_slice($wordBuffer, 0, CHUNK_WORDS));
                $chunkPage = $bufferPages[0];

                if (strlen(trim($chunkText)) > 50) {
                    $embedding = getEmbedding($chunkText);
                    if ($embedding === null) {
                        echo "  WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
                    } else {
                        $insertStmt->execute([
                            $source, $author, $chunkPage, $chunkIndex,
                            $chunkText, json_encode($embedding),
                        ]);
                        $chunkIndex++;
                        $totalChunks++;

                        // Report only when the counter actually advanced, so a failed
                        // chunk cannot repeat the same progress line.
                        if ($chunkIndex % 20 === 0) {
                            echo "  ...{$chunkIndex} chunks embedded\n";
                        }
                    }
                }

                // Slide window with overlap
                $wordBuffer  = array_slice($wordBuffer, $step);
                $bufferPages = array_slice($bufferPages, $step);
                $carried     = OVERLAP_WORDS;
            }
        }

        // Flush remaining words as final chunk — but only if the buffer holds
        // something beyond the overlap carried over from the previous window;
        // otherwise the tail would be a pure duplicate of text already stored.
        $hasNewWords = count($wordBuffer) > $carried;
        if ($hasNewWords && count($wordBuffer) > 30) {
            $chunkText = implode(' ', $wordBuffer);
            $embedding = getEmbedding($chunkText);
            if ($embedding === null) {
                echo "  WARNING: embedding failed for chunk $chunkIndex — skipping.\n";
            } else {
                $insertStmt->execute([
                    $source, $author, $bufferPages[0] ?? 0, $chunkIndex,
                    $chunkText, json_encode($embedding),
                ]);
                $chunkIndex++;
                $totalChunks++;
            }
        }

        $pdo->commit();
    } catch (Throwable $e) {
        // Undo this file's partial inserts, then let the failure surface unchanged.
        if ($pdo->inTransaction()) {
            $pdo->rollBack();
        }
        throw $e;
    }

    echo "  Done: $totalChunks chunks stored.\n\n";
}

echo "Ingestion complete.\n";
exit(0);

// ── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Normalise extracted book text for chunking and embedding.
 *
 * Steps, in order:
 *  1. Expand typographic ligatures (ﬁ, ﬂ, ...) to their plain-letter spelling.
 *  2. Repair invalid UTF-8, so the Unicode-aware regexes below cannot fail and
 *     the text can later be JSON-encoded.
 *  3. Collapse every run of whitespace into a single space.
 *  4. Remove control characters. Printable characters outside Latin-1
 *     (curly quotes, dashes, Greek letters, ...) are kept.
 *
 * @param string $text Raw text as extracted from a PDF page or EPUB section.
 * @return string Cleaned single-line text, trimmed at both ends.
 */
function cleanText(string $text): string
{
    // Common PDF ligature replacements
    $ligatures = [
        'ﬁ' => 'fi', 'ﬂ' => 'fl', 'ﬀ' => 'ff',
        'ﬃ' => 'ffi', 'ﬄ' => 'ffl', 'ﬅ' => 'st', 'ﬆ' => 'st',
    ];
    $text = strtr($text, $ligatures);

    // An empty pattern with /u matches only when the subject is valid UTF-8.
    if (preg_match('//u', $text) !== 1) {
        $text = function_exists('mb_scrub')
            ? mb_scrub($text, 'UTF-8')
            // Without mbstring, fall back to dropping all non-ASCII bytes:
            // lossy, but it keeps the page instead of discarding it entirely.
            : (preg_replace('/[\x80-\xFF]+/', '', $text) ?? '');
    }

    // Collapse multiple spaces / newlines into single space
    $text = preg_replace('/\s+/u', ' ', $text) ?? $text;

    // Strip control characters (Unicode category Cc) other than tab/LF/CR.
    $text = preg_replace('/[^\P{Cc}\t\n\r]/u', '', $text) ?? $text;

    return trim($text);
}

/**
 * Read a PDF and return the cleaned text of each page worth keeping.
 *
 * Every page's text goes through cleanText(); pages whose cleaned text is
 * shorter than 50 bytes are left out. Page numbers are the parser's page
 * keys plus one.
 *
 * @return array List of ['page' => int, 'text' => string] entries.
 */
function extractPdfPages(Parser $parser, string $filePath): array
{
    $document = $parser->parseFile($filePath);

    $pages = [];
    foreach ($document->getPages() as $index => $pdfPage) {
        $cleaned = cleanText($pdfPage->getText());
        if (strlen($cleaned) < 50) {
            continue;
        }
        $pages[] = ['page' => $index + 1, 'text' => $cleaned];
    }

    return $pages;
}

/**
 * Extract chapters/sections from an EPUB as pages.
 *
 * EPUBs are ZIP archives containing XHTML spine items. We:
 *  1. Parse META-INF/container.xml to find the OPF file
 *  2. Parse the OPF manifest + spine for reading order
 *  3. Reduce each spine XHTML file to plain text and clean it with cleanText()
 *
 * The "page" number of an entry is the 1-based position of its item in the
 * spine. Sections whose cleaned text is shorter than 50 bytes are skipped.
 *
 * @param string $filePath Path to the .epub file.
 * @return array List of ['page' => int, 'text' => string] entries.
 * @throws RuntimeException When the archive cannot be opened, the container
 *                          or OPF is missing/unparseable, or no text is found.
 */
function extractEpubPages(string $filePath): array
{
    $zip = new ZipArchive();
    if ($zip->open($filePath) !== true) {
        throw new RuntimeException("Cannot open EPUB file: $filePath");
    }

    // Parse quietly and never fetch external resources named by the document.
    $xmlFlags = LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING;

    // Tags that end a line or paragraph; text on either side must not be glued together.
    $blockTags = 'address|article|aside|blockquote|br|dd|div|dl|dt|figcaption|figure|footer'
        . '|h[1-6]|header|hr|li|ol|p|pre|section|table|tbody|td|tfoot|th|thead|title|tr|ul';

    try {
        // 1. Locate OPF via container.xml
        // (loadXML() throws a ValueError on an empty string, so reject that up front.)
        $containerXml = $zip->getFromName('META-INF/container.xml');
        if ($containerXml === false || $containerXml === '') {
            throw new RuntimeException("No META-INF/container.xml found — may not be a valid EPUB");
        }

        $dom = new DOMDocument();
        if (!$dom->loadXML($containerXml, $xmlFlags)) {
            throw new RuntimeException("Cannot find OPF path in container.xml");
        }
        $xp = new DOMXPath($dom);
        $xp->registerNamespace('c', 'urn:oasis:names:tc:opendocument:xmlns:container');
        $nodes = $xp->query('//c:rootfile/@full-path');
        if (!$nodes || $nodes->length === 0) {
            throw new RuntimeException("Cannot find OPF path in container.xml");
        }
        $opfPath = $nodes->item(0)->nodeValue;
        $opfDir  = dirname($opfPath);
        if ($opfDir === '.') $opfDir = '';

        // 2. Parse OPF for spine order
        $opfXml = $zip->getFromName($opfPath);
        if ($opfXml === false || $opfXml === '') {
            throw new RuntimeException("Cannot read OPF file: $opfPath");
        }

        $opfDom = new DOMDocument();
        if (!$opfDom->loadXML($opfXml, $xmlFlags)) {
            throw new RuntimeException("Cannot read OPF file: $opfPath");
        }
        $opfXp = new DOMXPath($opfDom);

        // Build manifest: id -> href
        $manifest = [];
        foreach ($opfXp->query('//*[local-name()="item"]') as $item) {
            $manifest[$item->getAttribute('id')] = $item->getAttribute('href');
        }

        // Spine: ordered list of idrefs
        $spineHrefs = [];
        foreach ($opfXp->query('//*[local-name()="itemref"]') as $ref) {
            $idref = $ref->getAttribute('idref');
            if (isset($manifest[$idref])) {
                $spineHrefs[] = $manifest[$idref];
            }
        }

        // 3. Extract text from each spine item
        $result = [];
        foreach ($spineHrefs as $i => $href) {
            // Cut the fragment BEFORE decoding so an encoded "%23" inside a file
            // name survives; rawurldecode() leaves a literal "+" alone.
            if (($pos = strpos($href, '#')) !== false) {
                $href = substr($href, 0, $pos);
            }
            $href = rawurldecode($href);

            $fullPath = $opfDir !== '' ? $opfDir . '/' . $href : $href;
            $html = $zip->getFromName($fullPath);
            if ($html === false) continue;

            // strip_tags() keeps the *contents* of <script>/<style>; drop them whole.
            $html = preg_replace('#<(script|style)\b[^>]*>.*?</\1\s*>#is', ' ', $html) ?? $html;

            // strip_tags() inserts nothing where a tag was, so "</p><p>" would fuse
            // two words. Put a space in front of block-level tags first.
            $html = preg_replace('#<(?=/?(?:' . $blockTags . ')\b)#i', ' <', $html) ?? $html;

            // Strip tags and decode HTML entities
            $text = strip_tags($html);
            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $text = cleanText($text);

            if (strlen($text) >= 50) {
                $result[] = ['page' => $i + 1, 'text' => $text];
            }
        }

        if (empty($result)) {
            throw new RuntimeException("No readable text found in EPUB — may be image-only or DRM-protected");
        }

        return $result;
    } finally {
        // Release the archive on every exit path, including exceptions.
        $zip->close();
    }
}

/**
 * Embed a piece of text through Ollama.
 *
 * Thin convenience wrapper around getEmbeddingDebug(): that function tries
 * POST /api/embed first and POST /api/embeddings second; this one keeps only
 * the vector and discards the diagnostic fields.
 *
 * @return array|null The embedding vector, or null when both endpoints failed.
 */
function getEmbedding(string $text): ?array
{
    $result = getEmbeddingDebug($text);

    return $result[0];
}

/**
 * Embed text via Ollama and report how the attempt went.
 *
 * Two endpoints are tried in order, each with a JSON POST body:
 *   - /api/embed       {"model", "input"}  -> vector read from "embeddings"[0]
 *   - /api/embeddings  {"model", "prompt"} -> vector read from "embedding"
 * The first HTTP 200 response carrying a non-empty vector wins.
 *
 * @return array [embedding|null, apiUsed, rawResponse, httpCode]. On failure
 *               apiUsed is 'both failed' and the response/code come from the
 *               second attempt when it produced any, otherwise from the first.
 */
function getEmbeddingDebug(string $text): array
{
    $endpoints = [
        ['path' => '/api/embed',      'field' => 'input',  'label' => '/api/embed (new)'],
        ['path' => '/api/embeddings', 'field' => 'prompt', 'label' => '/api/embeddings (legacy)'],
    ];

    $bodies = [];
    $codes  = [];

    foreach ($endpoints as $endpoint) {
        $field = $endpoint['field'];

        $handle = curl_init(OLLAMA_HOST . $endpoint['path']);
        curl_setopt_array($handle, [
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => json_encode(['model' => EMBED_MODEL, $field => $text]),
            CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => 30,
            CURLOPT_CONNECTTIMEOUT => 5,
        ]);
        $body   = curl_exec($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        if ($body && $status === 200) {
            $decoded = json_decode($body, true);
            // The two endpoints name and nest the vector differently.
            $vector = $field === 'input'
                ? ($decoded['embeddings'][0] ?? null)
                : ($decoded['embedding'] ?? null);
            if (is_array($vector) && count($vector) > 0) {
                return [$vector, $endpoint['label'], $body, $status];
            }
        }

        // Remember this attempt for the diagnostic return below.
        $bodies[] = $body;
        $codes[]  = $status;
    }

    // Return last response for diagnostics
    return [null, 'both failed', $bodies[1] ?: $bodies[0] ?: '', $codes[1] ?: $codes[0]];
}