ollamaGenerate.php 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. <?php
  2. /**
  3. * controllers/ollamaGenerate.php
  4. *
  5. * AJAX POST handler: generates AI agronomic text using Ollama, grounded
  6. * with relevant passages retrieved from the soil science knowledge base
  7. * (William A. Albrecht et al.) via RAG (Retrieval-Augmented Generation).
  8. *
  9. * Flow:
  10. * 1. Load full soil record + specification ranges
  11. * 2. Build a structured data summary covering ALL measured elements
  12. * 3. Embed that summary via nomic-embed-text → retrieve top-K book passages
  13. * 4. Inject retrieved passages + data into a section-specific prompt
  14. * 5. Send to llama3.1 and return the generated text
  15. *
  16. * POST params:
  17. * csrf_token string
  18. * rid int soil_records.id
  19. * rand string soil_records.rand
  20. * section string overview | ai_interpretation | foliar | microbial
  21. */
  // ── Bootstrap ────────────────────────────────────────────────────────────────
  if (session_status() === PHP_SESSION_NONE) {
      session_start();
  }

  require_once __DIR__ . '/../config/database.php';
  require_once __DIR__ . '/../lib/auth.php';
  require_once __DIR__ . '/../lib/csrf.php';

  header('Content-Type: application/json');

  // ── Config ───────────────────────────────────────────────────────────────────
  define('OLLAMA_HOST', 'http://192.168.8.73:11434');
  define('OLLAMA_MODEL', 'llama3.1:8b-instruct-q4_K_M');
  define('EMBED_MODEL', 'nomic-embed-text');
  define('RAG_TOP_K', 6); // number of knowledge chunks to inject per request
  define('OLLAMA_TIMEOUT', 180); // seconds

  // Emit a JSON error body with the given HTTP status and stop the request.
  $rejectRequest = static function (int $status, string $message): void {
      http_response_code($status);
      echo json_encode(['success' => false, 'error' => $message]);
      exit;
  };

  // Read a POST field as a trimmed string. Array-valued fields (e.g. "rid[]=1")
  // are treated as missing: passing an array to trim() raises a TypeError on
  // PHP 8, which would otherwise surface as a non-JSON 500 response.
  $postString = static function (string $field): string {
      $value = $_POST[$field] ?? '';
      return is_string($value) ? trim($value) : '';
  };

  // ── Auth + CSRF ───────────────────────────────────────────────────────────────
  if (!isLoggedIn()) {
      $rejectRequest(401, 'Not authenticated');
  }

  if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
      $rejectRequest(405, 'Method not allowed');
  }

  // The token is passed through untrimmed, exactly as submitted.
  $csrfToken = $_POST['csrf_token'] ?? '';
  if (!is_string($csrfToken) || !verifyCsrfToken($csrfToken)) {
      $rejectRequest(403, 'Invalid CSRF token');
  }

  // Nothing below writes to the session. Release the session lock now so the
  // long-running Ollama call (up to OLLAMA_TIMEOUT seconds) does not block this
  // user's other requests. $_SESSION stays readable after this call.
  session_write_close();

  // ── Request parameters ───────────────────────────────────────────────────────
  $recordId = (int)$postString('rid');
  $randId = $postString('rand');
  $section = $postString('section');
  $validSections = ['overview', 'ai_interpretation', 'foliar', 'microbial'];

  if (!$recordId || $randId === '' || !in_array($section, $validSections, true)) {
      $rejectRequest(400, 'Invalid parameters');
  }

  // ── Load soil record + spec ───────────────────────────────────────────────────
  // The record is looked up by id AND rand together; the specification row is
  // optional and only fetched when the record names a soil type.
  try {
      $pdo = getDBConnection();

      $stmt = $pdo->prepare('SELECT * FROM soil_records WHERE id = ? AND rand = ?');
      $stmt->execute([$recordId, $randId]);
      $row = $stmt->fetch(PDO::FETCH_ASSOC);
      if (!$row) {
          $rejectRequest(404, 'Record not found');
      }

      $spec = [];
      if (!empty($row['soil_type'])) {
          $stmtSpec = $pdo->prepare('SELECT * FROM soil_specifications WHERE soil_type = ? LIMIT 1');
          $stmtSpec->execute([$row['soil_type']]);
          $spec = $stmtSpec->fetch(PDO::FETCH_ASSOC) ?: [];
      }
  } catch (PDOException $e) {
      error_log('DB error in ollamaGenerate.php: ' . $e->getMessage());
      $rejectRequest(500, 'Database error');
  }
  // ── Helper: safe float format ────────────────────────────────────────────────
  /**
   * Render a value for display in the soil data block.
   *
   * - null or the empty string  → 'N/A'
   * - anything is_numeric()     → number_format() with $dp decimals
   * - everything else           → the value cast to string, unchanged
   *
   * @param mixed $v  Raw value (typically a DB column value).
   * @param int   $dp Decimal places used for numeric values.
   */
  function fv(mixed $v, int $dp = 2): string
  {
      $isBlank = ($v === null) || ($v === '');
      if ($isBlank) {
          return 'N/A';
      }

      if (!is_numeric($v)) {
          return (string)$v;
      }

      return number_format((float)$v, $dp);
  }
  // ── Helper: status vs spec range ─────────────────────────────────────────────
  /**
   * Classify a measured value against an optional [min, max] target range.
   *
   * Bounds that are not numeric are ignored. The lower bound is checked first.
   *
   * @param mixed $value Measured value; non-numeric input yields ''.
   * @param mixed $min   Lower bound, or a non-numeric value for "no lower bound".
   * @param mixed $max   Upper bound, or a non-numeric value for "no upper bound".
   * @return string '[DEFICIENT]', '[EXCESS]', '[IDEAL]', or '' when the value is
   *                not numeric or neither bound is numeric.
   */
  function rangeStatus(mixed $value, mixed $min, mixed $max): string
  {
      if (!is_numeric($value)) {
          return '';
      }

      $hasFloor = is_numeric($min);
      $hasCeiling = is_numeric($max);
      $measured = (float)$value;

      if ($hasFloor && $measured < (float)$min) {
          return '[DEFICIENT]';
      }

      if ($hasCeiling && $measured > (float)$max) {
          return '[EXCESS]';
      }

      return ($hasFloor || $hasCeiling) ? '[IDEAL]' : '';
  }
  // ── Helper: resolve spec value from spec row then record row ─────────────────
  /**
   * Look up a specification value, preferring the spec row over the record row.
   *
   * A source is skipped when the column is absent, null, or the empty string.
   *
   * @param array  $spec Specification row (may be empty).
   * @param array  $row  Soil record row, used as the fallback source.
   * @param string $col  Column name to resolve.
   * @return mixed The first usable value, or null when neither source has one.
   */
  function sv(array $spec, array $row, string $col): mixed
  {
      foreach ([$spec, $row] as $source) {
          $candidate = $source[$col] ?? null;
          if ($candidate !== null && $candidate !== '') {
              return $candidate;
          }
      }

      return null;
  }
  // ── Build comprehensive soil data block ───────────────────────────────────────
  // Covers every measured element and tags values against their spec targets.
  //
  // PHP only interpolates *variables* inside strings and heredocs, never
  // function calls, so the report is assembled line by line rather than through
  // a heredoc containing {fv(...)} / {rangeStatus(...)} placeholders.

  // Raw record value, or null when the column is missing from the row.
  $raw = static fn (string $column): mixed => $row[$column] ?? null;
  // Free-text record value for direct display (missing/NULL renders as '').
  $str = static fn (string $column): string => (string)($row[$column] ?? '');
  // Record value formatted through fv().
  $num = static fn (string $column, int $dp = 2): string => fv($row[$column] ?? null, $dp);
  // Spec target (spec row first, then record row, via sv()) formatted through fv().
  $target = static fn (string $column, int $dp = 1): string => fv(sv($spec, $row, $column), $dp);

  // "<label>: <value> [target: <text>] <status>" for hard-coded target ranges.
  $fixedRangeLine = static function (
      string $label,
      string $column,
      int $dp,
      string $targetText,
      float $lo,
      float $hi
  ) use ($row): string {
      $value = $row[$column] ?? null;
      return rtrim(sprintf(
          '%s: %s [target: %s] %s',
          $label,
          fv($value, $dp),
          $targetText,
          rangeStatus($value, $lo, $hi)
      ));
  };

  // "<label>: <value><unit> [min: <lo>, max: <hi>] <status>" where the bounds
  // come from the "<specKey>_min" / "<specKey>_max" specification columns.
  $specRangeLine = static function (
      string $label,
      string $column,
      string $specKey,
      int $valueDp,
      int $specDp,
      string $unit = ''
  ) use ($row, $spec): string {
      $value = $row[$column] ?? null;
      $lo = sv($spec, $row, $specKey . '_min');
      $hi = sv($spec, $row, $specKey . '_max');
      return rtrim(sprintf(
          '%s: %s%s [min: %s, max: %s] %s',
          $label,
          fv($value, $valueDp),
          $unit,
          fv($lo, $specDp),
          fv($hi, $specDp),
          rangeStatus($value, $lo, $hi)
      ));
  };

  // Ca:Mg ratio is derived from the Mehlick-3 figures; null when either value
  // is missing/non-numeric or magnesium is zero.
  $caExtract = $raw('ca_mehlick3');
  $mgExtract = $raw('mg_mehlick3');
  $caMgRatio = (is_numeric($caExtract) && is_numeric($mgExtract) && (float)$mgExtract !== 0.0)
      ? round((float)$caExtract / (float)$mgExtract, 1)
      : null;

  $lines = [
      '=====================================',
      'SOIL TEST DATA — COMPLETE ANALYSIS',
      '=====================================',
      'Client: ' . $str('client_name'),
      'Location: ' . $str('site_address') . ', ' . $str('state_postcode'),
      // NOTE(review): the "Crop" label is fed from sample_id — confirm this is intended.
      'Crop: ' . $str('sample_id'),
      'Crop Type: ' . $str('crop_type'),
      'Soil Type: ' . $str('soil_type'),
      'Lab No: ' . $str('lab_no'),
      'Batch: ' . $str('batch_no'),
      'Date Sampled: ' . $str('date_sampled'),
      '--- SOIL PHYSICAL / REACTION ---',
      $fixedRangeLine('pH (H2O)', 'ph_h2o', 1, '6.2–6.8', 6.2, 6.8),
      'pH (CaCl2): ' . $num('ph_cacl2', 1),
      'EC (mS/cm): ' . $num('ec', 2),
      'Colour: ' . $str('colour'),
      'Texture: ' . $str('texture'),
      'Gravel (%): ' . $num('gravel', 1),
      '--- ORGANIC MATTER ---',
      'Organic Carbon (%): ' . $num('ocarbon', 1),
      'Organic Matter (%): ' . $num('omatter', 1),
      '--- CATION EXCHANGE ---',
      'CEC (meq/100g): ' . $num('cec', 2),
      'TEC (meq/100g): ' . $num('tec', 2),
      'Paramagnetic: ' . $num('paramag', 0),
      '--- NITROGEN ---',
      $fixedRangeLine('Nitrate-N (NO3-N ppm)', 'NO3_N', 0, '10–20 ppm', 10.0, 20.0),
      'Ammonium-N (NH3-N ppm): ' . $num('NH3_N', 0),
      'Total N (est. from C:N): C:N ratio ' . $num('c_n_ratio', 1),
      '--- PHOSPHORUS ---',
      'P Colwell (ppm): ' . $num('p_colwell', 0),
      'P Morgan (ppm): ' . $num('p_morgan', 0),
      'P Mehlick (ppm): ' . $num('p_mehlick', 0),
      'P Bray2 (ppm): ' . $num('p_bray2', 0),
      '--- MAJOR CATIONS (ppm) ---',
      $specRangeLine('Calcium Ca (ppm)', 'BS_ca_ppm', 'ca_ppm', 0, 0),
      $specRangeLine('Magnesium Mg (ppm)', 'BS_mg_ppm', 'mg_ppm', 0, 0),
      $specRangeLine('Potassium K (ppm)', 'BS_k_ppm', 'k_ppm', 0, 0),
      $specRangeLine('Sodium Na (ppm)', 'BS_na_ppm', 'na_ppm', 0, 0),
      '--- BASE SATURATIONS (%) ---',
      $specRangeLine('Calcium Ca (%)', 'BS_ca2', 'cabs', 2, 1, '%'),
      $specRangeLine('Magnesium Mg (%)', 'BS_mg2', 'mgbs', 2, 1, '%'),
      $specRangeLine('Potassium K (%)', 'BS_k', 'kbs', 2, 1, '%'),
      $specRangeLine('Sodium Na (%)', 'BS_na', 'nabs', 2, 1, '%'),
      'Other Bases (%): ' . $num('BS_ob', 2) . '% [recommended: ' . $target('ob_rec', 1) . ']',
      'Hydrogen (%): ' . $num('BS_h', 2) . '% [recommended: ' . $target('h_rec', 1) . ']',
      'Aluminium Al3 (%): ' . $num('BS_al3', 2) . '%',
      '--- MORGANS EXTRACT (ppm) ---',
      'Ca Morgan: ' . $num('ca_morgan', 2),
      'Mg Morgan: ' . $num('mg_morgan', 2),
      'K Morgan: ' . $num('k_morgan', 2),
      'Na Morgan: ' . $num('na_morgan', 2),
      '--- MEHLICK-3 EXTRACT (ppm) ---',
      'Ca Mehlick3: ' . $num('ca_mehlick3', 2),
      'Mg Mehlick3: ' . $num('mg_mehlick3', 2),
      'K Mehlick3: ' . $num('k_mehlick3', 2),
      'Na Mehlick3: ' . $num('na_mehlick3', 2),
      'Al Mehlick3: ' . $num('al_mehlick3', 2),
      '--- TRACE ELEMENTS (ppm) ---',
      'Sulfur S (ppm): ' . $num('s_morgan', 2),
      'Boron B (ppm): ' . $num('b_cacl2', 2),
      'Manganese Mn (ppm): ' . $num('mn_dtpa', 2),
      'Copper Cu (ppm): ' . $num('cu_dtpa', 2),
      'Zinc Zn (ppm): ' . $num('zn_dtpa', 2),
      'Iron Fe (ppm): ' . $num('fe_dtpa', 2),
      'Iron Fe (total): ' . $num('fe', 2),
      'Aluminium Al (ppm): ' . $num('al', 2),
      'Silicon Si (ppm): ' . $num('sl_cacl2', 2),
      'Cobalt Co (ppm): ' . $num('co_dtpa', 2),
      // Element symbol corrected from "M" to "Mo"; the column name is unchanged.
      'Molybdenum Mo (ppm): ' . $num('m_dtpa', 2),
      'Selenium Se (ppm): ' . $num('se', 2),
      '--- RATIOS ---',
      'Ca:Mg ratio: ' . fv($caMgRatio, 1) . ' [recommended: ' . $target('ca_mg_ratio', 1) . ']',
      'C:N ratio: ' . $num('c_n_ratio', 1),
      '--- DEFICIENT ELEMENTS SUMMARY ---',
  ];
  $soilData = implode("\n", $lines);

  // Append a quick plain-English deficiency list to help the LLM focus.
  // Classification goes through rangeStatus() so the summary always agrees
  // with the [DEFICIENT]/[EXCESS] tags printed in the data lines above.
  $rangeChecks = [
      ['pH (H2O)', 'ph_h2o', 6.2, 6.8],
      ['Nitrate-N', 'NO3_N', 10, 20],
  ];
  $specChecks = [
      'Calcium (ppm)' => ['BS_ca_ppm', 'ca_ppm'],
      'Magnesium (ppm)' => ['BS_mg_ppm', 'mg_ppm'],
      'Potassium (ppm)' => ['BS_k_ppm', 'k_ppm'],
      'Sodium (ppm)' => ['BS_na_ppm', 'na_ppm'],
      'Ca sat (%)' => ['BS_ca2', 'cabs'],
      'Mg sat (%)' => ['BS_mg2', 'mgbs'],
      'K sat (%)' => ['BS_k', 'kbs'],
      'Na sat (%)' => ['BS_na', 'nabs'],
  ];
  foreach ($specChecks as $label => [$column, $specKey]) {
      $rangeChecks[] = [
          $label,
          $column,
          sv($spec, $row, $specKey . '_min'),
          sv($spec, $row, $specKey . '_max'),
      ];
  }

  $deficiencies = [];
  $excesses = [];
  foreach ($rangeChecks as [$label, $column, $lo, $hi]) {
      $status = rangeStatus($raw($column), $lo, $hi);
      if ($status === '[DEFICIENT]') {
          $deficiencies[] = $label;
      } elseif ($status === '[EXCESS]') {
          $excesses[] = $label;
      }
  }

  $soilData .= "\nDeficient: " . (empty($deficiencies) ? 'None detected' : implode(', ', $deficiencies));
  $soilData .= "\nIn Excess: " . (empty($excesses) ? 'None detected' : implode(', ', $excesses));
  $soilData .= "\n=====================================\n";

  // ── RAG: embed the soil data query, retrieve relevant book passages ───────────
  $knowledgeContext = '';
  $ragChunks = retrieveRelevantChunks($pdo, $soilData, $section, RAG_TOP_K);
  if (!empty($ragChunks)) {
      $knowledgeContext = "\n\n===================================================\n"
          . "RELEVANT PASSAGES FROM SOIL SCIENCE LITERATURE\n"
          . "(William A. Albrecht and other authorities)\n"
          . "===================================================\n";
      foreach (array_values($ragChunks) as $i => $chunk) {
          $knowledgeContext .= sprintf(
              "\n[%d] \"%s\" — %s (p.%d)\n%s\n",
              $i + 1,
              $chunk['source'] ?? '',
              $chunk['author'] ?? '',
              $chunk['page'] ?? 0,
              $chunk['chunk_text'] ?? ''
          );
      }
  }

  // ── Section-specific prompt ──────────────────────────────────────────────────
  $systemInstruction = "You are a certified agronomist specialising in soil fertility, "
      . "trained in the Albrecht method of soil balancing. "
      . "You have deep knowledge of soil chemistry, plant nutrition, and the relationship "
      . "between soil mineral balance and crop/livestock health. "
      . "Always ground your recommendations in the measured data. "
      . "For Australian conditions, reference typical soil types and climate where relevant. "
      . "Write in a professional but accessible tone suitable for a farmer-facing report. "
      . "When the knowledge passages conflict with your training, prefer the passages — they "
      . "are from authoritative soil science texts.";

  // Only the task text differs per section; the instruction and context are shared.
  $tasks = [
      'overview' =>
          "\n\nTASK: Write an executive overview of these soil test results (3–4 paragraphs). "
          . "Cover: (1) overall soil health and fertility level, "
          . "(2) the most significant deficiencies or imbalances and their likely effect on crop performance, "
          . "(3) any positive attributes of this soil. "
          . "Use the Albrecht philosophy as a framework where applicable. "
          . "Do not list specific product names in this section.",
      'ai_interpretation' =>
          "\n\nTASK: Write a detailed technical interpretation of ALL elements in this soil test. "
          . "Structure your response with these sections:\n"
          . "1. SOIL REACTION (pH, EC, Paramagnetic)\n"
          . "2. ORGANIC MATTER & BIOLOGY (C, N, C:N ratio)\n"
          . "3. CATION EXCHANGE CAPACITY & BASE SATURATIONS\n"
          . "4. MAJOR ELEMENTS (Ca, Mg, K, Na, P — ppm and saturation %)\n"
          . "5. TRACE ELEMENTS (S, B, Mn, Cu, Zn, Fe, Al, Si, Co, Mo, Se)\n"
          . "6. ELEMENTAL RATIOS & INTERACTIONS (Ca:Mg, C:N, K:Mg antagonisms)\n"
          . "7. OVERALL SOIL BALANCE ASSESSMENT\n"
          . "For each element marked [DEFICIENT] or [EXCESS], explain the agronomic significance "
          . "and interactions with other elements. Reference the Albrecht literature where relevant.",
      'foliar' =>
          "\n\nTASK: Design a foliar nutrition program to address the deficiencies shown. "
          . "Format the program as a table or numbered list with: "
          . "Growth Stage | Product Type | Active Element | Rate (L or kg/ha) | Timing/Frequency. "
          . "Prioritise elements marked [DEFICIENT]. "
          . "Note any antagonisms (e.g. Ca/Mg competition, Zn/P interaction, K/Mg lockout). "
          . "Keep product recommendations generic (e.g. 'chelated zinc', 'calcium nitrate') "
          . "rather than brand names. "
          . "Add a note on carrier water pH and adjuvant recommendations.",
      'microbial' =>
          "\n\nTASK: Design a biological/microbial soil improvement program. "
          . "Consider the organic matter level, C:N ratio, pH, and base saturation balance shown. "
          . "Structure your response:\n"
          . "1. CURRENT BIOLOGY ASSESSMENT (based on OM, C:N, pH)\n"
          . "2. RECOMMENDED INOCULANTS (e.g. mycorrhizae, rhizobia, EM, compost tea)\n"
          . "3. CARBON FEEDING STRATEGY (humates, fish hydrolysate, molasses, cover crops)\n"
          . "4. TIMING & INTEGRATION with the soil balancing program\n"
          . "Reference Albrecht's work on the relationship between mineral balance and soil biology.",
  ];
  $prompt = $systemInstruction . "\n\n" . $soilData . $knowledgeContext . $tasks[$section];

  // ── Call Ollama ───────────────────────────────────────────────────────────────
  // Send a JSON body with the given HTTP status and stop. Invalid UTF-8 is
  // substituted rather than letting json_encode() fail and emit an empty body.
  $respond = static function (array $body, int $status = 200): void {
      http_response_code($status);
      echo json_encode($body, JSON_INVALID_UTF8_SUBSTITUTE);
      exit;
  };

  $payload = json_encode([
      'model' => OLLAMA_MODEL,
      'prompt' => $prompt,
      'stream' => false,
      'options' => [
          'temperature' => 0.3, // lower = more factual / less creative
          'num_predict' => 2048,
      ],
  ], JSON_INVALID_UTF8_SUBSTITUTE);
  if ($payload === false) {
      error_log('ollamaGenerate.php: could not encode Ollama payload: ' . json_last_error_msg());
      $respond(['success' => false, 'error' => 'Could not build Ollama request'], 500);
  }

  $ch = curl_init(OLLAMA_HOST . '/api/generate');
  curl_setopt_array($ch, [
      CURLOPT_POST => true,
      CURLOPT_POSTFIELDS => $payload,
      CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
      CURLOPT_RETURNTRANSFER => true,
      CURLOPT_TIMEOUT => OLLAMA_TIMEOUT,
      CURLOPT_CONNECTTIMEOUT => 5,
  ]);
  $response = curl_exec($ch);
  $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
  $curlErr = curl_error($ch);
  curl_close($ch);

  if ($curlErr || $response === false) {
      // The cURL message contains the internal host and port: keep it in the
      // server log and return only a generic message to the browser.
      error_log('ollamaGenerate.php: Ollama request failed: ' . ($curlErr ?: 'no response'));
      $respond(['success' => false, 'error' => 'Could not connect to Ollama'], 502);
  }

  if ($httpCode !== 200) {
      $respond(['success' => false, 'error' => 'Ollama returned HTTP ' . $httpCode], 502);
  }

  // A malformed body or a non-string "response" field counts as empty output.
  $ollamaData = json_decode($response, true);
  $generated = (is_array($ollamaData) && is_string($ollamaData['response'] ?? null))
      ? trim($ollamaData['response'])
      : '';
  if ($generated === '') {
      $respond(['success' => false, 'error' => 'Ollama returned an empty response'], 502);
  }

  $respond([
      'success' => true,
      'text' => $generated,
      'rag_chunks_used' => count($ragChunks),
  ]);
  // ── RAG retrieval ────────────────────────────────────────────────────────────
  /**
   * Embed a query string, then retrieve the top-K most similar knowledge chunks.
   *
   * Retrieval is best-effort and never throws on database errors:
   *  - an empty (or unreadable) knowledge_chunks table yields [];
   *  - vector similarity search is tried first when an embedding is available;
   *  - MySQL FULLTEXT search is the fallback when the embedding API is
   *    unavailable, when the vector search fails, or when it finds nothing
   *    (e.g. no chunk has a stored embedding yet).
   *
   * @param PDO    $pdo
   * @param string $queryText The soil data summary used as the retrieval query
   * @param string $section   Current section (used to build keyword fallback)
   * @param int    $topK      Maximum number of chunks to return; values < 1 yield []
   * @return array Array of row arrays (source, author, page, chunk_text, score)
   */
  function retrieveRelevantChunks(PDO $pdo, string $queryText, string $section, int $topK): array
  {
      if ($topK < 1) {
          return [];
      }

      // Check if we have any chunks at all before paying for an embedding call.
      try {
          $countStmt = $pdo->query('SELECT COUNT(*) FROM knowledge_chunks');
          if ($countStmt === false || (int)$countStmt->fetchColumn() === 0) {
              return []; // Knowledge base not yet populated (or not readable)
          }
      } catch (PDOException $e) {
          error_log('RAG chunk count failed: ' . $e->getMessage());
          return [];
      }

      // ── Try vector similarity search first ──────────────────────────────────
      $queryEmbedding = getQueryEmbedding($queryText);
      if ($queryEmbedding !== null) {
          try {
              $matches = vectorSearch($pdo, $queryEmbedding, $topK);
              if ($matches !== []) {
                  return $matches;
              }
              // No usable stored embeddings: fall through to FULLTEXT.
          } catch (PDOException $e) {
              error_log('RAG vector search failed: ' . $e->getMessage());
          }
      }

      // ── Fallback: MySQL FULLTEXT search ─────────────────────────────────────
      return fulltextSearch($pdo, $section, $topK);
  }
  /**
   * Embed text via Ollama. Tries the new /api/embed endpoint first and falls
   * back to the legacy /api/embeddings endpoint.
   *
   * Only the first 2000 bytes of $text are embedded. The cut is byte-wise and
   * may split a multi-byte UTF-8 character, so the request body is encoded with
   * JSON_INVALID_UTF8_IGNORE. Without that flag json_encode() returns false on
   * the dangling bytes and the request would be sent with an empty body.
   *
   * @param string $text Text to embed.
   * @return array|null Embedding vector (list of floats), or null when neither
   *                    endpoint returned a non-empty vector.
   */
  function getQueryEmbedding(string $text): ?array
  {
      $queryText = substr($text, 0, 2000);

      // [endpoint path, request field carrying the text, response extractor]
      $endpoints = [
          // New API (/api/embed, Ollama >= 0.1.26)
          ['/api/embed', 'input', static fn (array $data): mixed => $data['embeddings'][0] ?? null],
          // Legacy API (/api/embeddings)
          ['/api/embeddings', 'prompt', static fn (array $data): mixed => $data['embedding'] ?? null],
      ];

      foreach ($endpoints as [$path, $textField, $extractVector]) {
          $body = json_encode(
              ['model' => EMBED_MODEL, $textField => $queryText],
              JSON_INVALID_UTF8_IGNORE
          );
          if ($body === false) {
              return null; // Nothing sensible can be sent to either endpoint.
          }

          $ch = curl_init(OLLAMA_HOST . $path);
          if ($ch === false) {
              continue;
          }
          curl_setopt_array($ch, [
              CURLOPT_POST => true,
              CURLOPT_POSTFIELDS => $body,
              CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
              CURLOPT_RETURNTRANSFER => true,
              CURLOPT_TIMEOUT => 15,
              CURLOPT_CONNECTTIMEOUT => 3,
          ]);
          $resp = curl_exec($ch);
          $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
          curl_close($ch);

          if (!is_string($resp) || $resp === '' || $code !== 200) {
              continue;
          }

          $data = json_decode($resp, true);
          if (!is_array($data)) {
              continue;
          }

          $vector = $extractVector($data);
          if (is_array($vector) && count($vector) > 0) {
              return $vector;
          }
      }

      return null;
  }
  /**
   * Load all chunk embeddings from DB, compute cosine similarity, return top-K.
   * For corpora up to ~10k chunks this is fast enough in PHP.
   *
   * Rows are skipped when the stored embedding is NULL, empty, or does not
   * decode to a non-empty JSON array. Passing NULL to json_decode() is
   * deprecated since PHP 8.1, and an empty vector has no meaningful similarity.
   *
   * @param PDO   $pdo
   * @param array $queryVec Query embedding (list of floats).
   * @param int   $topK     Maximum number of rows to return.
   * @return array Rows (score, source, author, page, chunk_text) sorted by
   *               descending score; [] when no chunk has a usable embedding.
   */
  function vectorSearch(PDO $pdo, array $queryVec, int $topK): array
  {
      $stmt = $pdo->query(
          'SELECT id, source, author, page, chunk_text, embedding FROM knowledge_chunks'
      );
      if ($stmt === false) {
          return [];
      }

      $scored = [];
      while (($row = $stmt->fetch(PDO::FETCH_ASSOC)) !== false) {
          $storedEmbedding = $row['embedding'] ?? null;
          if (!is_string($storedEmbedding) || $storedEmbedding === '') {
              continue;
          }

          $chunkVec = json_decode($storedEmbedding, true);
          if (!is_array($chunkVec) || $chunkVec === []) {
              continue;
          }

          $scored[] = [
              'score' => cosineSimilarity($queryVec, $chunkVec),
              'source' => $row['source'],
              'author' => $row['author'],
              'page' => $row['page'],
              'chunk_text' => $row['chunk_text'],
          ];
      }

      // Sort descending by score, return top-K
      usort($scored, static fn (array $a, array $b): int => $b['score'] <=> $a['score']);

      return array_slice($scored, 0, max(0, $topK));
  }
  /**
   * MySQL FULLTEXT fallback when embeddings aren't available.
   *
   * Runs a natural-language MATCH ... AGAINST query over chunk_text using a
   * fixed keyword set per section. Database errors are logged and yield [].
   *
   * @param PDO    $pdo
   * @param string $section Section name; unknown sections use a generic keyword set.
   * @param int    $topK    Maximum number of rows to return.
   * @return array Rows (source, author, page, chunk_text, score), best match first.
   */
  function fulltextSearch(PDO $pdo, string $section, int $topK): array
  {
      // Section-specific keyword hints for the search
      $keywords = [
          'overview' => 'soil fertility mineral balance calcium magnesium',
          'ai_interpretation' => 'base saturation calcium magnesium potassium pH organic matter',
          'foliar' => 'foliar nutrition trace elements deficiency correction spray',
          'microbial' => 'soil biology microbial organic matter carbon nitrogen humus',
      ];
      $query = $keywords[$section] ?? 'soil fertility mineral nutrition';

      try {
          $stmt = $pdo->prepare(
              'SELECT source, author, page, chunk_text,
                      MATCH(chunk_text) AGAINST(? IN NATURAL LANGUAGE MODE) AS score
               FROM knowledge_chunks
               WHERE MATCH(chunk_text) AGAINST(? IN NATURAL LANGUAGE MODE)
               ORDER BY score DESC
               LIMIT ?'
          );
          if ($stmt === false) {
              return [];
          }

          $stmt->bindValue(1, $query, PDO::PARAM_STR);
          $stmt->bindValue(2, $query, PDO::PARAM_STR);
          // LIMIT must be bound as an integer. Values passed via execute([...])
          // are sent as strings, and with emulated prepares MySQL then sees
          // LIMIT '6', which is a syntax error.
          $stmt->bindValue(3, max(0, $topK), PDO::PARAM_INT);
          $stmt->execute();

          return $stmt->fetchAll(PDO::FETCH_ASSOC);
      } catch (PDOException $e) {
          error_log('RAG fulltext search failed: ' . $e->getMessage());
          return [];
      }
  }
  /**
   * Cosine similarity between two float vectors.
   *
   * Only the first min(count($a), count($b)) positions are compared, so the
   * vectors are expected to be lists of equal length. Returns 0.0 when either
   * compared prefix has zero magnitude (including empty input).
   *
   * @param array $a First vector (list indexed from 0).
   * @param array $b Second vector (list indexed from 0).
   */
  function cosineSimilarity(array $a, array $b): float
  {
      $shared = min(count($a), count($b));

      $dot = 0.0;
      $sumSqA = 0.0;
      $sumSqB = 0.0;
      for ($idx = 0; $idx < $shared; ++$idx) {
          $x = $a[$idx];
          $y = $b[$idx];
          $dot += $x * $y;
          $sumSqA += $x * $x;
          $sumSqB += $y * $y;
      }

      $magnitude = sqrt($sumSqA) * sqrt($sumSqB);
      if ($magnitude > 0) {
          return $dot / $magnitude;
      }

      return 0.0;
  }