soilImportController.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. <?php
  2. /**
  3. * controllers/soilImportController.php
  4. *
  5. * Handles XLS/XLSX/CSV upload from soil labs, parses the file with
  6. * PhpSpreadsheet, then uses the local Ollama LLM to map lab-specific
  7. * column headers to the soil_records database fields.
  8. *
  9. * POST /controllers/soilImportController.php
  10. * Accepts multipart/form-data with:
  11. * file — the uploaded spreadsheet
  12. * action — "parse" → return list of samples found in the file
  13. * "import" → return mapped field values for one sample
  14. * sample_idx — (import only) 0-based index of the sample to import
  15. */
  16. require_once __DIR__ . '/../config/database.php';
  17. require_once __DIR__ . '/../config/ai.php';
  18. require_once __DIR__ . '/../lib/auth.php';
  19. if (session_status() === PHP_SESSION_NONE) {
  20. session_start();
  21. }
  22. requireLogin();
  23. header('Content-Type: application/json');
  24. // ─── helpers ─────────────────────────────────────────────────────────────────
  25. function jsonError(string $message, int $code = 400): never
  26. {
  27. http_response_code($code);
  28. echo json_encode(['success' => false, 'error' => $message]);
  29. exit;
  30. }
  31. function jsonOk(array $data): never
  32. {
  33. echo json_encode(['success' => true, ...$data]);
  34. exit;
  35. }
  36. // ─── request validation ───────────────────────────────────────────────────────
  37. if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
  38. jsonError('Method not allowed', 405);
  39. }
  40. $action = $_POST['action'] ?? 'parse';
  41. if (!in_array($action, ['parse', 'import'], true)) {
  42. jsonError('Invalid action');
  43. }
  44. // ─── file handling ────────────────────────────────────────────────────────────
  45. if (empty($_FILES['file']) || $_FILES['file']['error'] !== UPLOAD_ERR_OK) {
  46. jsonError('No file uploaded or upload error: ' . ($_FILES['file']['error'] ?? 'missing'));
  47. }
  48. $uploadedFile = $_FILES['file'];
  49. $ext = strtolower(pathinfo($uploadedFile['name'], PATHINFO_EXTENSION));
  50. if (!in_array($ext, ['xls', 'xlsx', 'csv', 'ods'], true)) {
  51. jsonError('Unsupported file type. Please upload XLS, XLSX, CSV, or ODS.');
  52. }
  53. // ─── PhpSpreadsheet ───────────────────────────────────────────────────────────
  54. $autoloadPaths = [
  55. __DIR__ . '/../vendor/autoload.php',
  56. __DIR__ . '/../../vendor/autoload.php',
  57. ];
  58. $autoloaded = false;
  59. foreach ($autoloadPaths as $path) {
  60. if (file_exists($path)) {
  61. require_once $path;
  62. $autoloaded = true;
  63. break;
  64. }
  65. }
  66. if (!$autoloaded) {
  67. jsonError('PhpSpreadsheet not installed. Run: composer require phpoffice/phpspreadsheet', 500);
  68. }
  69. use PhpOffice\PhpSpreadsheet\IOFactory;
  70. use PhpOffice\PhpSpreadsheet\Spreadsheet;
  71. try {
  72. $spreadsheet = IOFactory::load($uploadedFile['tmp_name']);
  73. } catch (\Exception $e) {
  74. jsonError('Could not read file: ' . $e->getMessage());
  75. }
  76. $sheet = $spreadsheet->getActiveSheet();
  77. // Convert sheet to a 2-D array (1-indexed rows and cols → 0-indexed)
  78. $rawData = [];
  79. foreach ($sheet->getRowIterator() as $row) {
  80. $cells = [];
  81. foreach ($row->getCellIterator() as $cell) {
  82. $cells[] = trim((string) $cell->getFormattedValue());
  83. }
  84. // Strip trailing empty cells
  85. while ($cells && end($cells) === '') {
  86. array_pop($cells);
  87. }
  88. if ($cells) {
  89. $rawData[] = $cells;
  90. }
  91. }
  92. if (empty($rawData)) {
  93. jsonError('The spreadsheet appears to be empty.');
  94. }
  95. // ─── format detection ─────────────────────────────────────────────────────────
  96. //
  97. // Two layouts found in CSBP lab files:
  98. //
  99. // TRANSPOSED (lab card) — Column 0 = row labels ("EC 1:5", "Total P %", …)
  100. // Columns 1-N = one sample each.
  101. // Example: SOIL CONTROL XNS06189.xls
  102. //
  103. // ROW-BASED (report) — One row = column headers; subsequent rows = samples.
  104. // May have 1-3 title/subtitle rows above the headers.
  105. // Example: S-C Soil Tests 2006.xls, YOS06 42-48.xlsx
  106. //
  107. // Detection strategy:
  108. // 1. Score column-0 values for soil-chemistry label patterns (units, element
  109. // names, "1:5", "ppm", etc.). ≥2 matches → transposed.
  110. // 2. Otherwise scan the first 10 rows for a "header row" — the row that best
  111. // matches known CSBP column-code keywords. Everything above it is a title.
  112. /**
  113. * Returns ['transposed' => bool, 'headerRow' => int]
  114. */
  115. function detectFormat(array $rawData): array
  116. {
  117. // Phrases that appear in column-0 of a transposed lab card
  118. $transposedSignals = [
  119. '1:5', 'total p', 'total k', 'total ca', 'total mg', 'total na',
  120. 'total s', 'total n', 'total b', 'total zn', 'total mn', 'total fe',
  121. 'total cu', 'total cl', 'organic matter', 'organic carbon',
  122. 'lab id', 'lab performing', 'field name', 'nitrate ppm',
  123. 'ph 1:5', 'moisture %', 'consultant',
  124. ];
  125. // Phrases that appear in the header row of a row-based file
  126. $rowBasedSignals = [
  127. 'lab_number', 'custno', 'paddock', 'ph_cacl2', 'ph_h2o',
  128. 'dtpa_cu', 'dtpa_zn', 'dtpa_mn', 'dtpa_fe', 'conducty',
  129. 'orgcarbon', 'nitrate', 'nammonium', 'texture', 'gravel',
  130. 'exc_ca', 'exc_mg', 'exc_na', 'exc_k', 'alum_cacl2',
  131. 'boron_hot', 'client name', 'lab number', 'sat_ca', 'sat_mg',
  132. 'sat_k', 'sat_na', 'crop', 'sp%',
  133. ];
  134. // Step 1: score column-0 values for transposed signals
  135. $transposedScore = 0;
  136. foreach (array_slice($rawData, 0, 20) as $row) {
  137. $cell = strtolower($row[0] ?? '');
  138. if ($cell === '') {
  139. continue;
  140. }
  141. foreach ($transposedSignals as $signal) {
  142. if (str_contains($cell, $signal)) {
  143. $transposedScore++;
  144. break;
  145. }
  146. }
  147. }
  148. if ($transposedScore >= 2) {
  149. return ['transposed' => true, 'headerRow' => 0];
  150. }
  151. // Step 2: find the best header row in a row-based file
  152. $bestRow = 0;
  153. $bestScore = 0;
  154. for ($i = 0; $i < min(10, count($rawData)); $i++) {
  155. $score = 0;
  156. foreach ($rawData[$i] as $cell) {
  157. $cell = strtolower(trim($cell));
  158. if ($cell === '') {
  159. continue;
  160. }
  161. foreach ($rowBasedSignals as $signal) {
  162. if (str_contains($cell, $signal)) {
  163. $score++;
  164. break;
  165. }
  166. }
  167. }
  168. if ($score > $bestScore) {
  169. $bestScore = $score;
  170. $bestRow = $i;
  171. }
  172. }
  173. return ['transposed' => false, 'headerRow' => $bestRow];
  174. }
  175. // ─── extract samples ──────────────────────────────────────────────────────────
  176. function extractSamplesRowBased(array $rawData, int $headerRow): array
  177. {
  178. $headers = $rawData[$headerRow];
  179. $samples = [];
  180. for ($r = $headerRow + 1; $r < count($rawData); $r++) {
  181. $row = $rawData[$r];
  182. $sample = [];
  183. foreach ($headers as $c => $header) {
  184. if ($header === '') {
  185. continue;
  186. }
  187. $sample[$header] = $row[$c] ?? '';
  188. }
  189. if (array_filter($sample)) {
  190. $samples[] = $sample;
  191. }
  192. }
  193. return $samples;
  194. }
  195. function extractSamplesTransposed(array $rawData): array
  196. {
  197. // Column 0 = labels; columns 1-N = samples.
  198. // Count sample columns from the widest row.
  199. $labels = array_column($rawData, 0);
  200. $maxCols = max(array_map('count', $rawData));
  201. $samples = [];
  202. for ($col = 1; $col < $maxCols; $col++) {
  203. $sample = [];
  204. foreach ($rawData as $rowIdx => $row) {
  205. $label = trim($labels[$rowIdx] ?? '');
  206. $value = trim($row[$col] ?? '');
  207. if ($label !== '' && $value !== '') {
  208. $sample[$label] = $value;
  209. }
  210. }
  211. // Only keep columns that have at least a few populated cells
  212. if (count(array_filter($sample)) >= 3) {
  213. $samples[] = $sample;
  214. }
  215. }
  216. return $samples;
  217. }
  218. $fmt = detectFormat($rawData);
  219. $samples = $fmt['transposed']
  220. ? extractSamplesTransposed($rawData)
  221. : extractSamplesRowBased($rawData, $fmt['headerRow']);
  222. if (empty($samples)) {
  223. jsonError('No samples found in the file.');
  224. }
  225. // ─── action: parse ────────────────────────────────────────────────────────────
  226. // Return a lightweight list of samples so the UI can let the user pick one.
  227. if ($action === 'parse') {
  228. $list = [];
  229. foreach ($samples as $idx => $sample) {
  230. // Try to find a meaningful display label
  231. $labId = $sample['LAB_NUMBER'] ?? $sample['Lab ID (Soil)'] ?? $sample['LAB_ID'] ?? "Sample " . ($idx + 1);
  232. $client = $sample['CLIENT NAME'] ?? $sample['Consultant'] ?? $sample['CUSTNO'] ?? '';
  233. $crop = $sample['CROP'] ?? $sample['Material (manure, sawdust, etc.)'] ?? '';
  234. $pad = $sample['PADDOCK'] ?? $sample['Field Name (Sample ID)'] ?? '';
  235. $list[] = [
  236. 'idx' => $idx,
  237. 'lab_id' => $labId,
  238. 'client' => $client,
  239. 'crop' => $crop,
  240. 'site' => $pad,
  241. ];
  242. }
  243. jsonOk([
  244. 'samples' => $list,
  245. 'count' => count($samples),
  246. 'format' => $fmt['transposed'] ? 'transposed' : 'row-based',
  247. 'header_row' => $fmt['headerRow'] ?? 0,
  248. ]);
  249. }
  250. // ─── action: import ───────────────────────────────────────────────────────────
  251. $sampleIdx = (int) ($_POST['sample_idx'] ?? 0);
  252. if ($sampleIdx < 0 || $sampleIdx >= count($samples)) {
  253. jsonError('Invalid sample index.');
  254. }
  255. $sampleData = $samples[$sampleIdx];
  256. // ─── Ollama field mapping ─────────────────────────────────────────────────────
  257. $labJson = json_encode($sampleData, JSON_UNESCAPED_UNICODE);
  258. $prompt = <<<EOT
  259. You are a soil laboratory data mapper. Your only job is to output a JSON object.
  260. Map the LAB DATA below to these TARGET FIELDS. Output ONLY the JSON object — no explanation, no markdown, no code fences.
  261. TARGET FIELDS:
  262. lab_no=Lab reference number/Lab ID
  263. sample_id=Sample identifier/paddock name/field name
  264. site_id=Site identifier/block/customer number
  265. date_sampled=Date sampled as YYYY-MM-DD
  266. texture=Soil texture description
  267. gravel=Gravel % (number only)
  268. colour=Soil colour
  269. ocarbon=Organic carbon % (number only)
  270. omatter=Organic matter % LOI (number only)
  271. ph_cacl2=pH in CaCl2 (number only)
  272. ph_h2o=pH in water (number only)
  273. ec=Electrical conductivity dS/m (number only)
  274. NO3_N=Nitrate-N mg/kg (number only)
  275. NH3_N=Ammonium-N mg/kg (number only)
  276. p_mehlick=Phosphorus Mehlich-3 mg/kg (number only)
  277. p_morgan=Phosphorus extractable mg/kg (number only)
  278. k_morgan=Potassium mg/kg (number only)
  279. ca_morgan=Calcium mg/kg (number only)
  280. mg_morgan=Magnesium mg/kg (number only)
  281. na_morgan=Sodium mg/kg (number only)
  282. s_morgan=Sulphur mg/kg (number only)
  283. b_cacl2=Boron CaCl2 mg/kg (number only)
  284. mn_dtpa=Manganese DTPA mg/kg (number only)
  285. zn_dtpa=Zinc DTPA mg/kg (number only)
  286. fe_dtpa=Iron DTPA mg/kg (number only)
  287. cu_dtpa=Copper DTPA mg/kg (number only)
  288. al=Aluminium mg/kg (number only)
  289. tec=Total Exchange Capacity (number only)
  290. cec=CEC meq/100g (number only)
  291. ca_mehlick3=Calcium Mehlich-3 meq/100g (number only)
  292. mg_mehlick3=Magnesium Mehlich-3 meq/100g (number only)
  293. k_mehlick3=Potassium Mehlich-3 meq/100g (number only)
  294. na_mehlick3=Sodium Mehlich-3 meq/100g (number only)
  295. al_mehlick3=Aluminium Mehlich-3 meq/100g (number only)
  296. LAB DATA: {$labJson}
  297. Rules: only use values present in the lab data. Strip units from numbers. Use null for unmapped fields. Output JSON only.
  298. EOT;
  299. $payload = json_encode([
  300. 'model' => OLLAMA_MODEL,
  301. 'prompt' => $prompt,
  302. 'stream' => false,
  303. 'options' => [
  304. 'temperature' => OLLAMA_TEMPERATURE,
  305. 'num_predict' => 1024,
  306. ],
  307. ]);
  308. $ch = curl_init(OLLAMA_HOST . '/api/generate');
  309. curl_setopt_array($ch, [
  310. CURLOPT_POST => true,
  311. CURLOPT_POSTFIELDS => $payload,
  312. CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
  313. CURLOPT_RETURNTRANSFER => true,
  314. CURLOPT_TIMEOUT => OLLAMA_TIMEOUT,
  315. CURLOPT_CONNECTTIMEOUT => 5,
  316. ]);
  317. $response = curl_exec($ch);
  318. $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
  319. $curlErr = curl_error($ch);
  320. curl_close($ch);
  321. if ($curlErr || $httpCode !== 200) {
  322. $mapped = staticFieldMap($sampleData);
  323. $warning = $curlErr ?: "Ollama HTTP {$httpCode}";
  324. jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'AI unavailable: ' . $warning]);
  325. }
  326. $ollamaData = json_decode($response, true);
  327. $rawText = trim($ollamaData['response'] ?? '');
  328. if ($rawText === '') {
  329. $mapped = staticFieldMap($sampleData);
  330. jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'Ollama returned empty response']);
  331. }
  332. // Strip any markdown code fences the model might wrap around the JSON
  333. $rawText = preg_replace('/^```(?:json)?\s*/i', '', $rawText);
  334. $rawText = preg_replace('/\s*```$/m', '', $rawText);
  335. // Extract the first JSON object if the model added commentary
  336. if (preg_match('/\{[\s\S]+\}/', $rawText, $m)) {
  337. $rawText = $m[0];
  338. }
  339. $mapped = json_decode($rawText, true);
  340. if (!is_array($mapped)) {
  341. $mapped = staticFieldMap($sampleData);
  342. jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'AI returned unparseable JSON']);
  343. }
  344. // Remove null/empty values
  345. $mapped = array_filter($mapped, fn($v) => $v !== null && $v !== '');
  346. jsonOk(['fields' => $mapped, 'method' => 'ai']);
  347. // ─── static fallback mapper ───────────────────────────────────────────────────
  348. // Simple keyword-based mapping used when the AI is unavailable.
  349. function staticFieldMap(array $data): array
  350. {
  351. $map = [
  352. // lab_no
  353. 'lab_no' => ['LAB_NUMBER', 'Lab ID (Soil)', 'LAB_ID'],
  354. // sample / site
  355. 'sample_id' => ['PADDOCK', 'Field Name (Sample ID)', 'PADDOCK_NAME'],
  356. 'site_id' => ['CUSTNO', 'Lab performing testing'],
  357. // physical
  358. 'texture' => ['TEXTURE'],
  359. 'gravel' => ['GRAVEL'],
  360. 'colour' => ['COLOUR', 'COLOR'],
  361. // chemical
  362. 'ocarbon' => ['ORGCARBON', 'Organic Carbon %', 'Total Organic Carbon %'],
  363. 'omatter' => ['Total Organic Matter (L.O.I) %', 'Organic Matter %'],
  364. 'ph_cacl2' => ['PH_CACL2', 'PH 1:5 (CaCl2)', 'pH CaCl2', 'ph_cacl2'],
  365. 'ph_h2o' => ['PH_H2O', 'pH 1:5 (H2O)', 'pH Water'],
  366. 'ec' => ['CONDUCTY', 'EC 1:5', 'EC'],
  367. // nutrients
  368. 'NO3_N' => ['NITRATE', 'Nitrate ppm', 'Nitrate-N', 'NO3_N'],
  369. 'NH3_N' => ['NAMMONIUM', 'Ammonium', 'NH4_N'],
  370. 'p_morgan' => ['PHOS', 'Total P %', 'Phosphorus'],
  371. 'k_morgan' => ['POTASSIUM', 'Total K %', 'Potassium'],
  372. 'ca_morgan' => ['EXC_CA', 'Total Ca %', 'Calcium'],
  373. 'mg_morgan' => ['EXC_MG', 'Total Mg %', 'Magnesium'],
  374. 'na_morgan' => ['EXC_NA', 'Total Na %', 'Sodium'],
  375. 's_morgan' => ['SULPHUR', 'Total S %', 'Sulphur'],
  376. // micronutrients
  377. 'b_cacl2' => ['BORON_HOT', 'Total B ppm', 'Boron'],
  378. 'mn_dtpa' => ['DTPA_MN', 'EDTA_MN', 'Total Mn ppm', 'Manganese'],
  379. 'zn_dtpa' => ['DTPA_ZN', 'EDTA_ZN', 'Total Zn ppm', 'Zinc'],
  380. 'fe_dtpa' => ['DTPA_FE', 'EDTA_FE', 'Total Fe ppm', 'Iron', 'IRON'],
  381. 'cu_dtpa' => ['DTPA_CU', 'EDTA_CU', 'Total Cu ppm', 'Copper'],
  382. 'al' => ['ALUM_CACL2', 'EXC_AL', 'Aluminium'],
  383. // base saturation
  384. 'cec' => ['CEC', 'COND', 'SAT_COND'],
  385. 'ca_mehlick3' => ['SAT_Ca', 'SAT_CA'],
  386. 'mg_mehlick3' => ['SAT_Mg', 'SAT_MG'],
  387. 'k_mehlick3' => ['SAT_K'],
  388. 'na_mehlick3' => ['SAT_Na', 'SAT_NA'],
  389. ];
  390. $result = [];
  391. foreach ($map as $dbField => $labKeys) {
  392. foreach ($labKeys as $labKey) {
  393. // Case-insensitive search
  394. foreach ($data as $k => $v) {
  395. if (strcasecmp($k, $labKey) === 0 && $v !== '') {
  396. $result[$dbField] = $v;
  397. break 2;
  398. }
  399. }
  400. }
  401. }
  402. return $result;
  403. }