soilImportController.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. <?php
  2. /**
  3. * controllers/soilImportController.php
  4. *
  5. * Handles XLS/XLSX/CSV upload from soil labs, parses the file with
  6. * PhpSpreadsheet, then uses the local Ollama LLM to map lab-specific
  7. * column headers to the soil_records database fields.
  8. *
  9. * POST /controllers/soilImportController.php
  10. * Accepts multipart/form-data with:
  11. * file — the uploaded spreadsheet
  12. * action — "parse" → return list of samples found in the file
  13. * "import" → return mapped field values for one sample
  14. * sample_idx — (import only) 0-based index of the sample to import
  15. */
  16. require_once __DIR__ . '/../config/database.php';
  17. require_once __DIR__ . '/../config/ai.php';
  18. require_once __DIR__ . '/../lib/auth.php';
  19. if (session_status() === PHP_SESSION_NONE) {
  20. session_start();
  21. }
  22. requireLogin();
  23. header('Content-Type: application/json');
  24. // ─── helpers ─────────────────────────────────────────────────────────────────
  25. function jsonError(string $message, int $code = 400): never
  26. {
  27. http_response_code($code);
  28. echo json_encode(['success' => false, 'error' => $message]);
  29. exit;
  30. }
  31. function jsonOk(array $data): never
  32. {
  33. echo json_encode(['success' => true, ...$data]);
  34. exit;
  35. }
  36. // ─── request validation ───────────────────────────────────────────────────────
  37. if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
  38. jsonError('Method not allowed', 405);
  39. }
  40. $action = $_POST['action'] ?? 'parse';
  41. if (!in_array($action, ['parse', 'import'], true)) {
  42. jsonError('Invalid action');
  43. }
  44. // ─── file handling ────────────────────────────────────────────────────────────
  45. if (empty($_FILES['file']) || $_FILES['file']['error'] !== UPLOAD_ERR_OK) {
  46. jsonError('No file uploaded or upload error: ' . ($_FILES['file']['error'] ?? 'missing'));
  47. }
  48. $uploadedFile = $_FILES['file'];
  49. $ext = strtolower(pathinfo($uploadedFile['name'], PATHINFO_EXTENSION));
  50. if (!in_array($ext, ['xls', 'xlsx', 'csv', 'ods'], true)) {
  51. jsonError('Unsupported file type. Please upload XLS, XLSX, CSV, or ODS.');
  52. }
  53. // ─── PhpSpreadsheet ───────────────────────────────────────────────────────────
  54. $autoloadPaths = [
  55. __DIR__ . '/../vendor/autoload.php',
  56. __DIR__ . '/../../vendor/autoload.php',
  57. ];
  58. $autoloaded = false;
  59. foreach ($autoloadPaths as $path) {
  60. if (file_exists($path)) {
  61. require_once $path;
  62. $autoloaded = true;
  63. break;
  64. }
  65. }
  66. if (!$autoloaded) {
  67. jsonError('PhpSpreadsheet not installed. Run: composer require phpoffice/phpspreadsheet', 500);
  68. }
  69. use PhpOffice\PhpSpreadsheet\IOFactory;
  70. use PhpOffice\PhpSpreadsheet\Spreadsheet;
  71. try {
  72. $spreadsheet = IOFactory::load($uploadedFile['tmp_name']);
  73. } catch (\Exception $e) {
  74. jsonError('Could not read file: ' . $e->getMessage());
  75. }
  76. $sheet = $spreadsheet->getActiveSheet();
  77. // Convert sheet to a 2-D array (1-indexed rows and cols → 0-indexed)
  78. $rawData = [];
  79. foreach ($sheet->getRowIterator() as $row) {
  80. $cells = [];
  81. foreach ($row->getCellIterator() as $cell) {
  82. $cells[] = trim((string) $cell->getFormattedValue());
  83. }
  84. // Strip trailing empty cells
  85. while ($cells && end($cells) === '') {
  86. array_pop($cells);
  87. }
  88. if ($cells) {
  89. $rawData[] = $cells;
  90. }
  91. }
  92. if (empty($rawData)) {
  93. jsonError('The spreadsheet appears to be empty.');
  94. }
  95. // ─── format detection ─────────────────────────────────────────────────────────
  96. //
  97. // Two layouts found in CSBP lab files:
  98. // ROW-BASED — Row 0 = column headers (LAB_NUMBER, TEXTURE, PH_CACL2 …)
  99. // Rows 1-N = one sample per row.
  100. // TRANSPOSED — Column 0 = row labels (EC 1:5, Total P % …)
  101. // Columns 1-N = one sample per column.
  102. //
  103. // Heuristic: if the first cell of row 0 looks like a short code/identifier
  104. // (< 20 chars, no spaces, all-caps or underscored) → ROW-BASED.
  105. // Otherwise → TRANSPOSED.
  106. function isRowBased(array $rawData): bool
  107. {
  108. $firstCell = $rawData[0][0] ?? '';
  109. // Short, code-like headers signal a row-based layout
  110. return strlen($firstCell) < 25
  111. && !str_contains($firstCell, ' ')
  112. && $firstCell !== '';
  113. }
  114. $transposed = !isRowBased($rawData);
  115. // ─── extract samples ──────────────────────────────────────────────────────────
  116. //
  117. // Returns an array of samples, each sample being an assoc array of
  118. // label → value.
  119. function extractSamplesRowBased(array $rawData): array
  120. {
  121. $headers = $rawData[0];
  122. $samples = [];
  123. for ($r = 1; $r < count($rawData); $r++) {
  124. $row = $rawData[$r];
  125. $sample = [];
  126. foreach ($headers as $c => $header) {
  127. if ($header === '') {
  128. continue;
  129. }
  130. $sample[$header] = $row[$c] ?? '';
  131. }
  132. if (array_filter($sample)) {
  133. $samples[] = $sample;
  134. }
  135. }
  136. return $samples;
  137. }
  138. function extractSamplesTransposed(array $rawData): array
  139. {
  140. // Column 0 = labels; columns 1-N = samples
  141. $labels = array_column($rawData, 0);
  142. $numSamples = max(array_map('count', $rawData)) - 1;
  143. $samples = [];
  144. for ($col = 1; $col <= $numSamples; $col++) {
  145. $sample = [];
  146. foreach ($rawData as $rowIdx => $row) {
  147. $label = $labels[$rowIdx] ?? '';
  148. $value = $row[$col] ?? '';
  149. if ($label !== '' && $value !== '') {
  150. $sample[$label] = $value;
  151. }
  152. }
  153. if (array_filter($sample)) {
  154. $samples[] = $sample;
  155. }
  156. }
  157. return $samples;
  158. }
  159. $samples = $transposed
  160. ? extractSamplesTransposed($rawData)
  161. : extractSamplesRowBased($rawData);
  162. if (empty($samples)) {
  163. jsonError('No samples found in the file.');
  164. }
  165. // ─── action: parse ────────────────────────────────────────────────────────────
  166. // Return a lightweight list of samples so the UI can let the user pick one.
  167. if ($action === 'parse') {
  168. $list = [];
  169. foreach ($samples as $idx => $sample) {
  170. // Try to find a meaningful display label
  171. $labId = $sample['LAB_NUMBER'] ?? $sample['Lab ID (Soil)'] ?? $sample['LAB_ID'] ?? "Sample " . ($idx + 1);
  172. $client = $sample['CLIENT NAME'] ?? $sample['Consultant'] ?? $sample['CUSTNO'] ?? '';
  173. $crop = $sample['CROP'] ?? $sample['Material (manure, sawdust, etc.)'] ?? '';
  174. $pad = $sample['PADDOCK'] ?? $sample['Field Name (Sample ID)'] ?? '';
  175. $list[] = [
  176. 'idx' => $idx,
  177. 'lab_id' => $labId,
  178. 'client' => $client,
  179. 'crop' => $crop,
  180. 'site' => $pad,
  181. ];
  182. }
  183. jsonOk(['samples' => $list, 'count' => count($samples)]);
  184. }
  185. // ─── action: import ───────────────────────────────────────────────────────────
  186. $sampleIdx = (int) ($_POST['sample_idx'] ?? 0);
  187. if ($sampleIdx < 0 || $sampleIdx >= count($samples)) {
  188. jsonError('Invalid sample index.');
  189. }
  190. $sampleData = $samples[$sampleIdx];
  191. // ─── Ollama field mapping ─────────────────────────────────────────────────────
  192. $labJson = json_encode($sampleData, JSON_UNESCAPED_UNICODE);
  193. $prompt = <<<EOT
  194. You are a soil laboratory data mapper. Your only job is to output a JSON object.
  195. Map the LAB DATA below to these TARGET FIELDS. Output ONLY the JSON object — no explanation, no markdown, no code fences.
  196. TARGET FIELDS:
  197. lab_no=Lab reference number/Lab ID
  198. sample_id=Sample identifier/paddock name/field name
  199. site_id=Site identifier/block/customer number
  200. date_sampled=Date sampled as YYYY-MM-DD
  201. texture=Soil texture description
  202. gravel=Gravel % (number only)
  203. colour=Soil colour
  204. ocarbon=Organic carbon % (number only)
  205. omatter=Organic matter % LOI (number only)
  206. ph_cacl2=pH in CaCl2 (number only)
  207. ph_h2o=pH in water (number only)
  208. ec=Electrical conductivity dS/m (number only)
  209. NO3_N=Nitrate-N mg/kg (number only)
  210. NH3_N=Ammonium-N mg/kg (number only)
  211. p_mehlick=Phosphorus Mehlich-3 mg/kg (number only)
  212. p_morgan=Phosphorus extractable mg/kg (number only)
  213. k_morgan=Potassium mg/kg (number only)
  214. ca_morgan=Calcium mg/kg (number only)
  215. mg_morgan=Magnesium mg/kg (number only)
  216. na_morgan=Sodium mg/kg (number only)
  217. s_morgan=Sulphur mg/kg (number only)
  218. b_cacl2=Boron CaCl2 mg/kg (number only)
  219. mn_dtpa=Manganese DTPA mg/kg (number only)
  220. zn_dtpa=Zinc DTPA mg/kg (number only)
  221. fe_dtpa=Iron DTPA mg/kg (number only)
  222. cu_dtpa=Copper DTPA mg/kg (number only)
  223. al=Aluminium mg/kg (number only)
  224. tec=Total Exchange Capacity (number only)
  225. cec=CEC meq/100g (number only)
  226. ca_mehlick3=Calcium Mehlich-3 meq/100g (number only)
  227. mg_mehlick3=Magnesium Mehlich-3 meq/100g (number only)
  228. k_mehlick3=Potassium Mehlich-3 meq/100g (number only)
  229. na_mehlick3=Sodium Mehlich-3 meq/100g (number only)
  230. al_mehlick3=Aluminium Mehlich-3 meq/100g (number only)
  231. LAB DATA: {$labJson}
  232. Rules: only use values present in the lab data. Strip units from numbers. Use null for unmapped fields. Output JSON only.
  233. EOT;
  234. $payload = json_encode([
  235. 'model' => OLLAMA_MODEL,
  236. 'prompt' => $prompt,
  237. 'stream' => false,
  238. 'options' => [
  239. 'temperature' => OLLAMA_TEMPERATURE,
  240. 'num_predict' => 1024,
  241. ],
  242. ]);
  243. $ch = curl_init(OLLAMA_HOST . '/api/generate');
  244. curl_setopt_array($ch, [
  245. CURLOPT_POST => true,
  246. CURLOPT_POSTFIELDS => $payload,
  247. CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
  248. CURLOPT_RETURNTRANSFER => true,
  249. CURLOPT_TIMEOUT => OLLAMA_TIMEOUT,
  250. CURLOPT_CONNECTTIMEOUT => 5,
  251. ]);
  252. $response = curl_exec($ch);
  253. $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
  254. $curlErr = curl_error($ch);
  255. curl_close($ch);
  256. if ($curlErr || $httpCode !== 200) {
  257. $mapped = staticFieldMap($sampleData);
  258. $warning = $curlErr ?: "Ollama HTTP {$httpCode}";
  259. jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'AI unavailable: ' . $warning]);
  260. }
  261. $ollamaData = json_decode($response, true);
  262. $rawText = trim($ollamaData['response'] ?? '');
  263. if ($rawText === '') {
  264. $mapped = staticFieldMap($sampleData);
  265. jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'Ollama returned empty response']);
  266. }
  267. // Strip any markdown code fences the model might wrap around the JSON
  268. $rawText = preg_replace('/^```(?:json)?\s*/i', '', $rawText);
  269. $rawText = preg_replace('/\s*```$/m', '', $rawText);
  270. // Extract the first JSON object if the model added commentary
  271. if (preg_match('/\{[\s\S]+\}/', $rawText, $m)) {
  272. $rawText = $m[0];
  273. }
  274. $mapped = json_decode($rawText, true);
  275. if (!is_array($mapped)) {
  276. $mapped = staticFieldMap($sampleData);
  277. jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'AI returned unparseable JSON']);
  278. }
  279. // Remove null/empty values
  280. $mapped = array_filter($mapped, fn($v) => $v !== null && $v !== '');
  281. jsonOk(['fields' => $mapped, 'method' => 'ai']);
  282. // ─── static fallback mapper ───────────────────────────────────────────────────
  283. // Simple keyword-based mapping used when the AI is unavailable.
  284. function staticFieldMap(array $data): array
  285. {
  286. $map = [
  287. // lab_no
  288. 'lab_no' => ['LAB_NUMBER', 'Lab ID (Soil)', 'LAB_ID'],
  289. // sample / site
  290. 'sample_id' => ['PADDOCK', 'Field Name (Sample ID)', 'PADDOCK_NAME'],
  291. 'site_id' => ['CUSTNO', 'Lab performing testing'],
  292. // physical
  293. 'texture' => ['TEXTURE'],
  294. 'gravel' => ['GRAVEL'],
  295. 'colour' => ['COLOUR', 'COLOR'],
  296. // chemical
  297. 'ocarbon' => ['ORGCARBON', 'Organic Carbon %', 'Total Organic Carbon %'],
  298. 'omatter' => ['Total Organic Matter (L.O.I) %', 'Organic Matter %'],
  299. 'ph_cacl2' => ['PH_CACL2', 'PH 1:5 (CaCl2)', 'pH CaCl2', 'ph_cacl2'],
  300. 'ph_h2o' => ['PH_H2O', 'pH 1:5 (H2O)', 'pH Water'],
  301. 'ec' => ['CONDUCTY', 'EC 1:5', 'EC'],
  302. // nutrients
  303. 'NO3_N' => ['NITRATE', 'Nitrate ppm', 'Nitrate-N', 'NO3_N'],
  304. 'NH3_N' => ['NAMMONIUM', 'Ammonium', 'NH4_N'],
  305. 'p_morgan' => ['PHOS', 'Total P %', 'Phosphorus'],
  306. 'k_morgan' => ['POTASSIUM', 'Total K %', 'Potassium'],
  307. 'ca_morgan' => ['EXC_CA', 'Total Ca %', 'Calcium'],
  308. 'mg_morgan' => ['EXC_MG', 'Total Mg %', 'Magnesium'],
  309. 'na_morgan' => ['EXC_NA', 'Total Na %', 'Sodium'],
  310. 's_morgan' => ['SULPHUR', 'Total S %', 'Sulphur'],
  311. // micronutrients
  312. 'b_cacl2' => ['BORON_HOT', 'Total B ppm', 'Boron'],
  313. 'mn_dtpa' => ['DTPA_MN', 'EDTA_MN', 'Total Mn ppm', 'Manganese'],
  314. 'zn_dtpa' => ['DTPA_ZN', 'EDTA_ZN', 'Total Zn ppm', 'Zinc'],
  315. 'fe_dtpa' => ['DTPA_FE', 'EDTA_FE', 'Total Fe ppm', 'Iron', 'IRON'],
  316. 'cu_dtpa' => ['DTPA_CU', 'EDTA_CU', 'Total Cu ppm', 'Copper'],
  317. 'al' => ['ALUM_CACL2', 'EXC_AL', 'Aluminium'],
  318. // base saturation
  319. 'cec' => ['CEC', 'COND', 'SAT_COND'],
  320. 'ca_mehlick3' => ['SAT_Ca', 'SAT_CA'],
  321. 'mg_mehlick3' => ['SAT_Mg', 'SAT_MG'],
  322. 'k_mehlick3' => ['SAT_K'],
  323. 'na_mehlick3' => ['SAT_Na', 'SAT_NA'],
  324. ];
  325. $result = [];
  326. foreach ($map as $dbField => $labKeys) {
  327. foreach ($labKeys as $labKey) {
  328. // Case-insensitive search
  329. foreach ($data as $k => $v) {
  330. if (strcasecmp($k, $labKey) === 0 && $v !== '') {
  331. $result[$dbField] = $v;
  332. break 2;
  333. }
  334. }
  335. }
  336. }
  337. return $result;
  338. }