| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464 |
- <?php
- /**
- * controllers/soilImportController.php
- *
- * Handles XLS/XLSX/CSV upload from soil labs, parses the file with
- * PhpSpreadsheet, then uses the local Ollama LLM to map lab-specific
- * column headers to the soil_records database fields.
- *
- * POST /controllers/soilImportController.php
- * Accepts multipart/form-data with:
- * file — the uploaded spreadsheet
- * action — "parse" → return list of samples found in the file
- * "import" → return mapped field values for one sample
- * sample_idx — (import only) 0-based index of the sample to import
- */
- require_once __DIR__ . '/../config/database.php';
- require_once __DIR__ . '/../config/ai.php';
- require_once __DIR__ . '/../lib/auth.php';
- if (session_status() === PHP_SESSION_NONE) {
- session_start();
- }
- requireLogin();
- header('Content-Type: application/json');
- // ─── helpers ─────────────────────────────────────────────────────────────────
- function jsonError(string $message, int $code = 400): never
- {
- http_response_code($code);
- echo json_encode(['success' => false, 'error' => $message]);
- exit;
- }
- function jsonOk(array $data): never
- {
- echo json_encode(['success' => true, ...$data]);
- exit;
- }
- // ─── request validation ───────────────────────────────────────────────────────
- if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
- jsonError('Method not allowed', 405);
- }
- $action = $_POST['action'] ?? 'parse';
- if (!in_array($action, ['parse', 'import'], true)) {
- jsonError('Invalid action');
- }
- // ─── file handling ────────────────────────────────────────────────────────────
- if (empty($_FILES['file']) || $_FILES['file']['error'] !== UPLOAD_ERR_OK) {
- jsonError('No file uploaded or upload error: ' . ($_FILES['file']['error'] ?? 'missing'));
- }
- $uploadedFile = $_FILES['file'];
- $ext = strtolower(pathinfo($uploadedFile['name'], PATHINFO_EXTENSION));
- if (!in_array($ext, ['xls', 'xlsx', 'csv', 'ods'], true)) {
- jsonError('Unsupported file type. Please upload XLS, XLSX, CSV, or ODS.');
- }
- // ─── PhpSpreadsheet ───────────────────────────────────────────────────────────
- $autoloadPaths = [
- __DIR__ . '/../vendor/autoload.php',
- __DIR__ . '/../../vendor/autoload.php',
- ];
- $autoloaded = false;
- foreach ($autoloadPaths as $path) {
- if (file_exists($path)) {
- require_once $path;
- $autoloaded = true;
- break;
- }
- }
- if (!$autoloaded) {
- jsonError('PhpSpreadsheet not installed. Run: composer require phpoffice/phpspreadsheet', 500);
- }
- use PhpOffice\PhpSpreadsheet\IOFactory;
- use PhpOffice\PhpSpreadsheet\Spreadsheet;
- try {
- $spreadsheet = IOFactory::load($uploadedFile['tmp_name']);
- } catch (\Exception $e) {
- jsonError('Could not read file: ' . $e->getMessage());
- }
- $sheet = $spreadsheet->getActiveSheet();
- // Convert sheet to a 2-D array (1-indexed rows and cols → 0-indexed)
- $rawData = [];
- foreach ($sheet->getRowIterator() as $row) {
- $cells = [];
- foreach ($row->getCellIterator() as $cell) {
- $cells[] = trim((string) $cell->getFormattedValue());
- }
- // Strip trailing empty cells
- while ($cells && end($cells) === '') {
- array_pop($cells);
- }
- if ($cells) {
- $rawData[] = $cells;
- }
- }
- if (empty($rawData)) {
- jsonError('The spreadsheet appears to be empty.');
- }
- // ─── format detection ─────────────────────────────────────────────────────────
- //
- // Two layouts found in CSBP lab files:
- //
- // TRANSPOSED (lab card) — Column 0 = row labels ("EC 1:5", "Total P %", …)
- // Columns 1-N = one sample each.
- // Example: SOIL CONTROL XNS06189.xls
- //
- // ROW-BASED (report) — One row = column headers; subsequent rows = samples.
- // May have 1-3 title/subtitle rows above the headers.
- // Example: S-C Soil Tests 2006.xls, YOS06 42-48.xlsx
- //
- // Detection strategy:
- // 1. Score column-0 values for soil-chemistry label patterns (units, element
- // names, "1:5", "ppm", etc.). ≥2 matches → transposed.
- // 2. Otherwise scan the first 10 rows for a "header row" — the row that best
- // matches known CSBP column-code keywords. Everything above it is a title.
- /**
- * Returns ['transposed' => bool, 'headerRow' => int]
- */
- function detectFormat(array $rawData): array
- {
- // Phrases that appear in column-0 of a transposed lab card
- $transposedSignals = [
- '1:5', 'total p', 'total k', 'total ca', 'total mg', 'total na',
- 'total s', 'total n', 'total b', 'total zn', 'total mn', 'total fe',
- 'total cu', 'total cl', 'organic matter', 'organic carbon',
- 'lab id', 'lab performing', 'field name', 'nitrate ppm',
- 'ph 1:5', 'moisture %', 'consultant',
- ];
- // Phrases that appear in the header row of a row-based file
- $rowBasedSignals = [
- 'lab_number', 'custno', 'paddock', 'ph_cacl2', 'ph_h2o',
- 'dtpa_cu', 'dtpa_zn', 'dtpa_mn', 'dtpa_fe', 'conducty',
- 'orgcarbon', 'nitrate', 'nammonium', 'texture', 'gravel',
- 'exc_ca', 'exc_mg', 'exc_na', 'exc_k', 'alum_cacl2',
- 'boron_hot', 'client name', 'lab number', 'sat_ca', 'sat_mg',
- 'sat_k', 'sat_na', 'crop', 'sp%',
- ];
- // Step 1: score column-0 values for transposed signals
- $transposedScore = 0;
- foreach (array_slice($rawData, 0, 20) as $row) {
- $cell = strtolower($row[0] ?? '');
- if ($cell === '') {
- continue;
- }
- foreach ($transposedSignals as $signal) {
- if (str_contains($cell, $signal)) {
- $transposedScore++;
- break;
- }
- }
- }
- if ($transposedScore >= 2) {
- return ['transposed' => true, 'headerRow' => 0];
- }
- // Step 2: find the best header row in a row-based file
- $bestRow = 0;
- $bestScore = 0;
- for ($i = 0; $i < min(10, count($rawData)); $i++) {
- $score = 0;
- foreach ($rawData[$i] as $cell) {
- $cell = strtolower(trim($cell));
- if ($cell === '') {
- continue;
- }
- foreach ($rowBasedSignals as $signal) {
- if (str_contains($cell, $signal)) {
- $score++;
- break;
- }
- }
- }
- if ($score > $bestScore) {
- $bestScore = $score;
- $bestRow = $i;
- }
- }
- return ['transposed' => false, 'headerRow' => $bestRow];
- }
- // ─── extract samples ──────────────────────────────────────────────────────────
- function extractSamplesRowBased(array $rawData, int $headerRow): array
- {
- $headers = $rawData[$headerRow];
- $samples = [];
- for ($r = $headerRow + 1; $r < count($rawData); $r++) {
- $row = $rawData[$r];
- $sample = [];
- foreach ($headers as $c => $header) {
- if ($header === '') {
- continue;
- }
- $sample[$header] = $row[$c] ?? '';
- }
- if (array_filter($sample)) {
- $samples[] = $sample;
- }
- }
- return $samples;
- }
- function extractSamplesTransposed(array $rawData): array
- {
- // Column 0 = labels; columns 1-N = samples.
- // Count sample columns from the widest row.
- $labels = array_column($rawData, 0);
- $maxCols = max(array_map('count', $rawData));
- $samples = [];
- for ($col = 1; $col < $maxCols; $col++) {
- $sample = [];
- foreach ($rawData as $rowIdx => $row) {
- $label = trim($labels[$rowIdx] ?? '');
- $value = trim($row[$col] ?? '');
- if ($label !== '' && $value !== '') {
- $sample[$label] = $value;
- }
- }
- // Only keep columns that have at least a few populated cells
- if (count(array_filter($sample)) >= 3) {
- $samples[] = $sample;
- }
- }
- return $samples;
- }
- $fmt = detectFormat($rawData);
- $samples = $fmt['transposed']
- ? extractSamplesTransposed($rawData)
- : extractSamplesRowBased($rawData, $fmt['headerRow']);
- if (empty($samples)) {
- jsonError('No samples found in the file.');
- }
- // ─── action: parse ────────────────────────────────────────────────────────────
- // Return a lightweight list of samples so the UI can let the user pick one.
- if ($action === 'parse') {
- $list = [];
- foreach ($samples as $idx => $sample) {
- // Try to find a meaningful display label
- $labId = $sample['LAB_NUMBER'] ?? $sample['Lab ID (Soil)'] ?? $sample['LAB_ID'] ?? "Sample " . ($idx + 1);
- $client = $sample['CLIENT NAME'] ?? $sample['Consultant'] ?? $sample['CUSTNO'] ?? '';
- $crop = $sample['CROP'] ?? $sample['Material (manure, sawdust, etc.)'] ?? '';
- $pad = $sample['PADDOCK'] ?? $sample['Field Name (Sample ID)'] ?? '';
- $list[] = [
- 'idx' => $idx,
- 'lab_id' => $labId,
- 'client' => $client,
- 'crop' => $crop,
- 'site' => $pad,
- ];
- }
- jsonOk([
- 'samples' => $list,
- 'count' => count($samples),
- 'format' => $fmt['transposed'] ? 'transposed' : 'row-based',
- 'header_row' => $fmt['headerRow'] ?? 0,
- ]);
- }
- // ─── action: import ───────────────────────────────────────────────────────────
- $sampleIdx = (int) ($_POST['sample_idx'] ?? 0);
- if ($sampleIdx < 0 || $sampleIdx >= count($samples)) {
- jsonError('Invalid sample index.');
- }
- $sampleData = $samples[$sampleIdx];
- // ─── Ollama field mapping ─────────────────────────────────────────────────────
- $labJson = json_encode($sampleData, JSON_UNESCAPED_UNICODE);
- $prompt = <<<EOT
- You are a soil laboratory data mapper. Your only job is to output a JSON object.
- Map the LAB DATA below to these TARGET FIELDS. Output ONLY the JSON object — no explanation, no markdown, no code fences.
- TARGET FIELDS:
- lab_no=Lab reference number/Lab ID
- sample_id=Sample identifier/paddock name/field name
- site_id=Site identifier/block/customer number
- date_sampled=Date sampled as YYYY-MM-DD
- texture=Soil texture description
- gravel=Gravel % (number only)
- colour=Soil colour
- ocarbon=Organic carbon % (number only)
- omatter=Organic matter % LOI (number only)
- ph_cacl2=pH in CaCl2 (number only)
- ph_h2o=pH in water (number only)
- ec=Electrical conductivity dS/m (number only)
- NO3_N=Nitrate-N mg/kg (number only)
- NH3_N=Ammonium-N mg/kg (number only)
- p_mehlick=Phosphorus Mehlich-3 mg/kg (number only)
- p_morgan=Phosphorus extractable mg/kg (number only)
- k_morgan=Potassium mg/kg (number only)
- ca_morgan=Calcium mg/kg (number only)
- mg_morgan=Magnesium mg/kg (number only)
- na_morgan=Sodium mg/kg (number only)
- s_morgan=Sulphur mg/kg (number only)
- b_cacl2=Boron CaCl2 mg/kg (number only)
- mn_dtpa=Manganese DTPA mg/kg (number only)
- zn_dtpa=Zinc DTPA mg/kg (number only)
- fe_dtpa=Iron DTPA mg/kg (number only)
- cu_dtpa=Copper DTPA mg/kg (number only)
- al=Aluminium mg/kg (number only)
- tec=Total Exchange Capacity (number only)
- cec=CEC meq/100g (number only)
- ca_mehlick3=Calcium Mehlich-3 meq/100g (number only)
- mg_mehlick3=Magnesium Mehlich-3 meq/100g (number only)
- k_mehlick3=Potassium Mehlich-3 meq/100g (number only)
- na_mehlick3=Sodium Mehlich-3 meq/100g (number only)
- al_mehlick3=Aluminium Mehlich-3 meq/100g (number only)
- LAB DATA: {$labJson}
- Rules: only use values present in the lab data. Strip units from numbers. Use null for unmapped fields. Output JSON only.
- EOT;
- $payload = json_encode([
- 'model' => OLLAMA_MODEL,
- 'prompt' => $prompt,
- 'stream' => false,
- 'options' => [
- 'temperature' => OLLAMA_TEMPERATURE,
- 'num_predict' => 1024,
- ],
- ]);
- $ch = curl_init(OLLAMA_HOST . '/api/generate');
- curl_setopt_array($ch, [
- CURLOPT_POST => true,
- CURLOPT_POSTFIELDS => $payload,
- CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
- CURLOPT_RETURNTRANSFER => true,
- CURLOPT_TIMEOUT => OLLAMA_TIMEOUT,
- CURLOPT_CONNECTTIMEOUT => 5,
- ]);
- $response = curl_exec($ch);
- $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
- $curlErr = curl_error($ch);
- curl_close($ch);
- if ($curlErr || $httpCode !== 200) {
- $mapped = staticFieldMap($sampleData);
- $warning = $curlErr ?: "Ollama HTTP {$httpCode}";
- jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'AI unavailable: ' . $warning]);
- }
- $ollamaData = json_decode($response, true);
- $rawText = trim($ollamaData['response'] ?? '');
- if ($rawText === '') {
- $mapped = staticFieldMap($sampleData);
- jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'Ollama returned empty response']);
- }
- // Strip any markdown code fences the model might wrap around the JSON
- $rawText = preg_replace('/^```(?:json)?\s*/i', '', $rawText);
- $rawText = preg_replace('/\s*```$/m', '', $rawText);
- // Extract the first JSON object if the model added commentary
- if (preg_match('/\{[\s\S]+\}/', $rawText, $m)) {
- $rawText = $m[0];
- }
- $mapped = json_decode($rawText, true);
- if (!is_array($mapped)) {
- $mapped = staticFieldMap($sampleData);
- jsonOk(['fields' => $mapped, 'method' => 'static', 'warning' => 'AI returned unparseable JSON']);
- }
- // Remove null/empty values
- $mapped = array_filter($mapped, fn($v) => $v !== null && $v !== '');
- jsonOk(['fields' => $mapped, 'method' => 'ai']);
- // ─── static fallback mapper ───────────────────────────────────────────────────
- // Simple keyword-based mapping used when the AI is unavailable.
- function staticFieldMap(array $data): array
- {
- $map = [
- // lab_no
- 'lab_no' => ['LAB_NUMBER', 'Lab ID (Soil)', 'LAB_ID'],
- // sample / site
- 'sample_id' => ['PADDOCK', 'Field Name (Sample ID)', 'PADDOCK_NAME'],
- 'site_id' => ['CUSTNO', 'Lab performing testing'],
- // physical
- 'texture' => ['TEXTURE'],
- 'gravel' => ['GRAVEL'],
- 'colour' => ['COLOUR', 'COLOR'],
- // chemical
- 'ocarbon' => ['ORGCARBON', 'Organic Carbon %', 'Total Organic Carbon %'],
- 'omatter' => ['Total Organic Matter (L.O.I) %', 'Organic Matter %'],
- 'ph_cacl2' => ['PH_CACL2', 'PH 1:5 (CaCl2)', 'pH CaCl2', 'ph_cacl2'],
- 'ph_h2o' => ['PH_H2O', 'pH 1:5 (H2O)', 'pH Water'],
- 'ec' => ['CONDUCTY', 'EC 1:5', 'EC'],
- // nutrients
- 'NO3_N' => ['NITRATE', 'Nitrate ppm', 'Nitrate-N', 'NO3_N'],
- 'NH3_N' => ['NAMMONIUM', 'Ammonium', 'NH4_N'],
- 'p_morgan' => ['PHOS', 'Total P %', 'Phosphorus'],
- 'k_morgan' => ['POTASSIUM', 'Total K %', 'Potassium'],
- 'ca_morgan' => ['EXC_CA', 'Total Ca %', 'Calcium'],
- 'mg_morgan' => ['EXC_MG', 'Total Mg %', 'Magnesium'],
- 'na_morgan' => ['EXC_NA', 'Total Na %', 'Sodium'],
- 's_morgan' => ['SULPHUR', 'Total S %', 'Sulphur'],
- // micronutrients
- 'b_cacl2' => ['BORON_HOT', 'Total B ppm', 'Boron'],
- 'mn_dtpa' => ['DTPA_MN', 'EDTA_MN', 'Total Mn ppm', 'Manganese'],
- 'zn_dtpa' => ['DTPA_ZN', 'EDTA_ZN', 'Total Zn ppm', 'Zinc'],
- 'fe_dtpa' => ['DTPA_FE', 'EDTA_FE', 'Total Fe ppm', 'Iron', 'IRON'],
- 'cu_dtpa' => ['DTPA_CU', 'EDTA_CU', 'Total Cu ppm', 'Copper'],
- 'al' => ['ALUM_CACL2', 'EXC_AL', 'Aluminium'],
- // base saturation
- 'cec' => ['CEC', 'COND', 'SAT_COND'],
- 'ca_mehlick3' => ['SAT_Ca', 'SAT_CA'],
- 'mg_mehlick3' => ['SAT_Mg', 'SAT_MG'],
- 'k_mehlick3' => ['SAT_K'],
- 'na_mehlick3' => ['SAT_Na', 'SAT_NA'],
- ];
- $result = [];
- foreach ($map as $dbField => $labKeys) {
- foreach ($labKeys as $labKey) {
- // Case-insensitive search
- foreach ($data as $k => $v) {
- if (strcasecmp($k, $labKey) === 0 && $v !== '') {
- $result[$dbField] = $v;
- break 2;
- }
- }
- }
- }
- return $result;
- }
|