benjamin.harris
/
tas_councils


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
							<?php
/**
 * lib/llm.php
 *
 * Unified LLM inference helper.
 * Primary:  llama.cpp server (LLAMACPP_HOST) — /v1/chat/completions + /v1/embeddings
 * Fallback: Ollama            (OLLAMA_HOST)   — /api/generate + /api/embed (legacy /api/embeddings as a last resort)
 *
 * Primary backend is llama-swap, which proxies multiple llama.cpp instances
 * and routes by model name via the OpenAI-compatible /v1 API.
 *
 * Public API:
 *   llmGenerate(string $prompt, array $options = []): string
 *   llmEmbed(string $text): ?array
 *
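 * $options keys (all optional):
 *   temperature    float  default LLAMACPP_TEMPERATURE (0.3 on the Ollama fallback)
 *   num_predict    int    default 2048  (sent to llama-swap as max_tokens)
 *   num_ctx        int    default 6144  (Ollama only — not sent to llama-swap)
 *   repeat_penalty float  default 1.1   (sent as repeat_penalty to both backends)
 *   top_p                 default LLAMACPP_TOP_P  (llama-swap only)
 *   top_k                 default LLAMACPP_TOP_K  (llama-swap only)
 *   stop                  default []              (llama-swap only)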
 */

require_once __DIR__ . '/../config/ai.php';

// ── Public functions ──────────────────────────────────────────────────────────

/**
 * Produce a completion for a prompt.
 *
 * The llama.cpp backend is asked first. Whenever it yields nothing (it
 * returns null), the fallback is logged and the Ollama backend is asked
 * instead.
 *
 * @param string $prompt  Prompt text handed unchanged to each backend.
 * @param array  $options Generation options handed unchanged to each backend.
 * @return string Text returned by the first backend that produced any.
 *
 * @throws RuntimeException when neither backend returns text
 */
function llmGenerate(string $prompt, array $options = []): string
{
    $primary = _llamacppGenerate($prompt, $options);

    if ($primary === null) {
        error_log('[llm] llama.cpp unavailable — falling back to Ollama');

        $fallback = _ollamaGenerate($prompt, $options);
        if ($fallback === null) {
            throw new RuntimeException('All LLM backends unavailable');
        }

        return $fallback;
    }

    return $primary;
}

/**
 * Embed text into a float vector.
 *
 * The input is capped at 2000 bytes before it is sent. The cut is moved back
 * to a UTF-8 character boundary: a plain byte-wise substr() can split a
 * multibyte character, and json_encode() rejects malformed UTF-8, which would
 * make the request to every backend fail for long non-ASCII input.
 *
 * The llama-swap backend is tried first, but only when LLAMACPP_EMBED_MODEL
 * is non-empty; otherwise (or when it returns null) the Ollama backend is used.
 *
 * @param string $text Text to embed.
 * @return array|null The embedding vector, or null when every backend failed.
 */
function llmEmbed(string $text): ?array
{
    $maxBytes = 2000;

    if (strlen($text) > $maxBytes) {
        $clipped = substr($text, 0, $maxBytes);

        // Default to the plain byte cut; it is kept when the input was not
        // valid UTF-8 in the first place and no candidate below validates.
        $text = $clipped;

        // A UTF-8 character is at most 4 bytes long, so a split character
        // leaves at most 3 dangling bytes at the end of the cut.
        for ($drop = 0; $drop <= 3; $drop++) {
            $candidate = $drop === 0 ? $clipped : substr($clipped, 0, -$drop);

            // With the /u modifier preg_match() only succeeds on well-formed UTF-8.
            if (preg_match('//u', $candidate) === 1) {
                $text = $candidate;
                break;
            }
        }
    }

    // Only try llama-swap for embeddings if an embed model is configured
    if (LLAMACPP_EMBED_MODEL !== '') {
        $emb = _llamacppEmbed($text);
        if ($emb !== null) {
            return $emb;
        }
        error_log('[llm] llama-swap embed unavailable — falling back to Ollama');
    }

    return _ollamaEmbed($text);
}

// ── llama.cpp backend ─────────────────────────────────────────────────────────

/**
 * Generate text through the OpenAI-compatible chat completions endpoint at
 * LLAMACPP_HOST . '/v1/chat/completions'.
 *
 * The prompt is sent as a single "user" message with streaming disabled.
 *
 * @param string $prompt  Prompt text.
 * @param array  $options Optional overrides: num_predict (sent as max_tokens),
 *                        temperature, top_p, top_k, repeat_penalty, stop.
 * @return string|null Trimmed completion text, or null when the request could
 *                     not be encoded, the transfer failed, the status was not
 *                     200, or the response held no non-empty text. Every null
 *                     path is logged.
 */
function _llamacppGenerate(string $prompt, array $options): ?string
{
    // llama-swap uses the OpenAI chat completions endpoint, routed by model name
    $payload = json_encode([
        'model'       => LLAMACPP_MODEL,
        'messages'    => [['role' => 'user', 'content' => $prompt]],
        'max_tokens'  => $options['num_predict']    ?? 2048,
        'temperature' => $options['temperature']    ?? LLAMACPP_TEMPERATURE,
        'top_p'       => $options['top_p']          ?? LLAMACPP_TOP_P,
        'top_k'       => $options['top_k']          ?? LLAMACPP_TOP_K,
        'repeat_penalty' => $options['repeat_penalty'] ?? 1.1,
        'stop'        => $options['stop']           ?? [],
        'stream'      => false,
    ]);

    // json_encode() returns false (e.g. on malformed UTF-8 in the prompt);
    // posting that would send an empty body and surface as a confusing HTTP error.
    if ($payload === false) {
        error_log('[llm] llama-swap generate: cannot encode request: ' . json_last_error_msg());
        return null;
    }

    $ch = curl_init(LLAMACPP_HOST . '/v1/chat/completions');
    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $payload,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => LLAMACPP_TIMEOUT,
        CURLOPT_CONNECTTIMEOUT => 3,
    ]);

    $resp = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err  = curl_error($ch);
    curl_close($ch);

    if ($err || $resp === false || $code !== 200) {
        error_log('[llm] llama-swap generate: ' . ($err ?: "HTTP $code"));
        return null;
    }

    $data    = json_decode($resp, true);
    $content = $data['choices'][0]['message']['content'] ?? null;

    // Only trim real strings: passing an array to trim() throws a TypeError on PHP 8.
    $text = is_string($content) ? trim($content) : '';
    if ($text === '') {
        error_log('[llm] llama-swap generate: HTTP 200 but no text content in response');
        return null;
    }

    return $text;
}

/**
 * Request an embedding from LLAMACPP_HOST . '/v1/embeddings' for the model
 * named by LLAMACPP_EMBED_MODEL.
 *
 * @param string $text Text to embed.
 * @return array|null The vector found at data[0].embedding, or null when the
 *                    transfer failed, the status was not 200 (both logged), or
 *                    the response held no non-empty vector.
 */
function _llamacppEmbed(string $text): ?array
{
    // The model name in the body is what llama-swap routes on.
    $body = json_encode(['model' => LLAMACPP_EMBED_MODEL, 'input' => $text]);

    $handle = curl_init(LLAMACPP_HOST . '/v1/embeddings');
    curl_setopt_array($handle, [
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $body,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 15,
        CURLOPT_CONNECTTIMEOUT => 3,
    ]);

    $raw       = curl_exec($handle);
    $status    = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    $curlError = curl_error($handle);
    curl_close($handle);

    $succeeded = !$curlError && $raw !== false && $status === 200;
    if (!$succeeded) {
        error_log('[llm] llama.cpp embed: ' . ($curlError ?: 'HTTP ' . $status));
        return null;
    }

    $decoded = json_decode($raw, true);
    $vector  = $decoded['data'][0]['embedding'] ?? null;

    if (!is_array($vector) || count($vector) === 0) {
        return null;
    }

    return $vector;
}

// ── Ollama backend ────────────────────────────────────────────────────────────

/**
 * Generate text through Ollama at OLLAMA_HOST . '/api/generate' using
 * OLLAMA_MODEL, with streaming disabled.
 *
 * @param string $prompt  Prompt text.
 * @param array  $options Optional overrides: temperature (default 0.3),
 *                        num_predict (2048), num_ctx (6144), repeat_penalty (1.1).
 * @return string|null Trimmed text from the "response" field, or null when the
 *                     request could not be encoded, the transfer failed, the
 *                     status was not 200, or the response held no non-empty
 *                     text. Every null path is logged.
 */
function _ollamaGenerate(string $prompt, array $options): ?string
{
    $payload = json_encode([
        'model'  => OLLAMA_MODEL,
        'prompt' => $prompt,
        'stream' => false,
        // keep_alive is a top-level request field in the Ollama API, not a
        // model option; nested under 'options' it has no effect. A negative
        // value asks Ollama to keep the model loaded.
        'keep_alive' => -1,
        'options' => [
            'temperature'    => $options['temperature']    ?? 0.3,
            'num_predict'    => $options['num_predict']    ?? 2048,
            'num_ctx'        => $options['num_ctx']        ?? 6144,
            'repeat_penalty' => $options['repeat_penalty'] ?? 1.1,
        ],
    ]);

    // json_encode() returns false (e.g. on malformed UTF-8 in the prompt);
    // posting that would send an empty body and surface as a confusing HTTP error.
    if ($payload === false) {
        error_log('[llm] Ollama generate: cannot encode request: ' . json_last_error_msg());
        return null;
    }

    $ch = curl_init(OLLAMA_HOST . '/api/generate');
    curl_setopt_array($ch, [
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $payload,
        CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => OLLAMA_TIMEOUT,
        CURLOPT_CONNECTTIMEOUT => 5,
    ]);

    $resp = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err  = curl_error($ch);
    curl_close($ch);

    if ($err || $resp === false || $code !== 200) {
        error_log('[llm] Ollama generate: ' . ($err ?: "HTTP $code"));
        return null;
    }

    $data     = json_decode($resp, true);
    $response = $data['response'] ?? null;

    // Only trim real strings: passing an array to trim() throws a TypeError on PHP 8.
    $text = is_string($response) ? trim($response) : '';
    if ($text === '') {
        error_log('[llm] Ollama generate: HTTP 200 but no text in response');
        return null;
    }

    return $text;
}

/**
 * Request an embedding from Ollama for the model named by EMBED_MODEL.
 *
 * Two endpoints are tried in order, each with its own request field and
 * response shape:
 *   /api/embed       — text sent as "input",  vector read from embeddings[0]
 *   /api/embeddings  — text sent as "prompt", vector read from embedding
 *
 * @param string $text Text to embed.
 * @return array|null The first non-empty vector obtained, or null (after
 *                    logging) when neither endpoint produced one.
 */
function _ollamaEmbed(string $text): ?array
{
    // Endpoint path => name of the request field that carries the text.
    // /api/embed (Ollama >= 0.1.26) comes first; /api/embeddings is the legacy route.
    $endpoints = [
        '/api/embed'      => 'input',
        '/api/embeddings' => 'prompt',
    ];

    foreach ($endpoints as $path => $textField) {
        $handle = curl_init(OLLAMA_HOST . $path);
        curl_setopt_array($handle, [
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => json_encode(['model' => EMBED_MODEL, $textField => $text]),
            CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => 15,
            CURLOPT_CONNECTTIMEOUT => 5,
        ]);
        $body   = curl_exec($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        // Anything but a non-empty body with status 200 moves on to the next endpoint.
        if (!$body || $status !== 200) {
            continue;
        }

        $decoded = json_decode($body, true);
        if ($path === '/api/embed') {
            $vector = $decoded['embeddings'][0] ?? null;
        } else {
            $vector = $decoded['embedding'] ?? null;
        }

        if (is_array($vector) && count($vector) > 0) {
            return $vector;
        }
    }

    error_log('[llm] All embed backends failed');
    return null;
}