hai 2 meses · 6353a6b016
--- a/backend/app.py
+++ b/backend/app.py
@@ -196,32 +196,26 @@ def ollama_embed(text: str) -> List[float]:
 
															         raise HTTPException(status_code=502, detail="Embedding service returned unexpected response")
														
 
															     return data["embedding"]
														
 
															-def ollama_chat(prompt: str) -> str:
														
 
															-    """
														
 
															-    Send a prompt to Ollama and return the generated text.
														
 
															-
														
 
															-    keep_alive MUST be a top-level key — putting it inside options{} causes
														
 
															-    Ollama to silently ignore it and unload the model between requests.
														
 
															+_OLLAMA_GENERATE_BODY = lambda prompt, stream: {
														
 
															+    "model": CHAT_MODEL,
														
 
															+    "prompt": prompt,
														
 
															+    "stream": stream,
														
 
															+    "options": {
														
 
															+        "num_ctx": OLLAMA_NUM_CTX,
														
 
															+        "num_predict": OLLAMA_NUM_PREDICT,
														
 
															+        "temperature": OLLAMA_TEMPERATURE,
														
 
															+        "top_p": 0.9,
														
 
															+        "repeat_penalty": 1.1,
														
 
															+    },
														
 
															+    "keep_alive": -1,   # keep model resident in VRAM — MUST be top-level, not inside options
														
 
															+}
														
 
															-    num_ctx is fixed at 6144. Changing it between requests forces Ollama to
														
 
															-    reload the model (KV cache is resized), adding ~3–5 s of cold-start latency.
														
 
															-    """
														
 
															+def ollama_chat(prompt: str) -> str:
														
 
															+    """Send a prompt to Ollama and return the full generated text (non-streaming)."""
														
 
															     try:
														
 
															         r = requests.post(
														
 
															             f"{OLLAMA_URL}/api/generate",
														
 
															-            json={
														
 
															-              "model": CHAT_MODEL,
														
 
															-              "prompt": prompt,
														
 
															-              "stream": False,
														
 
															-              "options": {
														
 
															-                "num_ctx": OLLAMA_NUM_CTX,
														
 
															-                "num_predict": OLLAMA_NUM_PREDICT,
														
 
															-                "temperature": OLLAMA_TEMPERATURE,
														
 
															-                "top_p": 0.9,
														
 
															-                "repeat_penalty": 1.1,
														
 
															-              },
														
 
															-              "keep_alive": -1,  # keep model resident in VRAM between requests
														
 
															-            },
														
 
															+            json=_OLLAMA_GENERATE_BODY(prompt, False),
														
 
															             timeout=180
														
 
															         )
														
 
															         r.raise_for_status()
														
@@ -234,8 +228,45 @@ def ollama_chat(prompt: str) -> str:
 
															     except requests.HTTPError as e:
														
 
															         logger.error("Ollama chat HTTP %s: %s", e.response.status_code, e.response.text[:200])
														
 
															         raise HTTPException(status_code=502, detail="LLM service error")
														
 
															-    data = r.json()
														
 
															-    return data.get("response", "").strip()
														
 
															+    return r.json().get("response", "").strip()
														
 
															+
														
 
															+
														
 
															+def ollama_chat_stream(prompt: str):
														
 
															+    """
														
 
															+    Yield raw text tokens from Ollama using streaming mode.
														
 
															+    Each yielded value is a string (one or more characters).
														
 
															+    Raises HTTPException on connection/HTTP errors before the first token.
														
 
															+    """
														
 
															+    try:
														
 
															+        r = requests.post(
														
 
															+            f"{OLLAMA_URL}/api/generate",
														
 
															+            json=_OLLAMA_GENERATE_BODY(prompt, True),
														
 
															+            stream=True,
														
 
															+            timeout=180
														
 
															+        )
														
 
															+        r.raise_for_status()
														
 
															+    except requests.Timeout:
														
 
															+        logger.error("Ollama stream timeout (url=%s model=%s)", OLLAMA_URL, CHAT_MODEL)
														
 
															+        raise HTTPException(status_code=503, detail="LLM service timed out")
														
 
															+    except requests.ConnectionError:
														
 
															+        logger.error("Ollama stream connection error (url=%s)", OLLAMA_URL)
														
 
															+        raise HTTPException(status_code=503, detail="LLM service unavailable")
														
 
															+    except requests.HTTPError as e:
														
 
															+        logger.error("Ollama stream HTTP %s: %s", e.response.status_code, e.response.text[:200])
														
 
															+        raise HTTPException(status_code=502, detail="LLM service error")
														
 
															+
														
 
															+    for line in r.iter_lines():
														
 
															+        if not line:
														
 
															+            continue
														
 
															+        try:
														
 
															+            chunk = json.loads(line)
														
 
															+        except json.JSONDecodeError:
														
 
															+            continue
														
 
															+        token = chunk.get("response", "")
														
 
															+        if token:
														
 
															+            yield token
														
 
															+        if chunk.get("done"):
														
 
															+            break
														
 
															 def _scroll_points(collection: str, qfilter=None, include_vector: bool=False, page_size: int=200):
														
 
															     """
														
@@ -661,7 +692,7 @@ def _allowed(p: dict, scope: str, cslug: Optional[str]) -> bool:
 
															         return corp == "tps" or (corp == "lps" and cslug and council == cslug)
														
 
															     return True
														
 
															-def do_ask(
														
 
															+def _prepare_ask(
														
 
															     query: str,
														
 
															     top_k: int = 10,
														
 
															     council: Optional[str] = None,
														
@@ -670,15 +701,16 @@ def do_ask(
 
															     source_contains: Optional[str] = None,
														
 
															     scope: str = "state_plus_local",
														
 
															     section_id: Optional[str] = None,
														
 
															-    context_only: bool = False,
														
 
															-):
														
 
															-    top_k = max(1, min(top_k, 30))  # clamp: at least 1, at most 30
														
 
															+) -> tuple:
														
 
															+    """
														
 
															+    Embed the query, run Qdrant retrieval, and build the LLM prompt.
														
 
															+    Returns (prompt, all_sources, sections) — does NOT call the LLM.
														
 
															+    Shared by do_ask() and the streaming endpoint.
														
 
															+    """
														
 
															+    top_k = max(1, min(top_k, 30))
														
 
															     vec = ollama_embed(query)
														
 
															     cslug = slug(council) if council else None
														
 
															-    # Build the list of (section_heading, qdrant_filter) pairs based on scope.
														
 
															-    # Each pair is searched independently so we can control the chunk budget
														
 
															-    # per corpus — avoids TPS drowning out LPS results or vice versa.
														
 
															     scopes: List[Tuple[str, qmodels.Filter]] = []
														
 
															     if scope in ("state_only", "state_plus_local", "any"):
														
 
															         scopes.append(("Tasmanian Planning Scheme (SPP)", filter_tps()))
														
@@ -689,11 +721,8 @@ def do_ask(
 
															     if include_standards:
														
 
															         scopes.append(("Australian Standards (AS)", filter_as()))
														
 
															-    # Apply additional filename filter if requested (AND)
														
 
															     scopes = [(name, with_source_contains(flt, source_contains)) for name, flt in scopes]
														
 
															-    # Divide top_k across scopes: SPP and LPS each get ~1/3, the remainder
														
 
															-    # is split evenly across any extra corpora (NCC, AS).
														
 
															     per_spp = max(3, top_k // 3) if any(n.startswith("Tasmanian Planning Scheme") for n, _ in scopes) else 0
														
 
															     per_lps = max(3, top_k // 3) if any(n.startswith("Local Provisions Schedule") for n, _ in scopes) else 0
														
 
															     remaining = max(1, top_k - (per_spp + per_lps))
														
@@ -716,23 +745,15 @@ def do_ask(
 
															         if lim <= 0:
														
 
															             continue
														
 
															         hits = q_search(vec, flt, lim)
														
 
															-
														
 
															-        # Guardrail: drop any hit that violates scope/council
														
 
															         hits = [h for h in hits if _allowed(h.payload or {}, scope, cslug)]
														
 
															-
														
 
															         blocks, sources = render_blocks(hits)
														
 
															         sections.append((name, blocks))
														
 
															         all_sources.extend(sources)
														
 
															     context = combine_context(sections)
														
 
															-
														
 
															     format_guide = _section_format_guide(
														
 
															-        section_id,
														
 
															-        section_title="(auto)",
														
 
															-        ctx={
														
 
															-            "council": council,           # from do_ask parameter
														
 
															-            "planning_zones": [],         # populate if you have zone detection
														
 
															-        }
														
 
															+        section_id, section_title="(auto)",
														
 
															+        ctx={"council": council, "planning_zones": []}
														
 
															     )
														
 
															     prompt = f"""
														
@@ -769,19 +790,36 @@ You are an expert Tasmanian planning and building compliance assistant with deep
 
															 ## ANSWER:
														
 
															 """.strip()
														
 
															-    # BYOK mode: skip Ollama and return the context + prompt so the
														
 
															-    # browser can call its own LLM provider (Claude, GPT, Grok, etc.)
														
 
															+    return prompt, all_sources, sections
														
 
															+
														
 
															+
														
 
															+def do_ask(
														
 
															+    query: str,
														
 
															+    top_k: int = 10,
														
 
															+    council: Optional[str] = None,
														
 
															+    include_ncc: bool = False,
														
 
															+    include_standards: bool = False,
														
 
															+    source_contains: Optional[str] = None,
														
 
															+    scope: str = "state_plus_local",
														
 
															+    section_id: Optional[str] = None,
														
 
															+    context_only: bool = False,
														
 
															+):
														
 
															+    prompt, all_sources, sections = _prepare_ask(
														
 
															+        query, top_k, council, include_ncc, include_standards,
														
 
															+        source_contains, scope, section_id
														
 
															+    )
														
 
															+
														
 
															     if context_only:
														
 
															+        # Extract context from the prompt for BYOK mode
														
 
															+        ctx_start = prompt.find("## CONTEXT")
														
 
															+        ctx_end   = prompt.find("## QUESTION")
														
 
															+        context   = prompt[ctx_start:ctx_end].strip() if ctx_start != -1 and ctx_end != -1 else ""
														
 
															         return {
														
 
															             "context_only": True,
														
 
															             "context": context,
														
 
															             "prompt": prompt,
														
 
															             "sources": all_sources,
														
 
															-            # Include the raw section blocks so the browser can inspect them
														
 
															-            "sections": [
														
 
															-                {"heading": name, "blocks": blocks}
														
 
															-                for name, blocks in sections
														
 
															-            ]
														
 
															+            "sections": [{"heading": name, "blocks": blocks} for name, blocks in sections],
														
 
															         }
														
 
															     answer = ollama_chat(prompt)
														
@@ -874,3 +912,64 @@ def ask_post(request: Request, background_tasks: BackgroundTasks, body: AskBody)
 
															     )
														
 
															     return out
														
 
															+
														
 
															+
														
 
															+# ---------------------------------------------------------------------------
														
 
															+# /ask/stream — Server-Sent Events streaming endpoint
														
 
															+# Embedding + retrieval run synchronously first (fast: ~0.5s).
														
 
															+# Tokens stream as they arrive from Ollama — no waiting for full completion.
														
 
															+#
														
 
															+# SSE event types:
														
 
															+#   {"type": "sources", "sources": [...]}   — sent first, before any tokens
														
 
															+#   {"type": "token",   "text": "..."}      — one per Ollama chunk
														
 
															+#   {"type": "done"}                        — stream complete
														
 
															+#   {"type": "error",   "detail": "..."}    — on failure mid-stream
														
 
															+# ---------------------------------------------------------------------------
														
 
															+@app.post("/ask/stream")
														
 
															+@limiter.limit("20/minute")
														
 
															+def ask_stream(request: Request, body: AskBody):
														
 
															+    _verify_demo_token_if_needed(request)
														
 
															+    qtxt = (body.query or body.question or body.q or body.prompt or "").strip()
														
 
															+    if not qtxt:
														
 
															+        raise HTTPException(status_code=422, detail="Missing query/question")
														
 
															+
														
 
															+    # Embedding + retrieval + prompt building run before streaming starts.
														
 
															+    # Sources are sent as the first SSE event so the UI can render them
														
 
															+    # while tokens are still arriving from Ollama.
														
 
															+    started = time.perf_counter()
														
 
															+    prompt, all_sources, _ = _prepare_ask(
														
 
															+        qtxt, body.top_k, body.council, body.include_ncc,
														
 
															+        body.include_standards, body.source_contains, body.scope, body.section_id
														
 
															+    )
														
 
															+
														
 
															+    ip  = request.client.host if request.client else "0.0.0.0"
														
 
															+    sid = request.headers.get("X-TPR-SID") or request.cookies.get("sid") or ""
														
 
															+
														
 
															+    def generate():
														
 
															+        tokens = []
														
 
															+        try:
														
 
															+            yield f"data: {json.dumps({'type': 'sources', 'sources': all_sources})}\n\n"
														
 
															+            for token in ollama_chat_stream(prompt):
														
 
															+                tokens.append(token)
														
 
															+                yield f"data: {json.dumps({'type': 'token', 'text': token})}\n\n"
														
 
															+            yield f"data: {json.dumps({'type': 'done'})}\n\n"
														
 
															+        except Exception as e:
														
 
															+            logger.error("[stream] error mid-stream: %s", e)
														
 
															+            yield f"data: {json.dumps({'type': 'error', 'detail': str(e)})}\n\n"
														
 
															+        finally:
														
 
															+            # Telemetry written inline at stream end (~1ms SQLite write)
														
 
															+            latency_ms = int((time.perf_counter() - started) * 1000)
														
 
															+            _log_ask(
														
 
															+                datetime.utcnow().isoformat(), sid, ip, qtxt, body.scope,
														
 
															+                body.scope in ("state_only", "state_plus_local"),
														
 
															+                latency_ms, CHAT_MODEL, all_sources, "".join(tokens),
														
 
															+            )
														
 
															+
														
 
															+    return StreamingResponse(
														
 
															+        generate(),
														
 
															+        media_type="text/event-stream",
														
 
															+        headers={
														
 
															+            "Cache-Control": "no-cache",
														
 
															+            "X-Accel-Buffering": "no",   # tell Nginx/Cloudflare not to buffer SSE
														
 
															+        },
														
 
															+    )
														
--- a/public/local_state-planning-scheme.php
+++ b/public/local_state-planning-scheme.php
@@ -279,6 +279,16 @@
 
															     .fb-btn.active-up { border-color: var(--accent); color: var(--accent); }
														
 
															     .fb-btn.active-dn { border-color: var(--danger); color: var(--danger); }
														
 
															+    /* Streaming cursor */
														
 
															+    .streaming-cursor::after {
														
 
															+      content: '▍';
														
 
															+      display: inline-block;
														
 
															+      color: var(--accent);
														
 
															+      animation: blink 0.8s step-end infinite;
														
 
															+      margin-left: 1px;
														
 
															+    }
														
 
															+    @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0} }
														
 
															+
														
 
															     /* Thinking indicator */
														
 
															     .thinking {
														
 
															       display: flex; align-items: center; gap: 10px;
														
@@ -918,27 +928,57 @@ async function ask(queryOverride) {
 
															       addToHistory(rawQuery);
														
 
															     } else {
														
 
															-      // ── Internal Ollama path (unchanged) ───────────────────────────
														
 
															-      const res = await fetch(`${API}/ask`, {
														
 
															+      // ── Internal Ollama path — streaming ──────────────────────────
														
 
															+      const res = await fetch(`${API}/ask/stream`, {
														
 
															         method: 'POST',
														
 
															         headers: { 'Content-Type': 'application/json', 'X-TPR-SID': sessionId },
														
 
															         body: JSON.stringify({ query, council: council || null, top_k: 8, scope })
														
 
															       });
														
 
															-      const raw = await res.text();
														
 
															-      if (!res.ok) throw new Error(`HTTP ${res.status} — ${raw.slice(0,200)}`);
														
 
															-      const data = JSON.parse(raw);
														
 
															+      if (!res.ok) {
														
 
															+        const raw = await res.text();
														
 
															+        throw new Error(`HTTP ${res.status} — ${raw.slice(0,200)}`);
														
 
															+      }
														
 
															       thinkEl.remove();
														
 
															-      lastSources = Array.isArray(data.sources) ? data.sources : [];
														
 
															-
														
 
															+      const msgEl      = appendStreamingMsg(rawQuery, scope);
														
 
															+      const streamText = msgEl.querySelector('.stream-text');
														
 
															+      const reader     = res.body.getReader();
														
 
															+      const decoder    = new TextDecoder();
														
 
															+      let buf = '', fullAnswer = '';
														
 
															+      lastSources = [];
														
 
															+
														
 
															+      outer: while (true) {
														
 
															+        const { done, value } = await reader.read();
														
 
															+        if (done) break;
														
 
															+        buf += decoder.decode(value, { stream: true });
														
 
															+        const lines = buf.split('\n');
														
 
															+        buf = lines.pop();                          // keep incomplete line
														
 
															+        for (const line of lines) {
														
 
															+          if (!line.startsWith('data: ')) continue;
														
 
															+          let evt;
														
 
															+          try { evt = JSON.parse(line.slice(6)); } catch { continue; }
														
 
															+
														
 
															+          if (evt.type === 'sources') {
														
 
															+            lastSources = evt.sources || [];
														
 
															+          } else if (evt.type === 'token') {
														
 
															+            fullAnswer += evt.text;
														
 
															+            streamText.textContent = fullAnswer;    // raw text while streaming
														
 
															+            scrollBottom();
														
 
															+          } else if (evt.type === 'done') {
														
 
															+            break outer;
														
 
															+          } else if (evt.type === 'error') {
														
 
															+            throw new Error(evt.detail || 'Stream error');
														
 
															+          }
														
 
															+        }
														
 
															+      }
														
 
															+
														
 
															+      finalizeStreamingMsg(msgEl, fullAnswer || 'No answer returned.', lastSources);
														
 
															       const latencyMs = Math.round(performance.now() - startedAt);
														
 
															       sendEvent('search_result', {
														
 
															         latency_ms: latencyMs,
														
 
															         topk: lastSources.slice(0,10).map(s => ({ id:`${s.source_file}#p${s.page}`, score:s.score })),
														
 
															-        model: data.model || 'unknown', ok: true,
														
 
															+        model: 'stream', ok: true,
														
 
															       });
														
 
															-
														
 
															-      appendAssistantMsg(data.answer || 'No answer returned.', scope, lastSources, rawQuery, 'internal');
														
 
															       addToHistory(rawQuery);
														
 
															     }
														
 
															   } catch(e) {
														
@@ -983,51 +1023,19 @@ function appendThinking() {
 
															 }
														
 
															 function appendAssistantMsg(answer, scope, sources, query, provider = 'internal') {
														
 
															+  const msgId = `msg-${Date.now()}`;
														
 
															   const div = document.createElement('div');
														
 
															   div.className = 'msg assistant';
														
 
															-
														
 
															-  const providerNames = { internal:'Ollama', anthropic:'Claude', openai:'GPT-4o', grok:'Grok', ollama:'Local Ollama' };
														
 
															-  const providerName = providerNames[provider] || provider;
														
 
															-  const providerIcon = provider === 'internal' ? 'cpu' : 'key';
														
 
															-  const scopeHtml = `
														
 
															-    <div style="display:flex;gap:6px;margin-bottom:10px;flex-wrap:wrap;">
														
 
															-      <div class="scope-badge"><i class="bi bi-filter"></i> ${esc(scope)}</div>
														
 
															-      <div class="scope-badge" style="background:${provider !== 'internal' ? 'rgba(192,132,252,0.1)' : 'var(--accent-dim)'};border-color:${provider !== 'internal' ? 'rgba(192,132,252,0.25)' : 'rgba(45,220,138,0.2)'};color:${provider !== 'internal' ? '#c084fc' : 'var(--accent)'};">
														
 
															-        <i class="bi bi-${providerIcon}"></i> ${esc(providerName)}
														
 
															-      </div>
														
 
															-    </div>`;
														
 
															-  const answerHtml = md2html(answer);
														
 
															-
														
 
															-  let sourcesHtml = '';
														
 
															-  if (sources && sources.length) {
														
 
															-    const chips = sources.map((s, i) => {
														
 
															-      const label = `${s.source_file} p.${s.page}`;
														
 
															-      const score = typeof s.score === 'number' ? `<span class="source-score">${s.score.toFixed(2)}</span>` : '';
														
 
															-      return `<span class="source-chip" data-cite="${esc(`${s.source_file}#p${s.page}`)}" data-index="${i}"
														
 
															-        onclick="openSourceInViewer(${i})">
														
 
															-        <i class="bi bi-file-earmark-text"></i>${esc(label)}${score}
														
 
															-      </span>`;
														
 
															-    }).join('');
														
 
															-    sourcesHtml = `
														
 
															-      <div class="msg-sources">
														
 
															-        <div class="sources-label">Sources</div>
														
 
															-        <div class="source-chips">${chips}</div>
														
 
															-      </div>`;
														
 
															-  }
														
 
															-
														
 
															-  const msgId = `msg-${Date.now()}`;
														
 
															   div.id = msgId;
														
 
															-  // Store context on the element so feedback() can read it without closure issues
														
 
															   div.dataset.query    = query || '';
														
 
															   div.dataset.scope    = scope || '';
														
 
															   div.dataset.provider = provider || 'internal';
														
 
															-  // Store answer as plain text (strip HTML tags) for the feedback payload
														
 
															   div.dataset.answer   = answer.replace(/<[^>]*>/g, '').substring(0, 4000);
														
 
															   div.innerHTML = `
														
 
															     <div class="msg-role"><i class="bi bi-stars"></i> Assistant</div>
														
 
															-    ${scopeHtml}
														
 
															-    <div class="msg-content">${answerHtml}</div>
														
 
															-    ${sourcesHtml}
														
 
															+    ${_scopeHtml(scope, provider)}
														
 
															+    <div class="msg-content">${md2html(answer)}</div>
														
 
															+    ${_sourceChipsHtml(sources)}
														
 
															     <div class="msg-feedback">
														
 
															       <button class="fb-btn" onclick="feedback('${msgId}','up',this)"><i class="bi bi-hand-thumbs-up"></i> Helpful</button>
														
 
															       <button class="fb-btn" onclick="feedback('${msgId}','down',this)"><i class="bi bi-hand-thumbs-down"></i> Not helpful</button>
														
@@ -1037,6 +1045,70 @@ function appendAssistantMsg(answer, scope, sources, query, provider = 'internal'
 
															   scrollBottom();
														
 
															 }
														
 
															+/* ── Streaming message helpers ───────────────────────────────────────── */
														
 
															+
														
 
															+function _scopeHtml(scope, provider) {
														
 
															+  const providerNames = { internal:'Ollama', anthropic:'Claude', openai:'GPT-4o', grok:'Grok', ollama:'Local Ollama' };
														
 
															+  const providerName = providerNames[provider] || provider;
														
 
															+  const providerIcon = provider === 'internal' ? 'cpu' : 'key';
														
 
															+  return `<div style="display:flex;gap:6px;margin-bottom:10px;flex-wrap:wrap;">
														
 
															+    <div class="scope-badge"><i class="bi bi-filter"></i> ${esc(scope)}</div>
														
 
															+    <div class="scope-badge" style="background:${provider !== 'internal' ? 'rgba(192,132,252,0.1)' : 'var(--accent-dim)'};border-color:${provider !== 'internal' ? 'rgba(192,132,252,0.25)' : 'rgba(45,220,138,0.2)'};color:${provider !== 'internal' ? '#c084fc' : 'var(--accent)'};">
														
 
															+      <i class="bi bi-${providerIcon}"></i> ${esc(providerName)}
														
 
															+    </div>
														
 
															+  </div>`;
														
 
															+}
														
 
															+
														
 
															+function _sourceChipsHtml(sources) {
														
 
															+  if (!sources || !sources.length) return '';
														
 
															+  const chips = sources.map((s, i) => {
														
 
															+    const label = `${s.source_file} p.${s.page}`;
														
 
															+    const score = typeof s.score === 'number' ? `<span class="source-score">${s.score.toFixed(2)}</span>` : '';
														
 
															+    return `<span class="source-chip" data-cite="${esc(`${s.source_file}#p${s.page}`)}" data-index="${i}" onclick="openSourceInViewer(${i})">
														
 
															+      <i class="bi bi-file-earmark-text"></i>${esc(label)}${score}
														
 
															+    </span>`;
														
 
															+  }).join('');
														
 
															+  return `<div class="msg-sources"><div class="sources-label">Sources</div><div class="source-chips">${chips}</div></div>`;
														
 
															+}
														
 
															+
														
 
															+// Create a message container for a streaming response.
														
 
															+// Returns the div so the caller can access .querySelector('.stream-text') to append tokens.
														
 
															+function appendStreamingMsg(rawQuery, scope) {
														
 
															+  hideEmpty();
														
 
															+  const msgId = `msg-${Date.now()}`;
														
 
															+  const div = document.createElement('div');
														
 
															+  div.className = 'msg assistant';
														
 
															+  div.id = msgId;
														
 
															+  div.dataset.query    = rawQuery;
														
 
															+  div.dataset.scope    = scope;
														
 
															+  div.dataset.provider = 'internal';
														
 
															+  div.dataset.answer   = '';
														
 
															+  div.innerHTML = `
														
 
															+    <div class="msg-role"><i class="bi bi-stars"></i> Assistant</div>
														
 
															+    ${_scopeHtml(scope, 'internal')}
														
 
															+    <div class="msg-content"><span class="stream-text streaming-cursor"></span></div>
														
 
															+  `;
														
 
															+  chatThread.appendChild(div);
														
 
															+  scrollBottom();
														
 
															+  return div;
														
 
															+}
														
 
															+
														
 
															+// Called when the stream is complete: renders markdown, appends sources + feedback.
														
 
															+function finalizeStreamingMsg(msgEl, fullAnswer, sources) {
														
 
															+  const contentEl = msgEl.querySelector('.msg-content');
														
 
															+  contentEl.innerHTML = md2html(fullAnswer);
														
 
															+  msgEl.dataset.answer = fullAnswer.replace(/<[^>]*>/g, '').substring(0, 4000);
														
 
															+
														
 
															+  const msgId = msgEl.id;
														
 
															+  const trailing = _sourceChipsHtml(sources) + `
														
 
															+    <div class="msg-feedback">
														
 
															+      <button class="fb-btn" onclick="feedback('${msgId}','up',this)"><i class="bi bi-hand-thumbs-up"></i> Helpful</button>
														
 
															+      <button class="fb-btn" onclick="feedback('${msgId}','down',this)"><i class="bi bi-hand-thumbs-down"></i> Not helpful</button>
														
 
															+    </div>`;
														
 
															+  msgEl.insertAdjacentHTML('beforeend', trailing);
														
 
															+  scrollBottom();
														
 
															+}
														
 
															+
														
 
															 function appendErrorMsg(msg) {
														
 
															   const div = document.createElement('div');
														
 
															   div.className = 'msg assistant';