|
@@ -196,32 +196,26 @@ def ollama_embed(text: str) -> List[float]:
|
|
|
raise HTTPException(status_code=502, detail="Embedding service returned unexpected response")
|
|
raise HTTPException(status_code=502, detail="Embedding service returned unexpected response")
|
|
|
return data["embedding"]
|
|
return data["embedding"]
|
|
|
|
|
|
|
|
-def ollama_chat(prompt: str) -> str:
|
|
|
|
|
- """
|
|
|
|
|
- Send a prompt to Ollama and return the generated text.
|
|
|
|
|
-
|
|
|
|
|
- keep_alive MUST be a top-level key — putting it inside options{} causes
|
|
|
|
|
- Ollama to silently ignore it and unload the model between requests.
|
|
|
|
|
|
|
+_OLLAMA_GENERATE_BODY = lambda prompt, stream: {
|
|
|
|
|
+ "model": CHAT_MODEL,
|
|
|
|
|
+ "prompt": prompt,
|
|
|
|
|
+ "stream": stream,
|
|
|
|
|
+ "options": {
|
|
|
|
|
+ "num_ctx": OLLAMA_NUM_CTX,
|
|
|
|
|
+ "num_predict": OLLAMA_NUM_PREDICT,
|
|
|
|
|
+ "temperature": OLLAMA_TEMPERATURE,
|
|
|
|
|
+ "top_p": 0.9,
|
|
|
|
|
+ "repeat_penalty": 1.1,
|
|
|
|
|
+ },
|
|
|
|
|
+ "keep_alive": -1, # keep model resident in VRAM — MUST be top-level, not inside options
|
|
|
|
|
+}
|
|
|
|
|
|
|
|
- num_ctx is fixed at 6144. Changing it between requests forces Ollama to
|
|
|
|
|
- reload the model (KV cache is resized), adding ~3–5 s of cold-start latency.
|
|
|
|
|
- """
|
|
|
|
|
|
|
+def ollama_chat(prompt: str) -> str:
|
|
|
|
|
+ """Send a prompt to Ollama and return the full generated text (non-streaming)."""
|
|
|
try:
|
|
try:
|
|
|
r = requests.post(
|
|
r = requests.post(
|
|
|
f"{OLLAMA_URL}/api/generate",
|
|
f"{OLLAMA_URL}/api/generate",
|
|
|
- json={
|
|
|
|
|
- "model": CHAT_MODEL,
|
|
|
|
|
- "prompt": prompt,
|
|
|
|
|
- "stream": False,
|
|
|
|
|
- "options": {
|
|
|
|
|
- "num_ctx": OLLAMA_NUM_CTX,
|
|
|
|
|
- "num_predict": OLLAMA_NUM_PREDICT,
|
|
|
|
|
- "temperature": OLLAMA_TEMPERATURE,
|
|
|
|
|
- "top_p": 0.9,
|
|
|
|
|
- "repeat_penalty": 1.1,
|
|
|
|
|
- },
|
|
|
|
|
- "keep_alive": -1, # keep model resident in VRAM between requests
|
|
|
|
|
- },
|
|
|
|
|
|
|
+ json=_OLLAMA_GENERATE_BODY(prompt, False),
|
|
|
timeout=180
|
|
timeout=180
|
|
|
)
|
|
)
|
|
|
r.raise_for_status()
|
|
r.raise_for_status()
|
|
@@ -234,8 +228,45 @@ def ollama_chat(prompt: str) -> str:
|
|
|
except requests.HTTPError as e:
|
|
except requests.HTTPError as e:
|
|
|
logger.error("Ollama chat HTTP %s: %s", e.response.status_code, e.response.text[:200])
|
|
logger.error("Ollama chat HTTP %s: %s", e.response.status_code, e.response.text[:200])
|
|
|
raise HTTPException(status_code=502, detail="LLM service error")
|
|
raise HTTPException(status_code=502, detail="LLM service error")
|
|
|
- data = r.json()
|
|
|
|
|
- return data.get("response", "").strip()
|
|
|
|
|
|
|
+ return r.json().get("response", "").strip()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def ollama_chat_stream(prompt: str):
|
|
|
|
|
+ """
|
|
|
|
|
+ Yield raw text tokens from Ollama using streaming mode.
|
|
|
|
|
+ Each yielded value is a string (one or more characters).
|
|
|
|
|
+ Raises HTTPException on connection/HTTP errors when first iterated (this is a generator), before any token is yielded.
|
|
|
|
|
+ """
|
|
|
|
|
+ try:
|
|
|
|
|
+ r = requests.post(
|
|
|
|
|
+ f"{OLLAMA_URL}/api/generate",
|
|
|
|
|
+ json=_OLLAMA_GENERATE_BODY(prompt, True),
|
|
|
|
|
+ stream=True,
|
|
|
|
|
+ timeout=180
|
|
|
|
|
+ )
|
|
|
|
|
+ r.raise_for_status()
|
|
|
|
|
+ except requests.Timeout:
|
|
|
|
|
+ logger.error("Ollama stream timeout (url=%s model=%s)", OLLAMA_URL, CHAT_MODEL)
|
|
|
|
|
+ raise HTTPException(status_code=503, detail="LLM service timed out")
|
|
|
|
|
+ except requests.ConnectionError:
|
|
|
|
|
+ logger.error("Ollama stream connection error (url=%s)", OLLAMA_URL)
|
|
|
|
|
+ raise HTTPException(status_code=503, detail="LLM service unavailable")
|
|
|
|
|
+ except requests.HTTPError as e:
|
|
|
|
|
+ logger.error("Ollama stream HTTP %s: %s", e.response.status_code, e.response.text[:200])
|
|
|
|
|
+ raise HTTPException(status_code=502, detail="LLM service error")
|
|
|
|
|
+
|
|
|
|
|
+ for line in r.iter_lines():
|
|
|
|
|
+ if not line:
|
|
|
|
|
+ continue
|
|
|
|
|
+ try:
|
|
|
|
|
+ chunk = json.loads(line)
|
|
|
|
|
+ except json.JSONDecodeError:
|
|
|
|
|
+ continue
|
|
|
|
|
+ token = chunk.get("response", "")
|
|
|
|
|
+ if token:
|
|
|
|
|
+ yield token
|
|
|
|
|
+ if chunk.get("done"):
|
|
|
|
|
+ break
|
|
|
|
|
|
|
|
def _scroll_points(collection: str, qfilter=None, include_vector: bool=False, page_size: int=200):
|
|
def _scroll_points(collection: str, qfilter=None, include_vector: bool=False, page_size: int=200):
|
|
|
"""
|
|
"""
|
|
@@ -661,7 +692,7 @@ def _allowed(p: dict, scope: str, cslug: Optional[str]) -> bool:
|
|
|
return corp == "tps" or (corp == "lps" and cslug and council == cslug)
|
|
return corp == "tps" or (corp == "lps" and cslug and council == cslug)
|
|
|
return True
|
|
return True
|
|
|
|
|
|
|
|
-def do_ask(
|
|
|
|
|
|
|
+def _prepare_ask(
|
|
|
query: str,
|
|
query: str,
|
|
|
top_k: int = 10,
|
|
top_k: int = 10,
|
|
|
council: Optional[str] = None,
|
|
council: Optional[str] = None,
|
|
@@ -670,15 +701,16 @@ def do_ask(
|
|
|
source_contains: Optional[str] = None,
|
|
source_contains: Optional[str] = None,
|
|
|
scope: str = "state_plus_local",
|
|
scope: str = "state_plus_local",
|
|
|
section_id: Optional[str] = None,
|
|
section_id: Optional[str] = None,
|
|
|
- context_only: bool = False,
|
|
|
|
|
-):
|
|
|
|
|
- top_k = max(1, min(top_k, 30)) # clamp: at least 1, at most 30
|
|
|
|
|
|
|
+) -> tuple:
|
|
|
|
|
+ """
|
|
|
|
|
+ Embed the query, run Qdrant retrieval, and build the LLM prompt.
|
|
|
|
|
+ Returns (prompt, all_sources, sections) — does NOT call the LLM.
|
|
|
|
|
+ Shared by do_ask() and the streaming endpoint.
|
|
|
|
|
+ """
|
|
|
|
|
+ top_k = max(1, min(top_k, 30))
|
|
|
vec = ollama_embed(query)
|
|
vec = ollama_embed(query)
|
|
|
cslug = slug(council) if council else None
|
|
cslug = slug(council) if council else None
|
|
|
|
|
|
|
|
- # Build the list of (section_heading, qdrant_filter) pairs based on scope.
|
|
|
|
|
- # Each pair is searched independently so we can control the chunk budget
|
|
|
|
|
- # per corpus — avoids TPS drowning out LPS results or vice versa.
|
|
|
|
|
scopes: List[Tuple[str, qmodels.Filter]] = []
|
|
scopes: List[Tuple[str, qmodels.Filter]] = []
|
|
|
if scope in ("state_only", "state_plus_local", "any"):
|
|
if scope in ("state_only", "state_plus_local", "any"):
|
|
|
scopes.append(("Tasmanian Planning Scheme (SPP)", filter_tps()))
|
|
scopes.append(("Tasmanian Planning Scheme (SPP)", filter_tps()))
|
|
@@ -689,11 +721,8 @@ def do_ask(
|
|
|
if include_standards:
|
|
if include_standards:
|
|
|
scopes.append(("Australian Standards (AS)", filter_as()))
|
|
scopes.append(("Australian Standards (AS)", filter_as()))
|
|
|
|
|
|
|
|
- # Apply additional filename filter if requested (AND)
|
|
|
|
|
scopes = [(name, with_source_contains(flt, source_contains)) for name, flt in scopes]
|
|
scopes = [(name, with_source_contains(flt, source_contains)) for name, flt in scopes]
|
|
|
|
|
|
|
|
- # Divide top_k across scopes: SPP and LPS each get ~1/3, the remainder
|
|
|
|
|
- # is split evenly across any extra corpora (NCC, AS).
|
|
|
|
|
per_spp = max(3, top_k // 3) if any(n.startswith("Tasmanian Planning Scheme") for n, _ in scopes) else 0
|
|
per_spp = max(3, top_k // 3) if any(n.startswith("Tasmanian Planning Scheme") for n, _ in scopes) else 0
|
|
|
per_lps = max(3, top_k // 3) if any(n.startswith("Local Provisions Schedule") for n, _ in scopes) else 0
|
|
per_lps = max(3, top_k // 3) if any(n.startswith("Local Provisions Schedule") for n, _ in scopes) else 0
|
|
|
remaining = max(1, top_k - (per_spp + per_lps))
|
|
remaining = max(1, top_k - (per_spp + per_lps))
|
|
@@ -716,23 +745,15 @@ def do_ask(
|
|
|
if lim <= 0:
|
|
if lim <= 0:
|
|
|
continue
|
|
continue
|
|
|
hits = q_search(vec, flt, lim)
|
|
hits = q_search(vec, flt, lim)
|
|
|
-
|
|
|
|
|
- # Guardrail: drop any hit that violates scope/council
|
|
|
|
|
hits = [h for h in hits if _allowed(h.payload or {}, scope, cslug)]
|
|
hits = [h for h in hits if _allowed(h.payload or {}, scope, cslug)]
|
|
|
-
|
|
|
|
|
blocks, sources = render_blocks(hits)
|
|
blocks, sources = render_blocks(hits)
|
|
|
sections.append((name, blocks))
|
|
sections.append((name, blocks))
|
|
|
all_sources.extend(sources)
|
|
all_sources.extend(sources)
|
|
|
|
|
|
|
|
context = combine_context(sections)
|
|
context = combine_context(sections)
|
|
|
-
|
|
|
|
|
format_guide = _section_format_guide(
|
|
format_guide = _section_format_guide(
|
|
|
- section_id,
|
|
|
|
|
- section_title="(auto)",
|
|
|
|
|
- ctx={
|
|
|
|
|
- "council": council, # from do_ask parameter
|
|
|
|
|
- "planning_zones": [], # populate if you have zone detection
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ section_id, section_title="(auto)",
|
|
|
|
|
+ ctx={"council": council, "planning_zones": []}
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
prompt = f"""
|
|
prompt = f"""
|
|
@@ -769,19 +790,36 @@ You are an expert Tasmanian planning and building compliance assistant with deep
|
|
|
## ANSWER:
|
|
## ANSWER:
|
|
|
""".strip()
|
|
""".strip()
|
|
|
|
|
|
|
|
- # BYOK mode: skip Ollama and return the context + prompt so the
|
|
|
|
|
- # browser can call its own LLM provider (Claude, GPT, Grok, etc.)
|
|
|
|
|
|
|
+ return prompt, all_sources, sections
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def do_ask(
|
|
|
|
|
+ query: str,
|
|
|
|
|
+ top_k: int = 10,
|
|
|
|
|
+ council: Optional[str] = None,
|
|
|
|
|
+ include_ncc: bool = False,
|
|
|
|
|
+ include_standards: bool = False,
|
|
|
|
|
+ source_contains: Optional[str] = None,
|
|
|
|
|
+ scope: str = "state_plus_local",
|
|
|
|
|
+ section_id: Optional[str] = None,
|
|
|
|
|
+ context_only: bool = False,
|
|
|
|
|
+):
|
|
|
|
|
+ prompt, all_sources, sections = _prepare_ask(
|
|
|
|
|
+ query, top_k, council, include_ncc, include_standards,
|
|
|
|
|
+ source_contains, scope, section_id
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
if context_only:
|
|
if context_only:
|
|
|
|
|
+ # Extract context from the prompt for BYOK mode
|
|
|
|
|
+ ctx_start = prompt.find("## CONTEXT")
|
|
|
|
|
+ ctx_end = prompt.find("## QUESTION")
|
|
|
|
|
+ context = prompt[ctx_start:ctx_end].strip() if ctx_start != -1 and ctx_end != -1 else ""
|
|
|
return {
|
|
return {
|
|
|
"context_only": True,
|
|
"context_only": True,
|
|
|
"context": context,
|
|
"context": context,
|
|
|
"prompt": prompt,
|
|
"prompt": prompt,
|
|
|
"sources": all_sources,
|
|
"sources": all_sources,
|
|
|
- # Include the raw section blocks so the browser can inspect them
|
|
|
|
|
- "sections": [
|
|
|
|
|
- {"heading": name, "blocks": blocks}
|
|
|
|
|
- for name, blocks in sections
|
|
|
|
|
- ]
|
|
|
|
|
|
|
+ "sections": [{"heading": name, "blocks": blocks} for name, blocks in sections],
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
answer = ollama_chat(prompt)
|
|
answer = ollama_chat(prompt)
|
|
@@ -874,3 +912,64 @@ def ask_post(request: Request, background_tasks: BackgroundTasks, body: AskBody)
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
return out
|
|
return out
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+# /ask/stream — Server-Sent Events streaming endpoint
|
|
|
|
|
+# Embedding + retrieval run synchronously first (fast: ~0.5s).
|
|
|
|
|
+# Tokens stream as they arrive from Ollama — no waiting for full completion.
|
|
|
|
|
+#
|
|
|
|
|
+# SSE event types:
|
|
|
|
|
+# {"type": "sources", "sources": [...]} — sent first, before any tokens
|
|
|
|
|
+# {"type": "token", "text": "..."} — one per Ollama chunk
|
|
|
|
|
+# {"type": "done"} — stream complete
|
|
|
|
|
+# {"type": "error", "detail": "..."} — on failure mid-stream
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+@app.post("/ask/stream")
|
|
|
|
|
+@limiter.limit("20/minute")
|
|
|
|
|
+def ask_stream(request: Request, body: AskBody):
|
|
|
|
|
+ _verify_demo_token_if_needed(request)
|
|
|
|
|
+ qtxt = (body.query or body.question or body.q or body.prompt or "").strip()
|
|
|
|
|
+ if not qtxt:
|
|
|
|
|
+ raise HTTPException(status_code=422, detail="Missing query/question")
|
|
|
|
|
+
|
|
|
|
|
+ # Embedding + retrieval + prompt building run before streaming starts.
|
|
|
|
|
+ # Sources are sent as the first SSE event so the UI can render them
|
|
|
|
|
+ # while tokens are still arriving from Ollama.
|
|
|
|
|
+ started = time.perf_counter()
|
|
|
|
|
+ prompt, all_sources, _ = _prepare_ask(
|
|
|
|
|
+ qtxt, body.top_k, body.council, body.include_ncc,
|
|
|
|
|
+ body.include_standards, body.source_contains, body.scope, body.section_id
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ ip = request.client.host if request.client else "0.0.0.0"
|
|
|
|
|
+ sid = request.headers.get("X-TPR-SID") or request.cookies.get("sid") or ""
|
|
|
|
|
+
|
|
|
|
|
+ def generate():
|
|
|
|
|
+ tokens = []
|
|
|
|
|
+ try:
|
|
|
|
|
+ yield f"data: {json.dumps({'type': 'sources', 'sources': all_sources})}\n\n"
|
|
|
|
|
+ for token in ollama_chat_stream(prompt):
|
|
|
|
|
+ tokens.append(token)
|
|
|
|
|
+ yield f"data: {json.dumps({'type': 'token', 'text': token})}\n\n"
|
|
|
|
|
+ yield f"data: {json.dumps({'type': 'done'})}\n\n"
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error("[stream] error mid-stream: %s", e)
|
|
|
|
|
+ yield f"data: {json.dumps({'type': 'error', 'detail': str(e)})}\n\n"
|
|
|
|
|
+ finally:
|
|
|
|
|
+ # Telemetry written inline at stream end (~1ms SQLite write)
|
|
|
|
|
+ latency_ms = int((time.perf_counter() - started) * 1000)
|
|
|
|
|
+ _log_ask(
|
|
|
|
|
+ datetime.utcnow().isoformat(), sid, ip, qtxt, body.scope,
|
|
|
|
|
+ body.scope in ("state_only", "state_plus_local"),
|
|
|
|
|
+ latency_ms, CHAT_MODEL, all_sources, "".join(tokens),
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ return StreamingResponse(
|
|
|
|
|
+ generate(),
|
|
|
|
|
+ media_type="text/event-stream",
|
|
|
|
|
+ headers={
|
|
|
|
|
+ "Cache-Control": "no-cache",
|
|
|
|
|
+ "X-Accel-Buffering": "no", # tell Nginx/Cloudflare not to buffer SSE
|
|
|
|
|
+ },
|
|
|
|
|
+ )
|