1 개월 전 · f86bced6e5
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -59,6 +59,9 @@ Multiple display devices can connect simultaneously.
 
				   - Requires HuggingFace token and accepted licence for `pyannote/speaker-diarization-3.1` and `pyannote/segmentation-3.0`
			
 
				   - Sortformer backend exists but requires NVIDIA NeMo — not installed; diart is the active backend
			
 
				 
			
 
				+RTX 5060 Ti / CUDA 13 note: ctranslate2 must be pinned to 4.5.0. Install nvidia-cublas-cu12, nvidia-cudnn-cu12, nvidia-cuda-runtime-cu12 explicitly. setuptools must be <82 to avoid pkg_resources import error. Confirmed working: CUDA devices: 1.
			
 
				+
			
 
				+
			
 
				 ### WhisperLiveKit Launch Notes
			
 
				 
			
 
				 `bridge/whisper_launcher.py` must be used instead of `wlk` directly. It applies two patches before loading WhisperLiveKit:
			
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -7,7 +7,7 @@ receives transcription + speaker diarization, buffers sentences, and
 
				 publishes rolling 3-line JSON to Mosquitto MQTT for the e-ink display.
			
 
				 
			
 
				 Start WhisperLiveKit with:
			
 
				-    wlk --model large-v3 --language en --diarization
			
 
				+    wlk --model_size large-v3 --language en --diarization
			
 
				 
			
 
				 Run this script:
			
 
				     python bridge.py
			
@@ -46,7 +46,7 @@ DISPLAY_LINES    = 3
 
				 # Set to a device index (integer) to force a specific microphone.
			
 
				 # Leave as None to use the Windows default input device.
			
 
				 # Run bridge.py once to see available device indices printed at startup.
			
 
				-AUDIO_DEVICE: int | None = None
			
 
				+AUDIO_DEVICE: int | None = 12
			
 
				 
			
 
				 SPEAKERS_FILE = Path(__file__).parent / "speakers.json"
			
 
				 
			
@@ -218,6 +218,16 @@ def build_mqtt_client() -> mqtt.Client:
 
				 # ── WebSocket + audio pipeline ────────────────────────────────────────────────
			
 
				 
			
 
				 async def _sender(ws, queue: asyncio.Queue) -> None:
			
 
				+    # Send config handshake first — WhisperLiveKit needs this before audio
			
 
				+    config = json.dumps({
			
 
				+        "uid": "bridge-client",
			
 
				+        "language": "en",
			
 
				+        "task": "transcribe",
			
 
				+        "model_size": "large-v3",
			
 
				+        "use_vad": True,
			
 
				+    })
			
 
				+    await ws.send(config)
			
 
				+    # Drain any stale chunks
			
 
				     while not queue.empty():
			
 
				         queue.get_nowait()
			
 
				     while True:
			
@@ -306,58 +316,55 @@ def _choose_audio_device() -> int | None:
 
				     return idx
			
 
				 
			
 
				 
			
 
				-async def audio_ws_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
			
 
				+# Remove the WebSocket audio sender entirely.
			
 
				+# Use sounddevice → AudioProcessor directly via the Python API.
			
 
				+
			
 
				+from whisperlivekit import AudioProcessor, TranscriptionEngine
			
 
				+
			
 
				+async def audio_processor_loop(state: BridgeState, mqtt_client: mqtt.Client, engine: TranscriptionEngine) -> None:
			
 
				     audio_queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=120)
			
 
				     loop = asyncio.get_running_loop()
			
 
				 
			
 
				     def audio_callback(indata: np.ndarray, frames: int, time_info, status) -> None:
			
 
				         if status:
			
 
				             print(f"[Audio] {status}")
			
 
				-        chunk = indata.tobytes()
			
 
				-        def _put():
			
 
				-            try:
			
 
				-                audio_queue.put_nowait(chunk)
			
 
				-            except asyncio.QueueFull:
			
 
				-                pass
			
 
				-        loop.call_soon_threadsafe(_put)
			
 
				+        chunk = indata.astype(np.float32).tobytes()
			
 
				+        loop.call_soon_threadsafe(lambda: audio_queue.put_nowait(chunk) if not audio_queue.full() else None)
			
 
				 
			
 
				     device = _choose_audio_device()
			
 
				     if device is None:
			
 
				-        print("[Audio] No input device available — audio pipeline cannot start.")
			
 
				+        print("[Audio] No input device — cannot start.")
			
 
				         return
			
 
				 
			
 
				-    with sd.InputStream(
			
 
				-        device=device,
			
 
				-        samplerate=SAMPLE_RATE,
			
 
				-        channels=CHANNELS,
			
 
				-        dtype="int16",
			
 
				-        blocksize=BLOCKSIZE,
			
 
				-        callback=audio_callback,
			
 
				-    ):
			
 
				-        flusher   = asyncio.create_task(_flusher(state, mqtt_client))
			
 
				-        reloader  = asyncio.create_task(_speaker_reloader(state))
			
 
				-        try:
			
 
				+    audio_processor = AudioProcessor(transcription_engine=engine)
			
 
				+    results_generator = await audio_processor.create_tasks()
			
 
				+
			
 
				+    async def _receive_results():
			
 
				+        async for response in results_generator:
			
 
				+            # response is a FrontData dataclass, not a dict
			
 
				+            text     = (getattr(response, "text", None) or getattr(response, "buffer_transcription", None) or "").strip()
			
 
				+            is_final = getattr(response, "is_final", False) or getattr(response, "end_of_segment", False)
			
 
				+            speaker  = getattr(response, "speaker", None)
			
 
				+            if is_final and text:
			
 
				+                print(f"[Whisper] ({speaker or '?'}) {text}")
			
 
				+                state.push_final(text, speaker, mqtt_client)
			
 
				+
			
 
				+    async def _send_audio():
			
 
				+        with sd.InputStream(
			
 
				+            device=device, samplerate=SAMPLE_RATE, channels=CHANNELS,
			
 
				+            dtype="float32", blocksize=BLOCKSIZE, callback=audio_callback,
			
 
				+        ):
			
 
				             while True:
			
 
				-                try:
			
 
				-                    print(f"[WS] Connecting to {WS_URL} ...")
			
 
				-                    async with websockets.connect(WS_URL, max_size=2**23) as ws:
			
 
				-                        print("[WS] Connected")
			
 
				-                        send_t = asyncio.create_task(_sender(ws, audio_queue))
			
 
				-                        recv_t = asyncio.create_task(_receiver(ws, state, mqtt_client))
			
 
				-                        done, pending = await asyncio.wait(
			
 
				-                            [send_t, recv_t], return_when=asyncio.FIRST_COMPLETED
			
 
				-                        )
			
 
				-                        for t in pending:
			
 
				-                            t.cancel()
			
 
				-                        for t in done:
			
 
				-                            if not t.cancelled() and (exc := t.exception()):
			
 
				-                                print(f"[WS] Task error: {exc}")
			
 
				-                except (websockets.ConnectionClosed, OSError, ConnectionRefusedError) as exc:
			
 
				-                    print(f"[WS] {exc}  — retrying in 3 s...")
			
 
				-                    await asyncio.sleep(3)
			
 
				-        finally:
			
 
				-            flusher.cancel()
			
 
				-            reloader.cancel()
			
 
				+                chunk = await audio_queue.get()
			
 
				+                await audio_processor.process_audio(chunk)
			
 
				+
			
 
				+    flusher  = asyncio.create_task(_flusher(state, mqtt_client))
			
 
				+    reloader = asyncio.create_task(_speaker_reloader(state))
			
 
				+    try:
			
 
				+        await asyncio.gather(_send_audio(), _receive_results())
			
 
				+    finally:
			
 
				+        flusher.cancel()
			
 
				+        reloader.cancel()
			
 
				 
			
 
				 
			
 
				 def run_async_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
			
@@ -367,17 +374,17 @@ def run_async_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
 
				 # ── Entry point ───────────────────────────────────────────────────────────────
			
 
				 
			
 
				 def main() -> None:
			
 
				+    from whisperlivekit import TranscriptionEngine
			
 
				     state       = BridgeState()
			
 
				     mqtt_client = build_mqtt_client()
			
 
				+    engine      = TranscriptionEngine(model_size="large-v3", lan="en", diarization=False)
			
 
				 
			
 
				-    ws_thread = threading.Thread(
			
 
				-        target=run_async_loop, args=(state, mqtt_client), daemon=True
			
 
				-    )
			
 
				-    ws_thread.start()
			
 
				-    print(f"[Bridge] Speaker names loaded from {SPEAKERS_FILE}")
			
 
				-    print("[Bridge] Audio pipeline running — speaker admin at http://localhost:8001")
			
 
				-    print("[Bridge] Close this window to quit")
			
 
				+    def _run():
			
 
				+        asyncio.run(audio_processor_loop(state, mqtt_client, engine))
			
 
				 
			
 
				+    ws_thread = threading.Thread(target=_run, daemon=True)
			
 
				+    ws_thread.start()
			
 
				+    print("[Bridge] Audio pipeline running")
			
 
				     try:
			
 
				         ws_thread.join()
			
 
				     except KeyboardInterrupt:
			
--- a/bridge/requirements.txt
+++ b/bridge/requirements.txt
@@ -1,9 +1,21 @@
 
				+# Core bridge dependencies
			
 
				 paho-mqtt>=2.0
			
 
				 websockets>=12.0
			
 
				 sounddevice>=0.4.6
			
 
				-numpy>=1.24
			
 
				 fastapi>=0.111
			
 
				 uvicorn>=0.29
			
 
				 python-multipart>=0.0.9
			
 
				 miniaudio>=1.59
			
 
				 imageio-ffmpeg>=0.4.8
			
 
				+
			
 
				+# numpy — floor set by pyannote-core/metrics requirement
			
 
				+numpy>=2.0
			
 
				+
			
 
				+# ctranslate2 pinned — 4.5.0 confirmed working with RTX 5060 Ti / CUDA 13 + CUDA 12 libs
			
 
				+ctranslate2==4.5.0
			
 
				+setuptools<82
			
 
				+
			
 
				+# CUDA 12 runtime libraries (required on CUDA 13 drivers for ctranslate2 compatibility)
			
 
				+nvidia-cublas-cu12
			
 
				+nvidia-cudnn-cu12
			
 
				+nvidia-cuda-runtime-cu12
			
--- a/start.bat
+++ b/start.bat
@@ -72,7 +72,8 @@ echo.
 
				 
			
 
				 :: Activate venv and launch WhisperLiveKit via the compatibility launcher
			
 
				 :: The launcher patches torchaudio for diart and makes ffmpeg available.
			
 
				-start "Whisper Transcription Server" cmd /k "call .venv\Scripts\activate.bat && set HF_TOKEN=%HF_TOKEN% && echo Starting WhisperLiveKit (%WHISPER_MODEL%) with speaker diarization... && python bridge\whisper_launcher.py --model %WHISPER_MODEL% --lan en --diarization-backend diart"
			
 
				+:: start "Whisper Transcription Server" cmd /k "call .venv\Scripts\activate.bat && set HF_TOKEN=%HF_TOKEN% && echo Starting WhisperLiveKit (%WHISPER_MODEL%) with speaker diarization... && python bridge\whisper_launcher.py --model %WHISPER_MODEL% --language en --backend faster-whisper --diarization-backend diart"
			
 
				+start "Whisper Transcription Server" cmd /k "call .venv\Scripts\activate.bat && set HF_TOKEN=%HF_TOKEN% && set IMAGEIO_FFMPEG_EXE=.venv\Lib\site-packages\imageio_ffmpeg\binaries\ffmpeg-win-x86_64-v7.1.exe && wlk --model %WHISPER_MODEL% --language en --backend faster-whisper --diarization-backend diart"
			
 
				 
			
 
				 :: Give Whisper more time on first run — diarization model downloads ~500 MB
			
 
				 timeout /t 15 /nobreak >nul