소스 검색

Improvements on bridge.py

Benjamin Harris 1 개월 전
부모
커밋
f86bced6e5
4개의 변경된 파일74개의 추가작업 그리고 51개의 파일을 삭제
  1. 3 0
      CLAUDE.md
  2. 56 49
      bridge/bridge.py
  3. 13 1
      bridge/requirements.txt
  4. 2 1
      start.bat

+ 3 - 0
CLAUDE.md

@@ -59,6 +59,9 @@ Multiple display devices can connect simultaneously.
   - Requires HuggingFace token and accepted licence for `pyannote/speaker-diarization-3.1` and `pyannote/segmentation-3.0`
   - Sortformer backend exists but requires NVIDIA NeMo — not installed; diart is the active backend
 
+RTX 5060 Ti / CUDA 13 note: ctranslate2 must be pinned to 4.5.0. Install nvidia-cublas-cu12, nvidia-cudnn-cu12, nvidia-cuda-runtime-cu12 explicitly. setuptools must be <82 to avoid pkg_resources import error. Confirmed working: CUDA devices: 1.
+
+
 ### WhisperLiveKit Launch Notes
 
 `bridge/whisper_launcher.py` must be used instead of `wlk` directly. It applies two patches before loading WhisperLiveKit:

+ 56 - 49
bridge/bridge.py

@@ -7,7 +7,7 @@ receives transcription + speaker diarization, buffers sentences, and
 publishes rolling 3-line JSON to Mosquitto MQTT for the e-ink display.
 
 Start WhisperLiveKit with:
-    wlk --model large-v3 --language en --diarization
+    wlk --model_size large-v3 --language en --diarization
 
 Run this script:
     python bridge.py
@@ -46,7 +46,7 @@ DISPLAY_LINES    = 3
 # Set to a device index (integer) to force a specific microphone.
 # Leave as None to use the Windows default input device.
 # Run bridge.py once to see available device indices printed at startup.
-AUDIO_DEVICE: int | None = None
+AUDIO_DEVICE: int | None = 12
 
 SPEAKERS_FILE = Path(__file__).parent / "speakers.json"
 
@@ -218,6 +218,16 @@ def build_mqtt_client() -> mqtt.Client:
 # ── WebSocket + audio pipeline ────────────────────────────────────────────────
 
 async def _sender(ws, queue: asyncio.Queue) -> None:
+    # Send config handshake first — WhisperLiveKit needs this before audio
+    config = json.dumps({
+        "uid": "bridge-client",
+        "language": "en",
+        "task": "transcribe",
+        "model_size": "large-v3",
+        "use_vad": True,
+    })
+    await ws.send(config)
+    # Drain any stale chunks
     while not queue.empty():
         queue.get_nowait()
     while True:
@@ -306,58 +316,55 @@ def _choose_audio_device() -> int | None:
     return idx
 
 
-async def audio_ws_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
+# Remove the WebSocket audio sender entirely.
+# Use sounddevice → AudioProcessor directly via the Python API.
+
+from whisperlivekit import AudioProcessor, TranscriptionEngine
+
+async def audio_processor_loop(state: BridgeState, mqtt_client: mqtt.Client, engine: TranscriptionEngine) -> None:
     audio_queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=120)
     loop = asyncio.get_running_loop()
 
     def audio_callback(indata: np.ndarray, frames: int, time_info, status) -> None:
         if status:
             print(f"[Audio] {status}")
-        chunk = indata.tobytes()
-        def _put():
-            try:
-                audio_queue.put_nowait(chunk)
-            except asyncio.QueueFull:
-                pass
-        loop.call_soon_threadsafe(_put)
+        chunk = indata.astype(np.float32).tobytes()
+        loop.call_soon_threadsafe(lambda: audio_queue.put_nowait(chunk) if not audio_queue.full() else None)
 
     device = _choose_audio_device()
     if device is None:
-        print("[Audio] No input device available  audio pipeline cannot start.")
+        print("[Audio] No input device — cannot start.")
         return
 
-    with sd.InputStream(
-        device=device,
-        samplerate=SAMPLE_RATE,
-        channels=CHANNELS,
-        dtype="int16",
-        blocksize=BLOCKSIZE,
-        callback=audio_callback,
-    ):
-        flusher   = asyncio.create_task(_flusher(state, mqtt_client))
-        reloader  = asyncio.create_task(_speaker_reloader(state))
-        try:
+    audio_processor = AudioProcessor(transcription_engine=engine)
+    results_generator = await audio_processor.create_tasks()
+
+    async def _receive_results():
+        async for response in results_generator:
+            # response is a FrontData dataclass, not a dict
+            text     = (getattr(response, "text", None) or getattr(response, "buffer_transcription", None) or "").strip()
+            is_final = getattr(response, "is_final", False) or getattr(response, "end_of_segment", False)
+            speaker  = getattr(response, "speaker", None)
+            if is_final and text:
+                print(f"[Whisper] ({speaker or '?'}) {text}")
+                state.push_final(text, speaker, mqtt_client)
+
+    async def _send_audio():
+        with sd.InputStream(
+            device=device, samplerate=SAMPLE_RATE, channels=CHANNELS,
+            dtype="float32", blocksize=BLOCKSIZE, callback=audio_callback,
+        ):
             while True:
-                try:
-                    print(f"[WS] Connecting to {WS_URL} ...")
-                    async with websockets.connect(WS_URL, max_size=2**23) as ws:
-                        print("[WS] Connected")
-                        send_t = asyncio.create_task(_sender(ws, audio_queue))
-                        recv_t = asyncio.create_task(_receiver(ws, state, mqtt_client))
-                        done, pending = await asyncio.wait(
-                            [send_t, recv_t], return_when=asyncio.FIRST_COMPLETED
-                        )
-                        for t in pending:
-                            t.cancel()
-                        for t in done:
-                            if not t.cancelled() and (exc := t.exception()):
-                                print(f"[WS] Task error: {exc}")
-                except (websockets.ConnectionClosed, OSError, ConnectionRefusedError) as exc:
-                    print(f"[WS] {exc}  — retrying in 3 s...")
-                    await asyncio.sleep(3)
-        finally:
-            flusher.cancel()
-            reloader.cancel()
+                chunk = await audio_queue.get()
+                await audio_processor.process_audio(chunk)
+
+    flusher  = asyncio.create_task(_flusher(state, mqtt_client))
+    reloader = asyncio.create_task(_speaker_reloader(state))
+    try:
+        await asyncio.gather(_send_audio(), _receive_results())
+    finally:
+        flusher.cancel()
+        reloader.cancel()
 
 
 def run_async_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
@@ -367,17 +374,17 @@ def run_async_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
 # ── Entry point ───────────────────────────────────────────────────────────────
 
 def main() -> None:
+    from whisperlivekit import TranscriptionEngine
     state       = BridgeState()
     mqtt_client = build_mqtt_client()
+    engine      = TranscriptionEngine(model_size="large-v3", lan="en", diarization=False)
 
-    ws_thread = threading.Thread(
-        target=run_async_loop, args=(state, mqtt_client), daemon=True
-    )
-    ws_thread.start()
-    print(f"[Bridge] Speaker names loaded from {SPEAKERS_FILE}")
-    print("[Bridge] Audio pipeline running — speaker admin at http://localhost:8001")
-    print("[Bridge] Close this window to quit")
+    def _run():
+        asyncio.run(audio_processor_loop(state, mqtt_client, engine))
 
+    ws_thread = threading.Thread(target=_run, daemon=True)
+    ws_thread.start()
+    print("[Bridge] Audio pipeline running")
     try:
         ws_thread.join()
     except KeyboardInterrupt:

+ 13 - 1
bridge/requirements.txt

@@ -1,9 +1,21 @@
+# Core bridge dependencies
 paho-mqtt>=2.0
 websockets>=12.0
 sounddevice>=0.4.6
-numpy>=1.24
 fastapi>=0.111
 uvicorn>=0.29
 python-multipart>=0.0.9
 miniaudio>=1.59
 imageio-ffmpeg>=0.4.8
+
+# numpy — floor set by pyannote-core/metrics requirement
+numpy>=2.0
+
+# ctranslate2 pinned — 4.5.0 confirmed working with RTX 5060 Ti / CUDA 13 + CUDA 12 libs
+ctranslate2==4.5.0
+setuptools<82
+
+# CUDA 12 runtime libraries (required on CUDA 13 drivers for ctranslate2 compatibility)
+nvidia-cublas-cu12
+nvidia-cudnn-cu12
+nvidia-cuda-runtime-cu12

+ 2 - 1
start.bat

@@ -72,7 +72,8 @@ echo.
 
 :: Activate venv and launch WhisperLiveKit via the compatibility launcher
 :: The launcher patches torchaudio for diart and makes ffmpeg available.
-start "Whisper Transcription Server" cmd /k "call .venv\Scripts\activate.bat && set HF_TOKEN=%HF_TOKEN% && echo Starting WhisperLiveKit (%WHISPER_MODEL%) with speaker diarization... && python bridge\whisper_launcher.py --model %WHISPER_MODEL% --lan en --diarization-backend diart"
+:: start "Whisper Transcription Server" cmd /k "call .venv\Scripts\activate.bat && set HF_TOKEN=%HF_TOKEN% && echo Starting WhisperLiveKit (%WHISPER_MODEL%) with speaker diarization... && python bridge\whisper_launcher.py --model %WHISPER_MODEL% --language en --backend faster-whisper --diarization-backend diart"
+start "Whisper Transcription Server" cmd /k "call .venv\Scripts\activate.bat && set HF_TOKEN=%HF_TOKEN% && set IMAGEIO_FFMPEG_EXE=.venv\Lib\site-packages\imageio_ffmpeg\binaries\ffmpeg-win-x86_64-v7.1.exe && wlk --model %WHISPER_MODEL% --language en --backend faster-whisper --diarization-backend diart"
 
 :: Give Whisper more time on first run — diarization model downloads ~500 MB
 timeout /t 15 /nobreak >nul