1 mesiac pred · ee90055248
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -2,9 +2,13 @@
 
				 
			
 
				 This file provides context for AI-assisted development sessions on the Church Live Transcription Display project.
			
 
				 
			
 
				+---
			
 
				+
			
 
				 ## Project Summary
			
 
				 
			
 
				-A live captioning system for deaf/hard-of-hearing church congregants. A Windows PC captures audio, transcribes it locally using Whisper (GPU-accelerated), and sends rolling text over MQTT to an ESP32 driving a large e-ink display. No cloud services. No internet required during operation.
			
 
				+A live captioning system for deaf/hard-of-hearing church congregants. A Windows PC captures audio, transcribes it locally using Whisper (GPU-accelerated), performs real-time speaker diarization, maps anonymous speaker IDs to real names, and sends speaker-tagged rolling text over MQTT to an ESP32 driving a large e-ink display. No cloud services. No internet required during operation.
			
 
				+
			
 
				+---
			
 
				 
			
 
				 ## Architecture
			
 
				 
			
@@ -12,33 +16,67 @@ A live captioning system for deaf/hard-of-hearing church congregants. A Windows
 
				 [Audio source]
			
 
				      ↓ (USB mic or mixer line-in)
			
 
				 [Windows PC]
			
 
				-  ├── WhisperLiveKit (local Whisper server, WebSocket on port 8000)
			
 
				+  ├── WhisperLiveKit
			
 
				+  │     ├── Whisper large-v3 (transcription)
			
 
				+  │     └── Streaming Sortformer (real-time speaker diarization)
			
 
				+  │     WebSocket output: ws://localhost:8000/asr
			
 
				+  │
			
 
				   ├── Mosquitto MQTT broker (port 1883)
			
 
				-  └── bridge.py (Python: WS subscriber → sentence buffer → MQTT publisher)
			
 
				-     ↓ (WiFi / MQTT topic: display/text)
			
 
				-[ESP32-WROOM or S3]
			
 
				+  │
			
 
				+  ├── bridge.py
			
 
				+  │     ├── Subscribes to Whisper WebSocket
			
 
				+  │     ├── Receives: {text, speaker, is_final, ...}
			
 
				+  │     ├── Resolves speaker_id → name via speaker_registry
			
 
				+  │     ├── Buffers text to sentence boundary
			
 
				+  │     └── Publishes JSON payload to MQTT topic display/text
			
 
				+  │
			
 
				+  └── admin_ui.py (Tkinter)
			
 
				+        ├── Shows "New speaker detected" prompts
			
 
				+        ├── Operator types name once per unknown speaker
			
 
				+        └── Updates speaker_registry in real time
			
 
				+
			
 
				+     ↓ WiFi / MQTT
			
 
				+[ESP32-S3]
			
 
				   └── Waveshare 7.5" V2 e-ink display (SPI, GxEPD2 library)
			
 
				 ```
			
 
				 
			
 
				+---
			
 
				+
			
 
				 ## PC Environment
			
 
				 
			
 
				 - OS: Windows 10/11
			
 
				-- GPU: NVIDIA RTX series (tested with RTX 4070 Super)
			
 
				+- GPU: NVIDIA RTX series (RTX 4070 Super available)
			
 
				 - Python: 3.11+
			
 
				 - MQTT broker: Mosquitto (localhost:1883)
			
 
				-- Whisper server: WhisperLiveKit (`wlk --model large-v3 --language en`)
			
 
				-- Whisper WebSocket: `ws://localhost:8000/asr`
			
 
				+- Whisper server: WhisperLiveKit with `--diarization` flag
			
 
				+  - Command: `wlk --model large-v3 --language en --diarization` (`wlk` is the short form of `whisperlivekit-server`)
			
 
				+  - WebSocket: `ws://localhost:8000/asr`
			
 
				+- Diarization model: Streaming Sortformer (SOTA 2025, via WhisperLiveKit)
			
 
				+  - Fallback: Diart (more stable, slightly older, also integrated in WhisperLiveKit)
			
 
				+  - The Diart fallback requires pyannote model access (HuggingFace token + model agreement)
			
 
				+
			
 
				+### WhisperLiveKit Diarization Setup Notes
			
 
				+- Install with diarization extra: `pip install whisperlivekit[diarization-sortformer]`
			
 
				+- Sortformer and Voxtral extras are incompatible — install in separate environments
			
 
				+- For the Diart fallback (pyannote-based), you must accept the HuggingFace user conditions for:
			
 
				+  - `pyannote/segmentation`
			
 
				+  - `pyannote/segmentation-3.0`
			
 
				+  - `pyannote/embedding`
			
 
				+- Login: `huggingface-cli login`
			
 
				+- Streaming Sortformer is marked as in active development — fallback to Diart if unstable
			
 
				+
			
 
				+---
			
 
				 
			
 
				 ## ESP32 Environment
			
 
				 
			
 
				-- Board: ESP32-WROOM-32 or ESP32-S3
			
 
				-- Framework: Arduino (via PlatformIO)
			
 
				+- Board: ESP32-S3 (PSRAM required for large font glyph buffers)
			
 
				+- Framework: Arduino via PlatformIO
			
 
				 - Display: Waveshare 7.5" V2 (800×480 pixels, black/white)
			
 
				 - Display library: GxEPD2
			
 
				-- MQTT library: PubSubClient
			
 
				+- MQTT library: PubSubClient (increase buffer: `client.setBufferSize(512)`)
			
 
				 - Build tool: PlatformIO (VSCode)
			
 
				 
			
 
				-### SPI Wiring (Waveshare 7.5" V2 to ESP32)
			
 
				+### SPI Wiring (Waveshare 7.5" V2 → ESP32)
			
 
				 
			
 
				 | Display Pin | ESP32 Pin |
			
 
				 |---|---|
			
@@ -51,79 +89,168 @@ A live captioning system for deaf/hard-of-hearing church congregants. A Windows
 
				 | GND | GND |
			
 
				 | VCC | 3.3V |
			
 
				 
			
 
				+---
			
 
				+
			
 
				 ## MQTT Topics
			
 
				 
			
 
				 | Topic | Direction | Payload |
			
 
				 |---|---|---|
			
 
				-| `display/text` | PC → ESP32 | JSON: `{"lines": ["line1", "line2", "line3"]}` |
			
 
				-| `display/clear` | PC → ESP32 | Empty / any |
			
 
				+| `display/text` | PC → ESP32 | JSON: see payload schema below |
			
 
				+| `display/clear` | PC → ESP32 | Empty / any value |
			
 
				 | `display/status` | ESP32 → PC | JSON: `{"ready": true}` |
			
 
				 
			
 
				+### display/text Payload Schema
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "speaker": "PASTOR JOHN",
			
 
				+  "speaker_changed": true,
			
 
				+  "lines": [
			
 
				+    "...and He said unto them, go",
			
 
				+    "into all the world and preach"
			
 
				+  ]
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+- `speaker`: resolved name string, or `null` if unknown/unnamed
			
 
				+- `speaker_changed`: `true` triggers full display refresh + speaker header redraw
			
 
				+- `lines`: array of pre-wrapped strings, max 40 chars each, max 3 items
			
 
				+
			
 
				+---
			
 
				+
			
 
				 ## Key Files
			
 
				 
			
 
				-- `bridge/bridge.py` — Main Python bridge. Connects to Whisper WS, buffers text, publishes to MQTT.
			
 
				-- `esp32/src/main.cpp` — ESP32 firmware. WiFi + MQTT client, renders text to e-ink.
			
 
				-- `esp32/platformio.ini` — Board and library config.
			
 
				+### `bridge/bridge.py`
			
 
				+Main orchestrator. Captures microphone audio and streams it to the WhisperLiveKit WebSocket, and connects to Mosquitto. Receives incremental diarized transcription. Buffers text. Resolves speaker names. Publishes MQTT payloads.
			
 
				+
			
 
				+**WebSocket message fields from WhisperLiveKit (with diarization):**
			
 
				+```json
			
 
				+{
			
 
				+  "text": "and He said unto them",
			
 
				+  "speaker": "SPEAKER_0",
			
 
				+  "is_final": true,
			
 
				+  "start": 12.4,
			
 
				+  "end": 15.1
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**Bridge logic:**
			
 
				+1. On each `is_final` segment, extract `text` and `speaker`
			
 
				+2. Resolve `speaker` → name via `speaker_registry`
			
 
				+3. If speaker is unknown, notify `admin_ui` (via queue or callback)
			
 
				+4. Accumulate text into rolling buffer
			
 
				+5. On sentence boundary or 4s timeout, word-wrap and publish to MQTT
			
 
				+6. Set `speaker_changed: true` if speaker differs from last published segment
			
 
				+
			
 
				+### `bridge/speaker_registry.py`
			
 
				+Manages the session-persistent mapping of `SPEAKER_N` IDs to real names.
			
 
				+
			
 
				+```python
			
 
				+# Core interface
			
 
				+registry = SpeakerRegistry()
			
 
				+registry.assign(speaker_id="SPEAKER_0", name="Pastor John")
			
 
				+name = registry.resolve("SPEAKER_0")  # Returns "Pastor John" or None
			
 
				+registry.is_known("SPEAKER_1")        # Returns False
			
 
				+registry.save_session()               # Persist to JSON for the session
			
 
				+```
			
 
				 
			
 
				-## Design Constraints & Decisions
			
 
				+- Session data stored in `bridge/sessions/YYYY-MM-DD.json`
			
 
				+- v2: will also store voice embeddings per speaker for cross-session recognition
			
 
				 
			
 
				-### Refresh Strategy
			
 
				-- Full e-ink refresh: ~1.5–2 seconds with flash. Acceptable for sentence-level updates.
			
 
				-- Partial refresh: ~300ms, some ghosting. Use for rapid updates if needed.
			
 
				-- **Current approach**: buffer until sentence boundary or 4-second silence, then push full screen update.
			
 
				-- Display shows 3 lines of text. New text pushes old text up; oldest line drops off.
			
 
				+### `bridge/admin_ui.py`
			
 
				+Lightweight Tkinter window. Runs in a separate thread alongside bridge.py.
			
 
				 
			
 
				-### Text Formatting
			
 
				-- Target font size: large enough to read at 3–5 metres (approx 36–48px equivalent at 800px wide)
			
 
				-- At ~800px wide with a large font: approximately 35–45 characters per line
			
 
				-- Lines wrap at word boundaries
			
 
				-- All caps optional for readability (configurable)
			
 
				+**Behaviour:**
			
 
				+- Displays current speaker label and resolved name (or "Unknown")
			
 
				+- When a new unknown `SPEAKER_N` is detected, shows a prompt: "New speaker detected. Who is this?"
			
 
				+- Operator types name and hits Enter
			
 
				+- Calls `registry.assign()` and the display updates immediately
			
 
				+- Also shows a manual override: operator can retype any name at any time
			
 
				 
			
 
				-### Audio Input
			
 
				-- Preferred: direct feed from church mixing desk (line-in or USB audio interface)
			
 
				-- Fallback: USB condenser microphone near pulpit/lectern
			
 
				-- Whisper performs best with clean, low-noise input
			
 
				-- VAD (Voice Activity Detection) in WhisperLiveKit handles silence automatically
			
 
				+### `esp32/src/main.cpp`
			
 
				+ESP32 firmware. WiFi + MQTT client. Receives JSON payloads and renders to e-ink.
			
 
				 
			
 
				-### Network
			
 
				-- All on local WiFi (church LAN or dedicated hotspot)
			
 
				-- MQTT broker on Windows PC
			
 
				-- ESP32 connects to same WiFi network
			
 
				-- Static IP recommended for ESP32 to avoid reconnection delays
			
 
				+**Display rendering logic:**
			
 
				+- On `speaker_changed: true`: full refresh, print speaker name in large CAPS on line 1, then print text lines below
			
 
				+- On `speaker_changed: false`: partial refresh, overwrite text lines only (speaker header stays)
			
 
				+- Track partial refresh count; force full refresh every 10 cycles to clear ghosting
			
 
				+- Font: large enough for ~40 chars across 800px (approx FreeSans 18–24pt at this resolution)
			
 
				 
			
 
				-## Bridge Script Logic (bridge.py)
			
 
				+---
			
 
				+
			
 
				+## Display Layout (800×480 pixels)
			
 
				 
			
 
				 ```
			
 
				-1. Connect to Mosquitto MQTT broker
			
 
				-2. Connect to WhisperLiveKit WebSocket (ws://localhost:8000/asr)
			
 
				-3. Receive partial transcription updates
			
 
				-4. Accumulate words into a sentence buffer
			
 
				-5. On sentence-end signal (or timeout):
			
 
				-   a. Word-wrap text into lines (max ~40 chars each)
			
 
				-   b. Maintain a rolling 3-line buffer
			
 
				-   c. Publish JSON payload to MQTT topic display/text
			
 
				-6. On reconnect events: re-establish WS and MQTT connections
			
 
				+┌────────────────────────────────────────────────┐  ← full width
			
 
				+│ PASTOR JOHN                                    │  ← speaker name, top ~80px, bold/large
			
 
				+│────────────────────────────────────────────────│
			
 
				+│ ...and He said unto them, go into all the      │  ← text line 1
			
 
				+│ world and preach the gospel to every           │  ← text line 2
			
 
				+│ creature. He that believeth and is baptised    │  ← text line 3
			
 
				+└────────────────────────────────────────────────┘
			
 
				 ```
			
 
				 
			
 
				-## Known Issues / Open Questions
			
 
				+- Speaker name zone: top ~80px
			
 
				+- Text zone: remaining ~400px (480 − 80), 3 lines at ~130px each
			
 
				+- On speaker change: full clear, redraw both zones
			
 
				+- On same speaker new text: partial refresh text zone only
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Speaker Diarization Notes
			
 
				 
			
 
				-- [ ] Partial refresh ghosting threshold — how many partial refreshes before forcing a full clear?
			
 
				-- [ ] Whisper latency with large-v3 model — may need to test medium or distil-large-v3 for lower latency
			
 
				-- [ ] Line-wrapping edge cases with long words (e.g. proper nouns, scripture references)
			
 
				-- [ ] ESP32 RAM: WROOM has 520KB; large font bitmaps may require PSRAM (use S3 variant)
			
 
				-- [ ] WiFi reconnection handling in firmware — need watchdog/retry logic
			
 
				+### v1 — Operator-Assisted Naming
			
 
				+- Zero prep before service
			
 
				+- admin_ui.py shows prompt when new `SPEAKER_N` appears
			
 
				+- Operator at sound desk types name (e.g. "Pastor John") once
			
 
				+- Registry holds the mapping for the entire session
			
 
				 
			
 
				-## Development Notes
			
 
				+### v2 — Voice Enrolment (future)
			
 
				+- Record 10–30s of natural speech from each speaker (not word lists)
			
 
				+- Extract speaker embedding using pyannote `SpeakerEmbedding` pipeline
			
 
				+- Store embedding in `bridge/profiles/<name>.npy`
			
 
				+- At runtime, compare incoming `SPEAKER_N` embedding to stored profiles
			
 
				+- If cosine similarity > threshold (~0.85), auto-assign name
			
 
				+- Fall back to operator prompt if no match above threshold
			
 
				 
			
 
				-- WhisperLiveKit WebSocket returns incremental JSON with `text` and `is_final` fields
			
 
				-- GxEPD2 supports both full and partial refresh; partial requires `setPartialWindow()`
			
 
				-- PubSubClient default packet size is 128 bytes — must increase to handle JSON payloads (~200 bytes)
			
 
				-- Use `client.setBufferSize(512)` in PubSubClient setup
			
 
				+### Known Diarization Constraints
			
 
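				+- Streaming Sortformer tracks 2–4 speakers reliably (the released streaming model is believed to be capped at 4 speakers — confirm before relying on more)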
			
 
				+- Works best with clean, low-noise audio — direct mixer feed strongly preferred
			
 
				+- Background music (worship) may confuse diarization; consider muting music channel on the transcription input
			
 
				+- Congregation responses ("Amen", "Hallelujah") may appear as brief unknown speakers — consider a minimum-duration filter (~2s) before triggering a speaker prompt
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Design Constraints & Open Questions
			
 
				+
			
 
				+- [ ] Streaming Sortformer stability in WhisperLiveKit — test early; fall back to Diart if needed
			
 
				+- [ ] Minimum speaker segment duration before triggering name prompt (avoid congregation one-liners)
			
 
				+- [ ] Partial refresh ghosting — determine optimal full-refresh interval for the chosen display
			
 
				+- [ ] ESP32-S3 PSRAM: confirm font glyph buffer fits; WROOM (no PSRAM) likely insufficient for large fonts
			
 
				+- [ ] Word-wrap edge cases: long proper nouns, scripture references, place names
			
 
				+- [ ] Session save/restore: if PC crashes mid-service, can operator reload speaker assignments quickly?
			
 
				+- [ ] Audio routing on Windows: ensure Whisper receives the mixer/mic channel, not system audio
			
 
				+
			
 
				+---
			
 
				 
			
 
				 ## Testing Approach
			
 
				 
			
 
				-1. Test Whisper server standalone: speak into mic, verify text in browser at `http://localhost:8000`
			
 
				-2. Test MQTT: use MQTT Explorer or `mosquitto_sub` to verify bridge publishes correctly
			
 
				-3. Test ESP32 display: send static MQTT messages manually before connecting bridge
			
 
				-4. End-to-end: full pipeline test with recorded sermon audio
			
 
				-5. In-situ trial: 1–2 Sunday services with a volunteer congregant providing feedback
			
 
				+1. **Whisper standalone**: speak into mic, verify text output in browser at `http://localhost:8000`
			
 
				+2. **Diarization standalone**: two people alternate speaking, verify `SPEAKER_0` / `SPEAKER_1` labels in WS output
			
 
				+3. **Registry + bridge**: run bridge.py, verify name prompts appear in admin_ui.py, verify MQTT payloads via `mosquitto_sub -t display/#`
			
 
				+4. **ESP32 display**: send static MQTT messages manually before connecting bridge
			
 
				+5. **End-to-end**: full pipeline test with recorded sermon audio (mix of 2–3 speakers)
			
 
				+6. **In-situ trial**: 1–2 Sunday services with a volunteer congregant providing feedback
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Development Sequence (Suggested)
			
 
				+
			
 
				+1. Get WhisperLiveKit running with `--diarization` flag, confirm WS output includes speaker labels
			
 
				+2. Write `bridge.py` (transcription only, no diarization yet) → verify MQTT publish works
			
 
				+3. Add `speaker_registry.py` and `admin_ui.py` → test name mapping loop
			
 
				+4. Integrate diarization into bridge — handle `speaker_changed` logic
			
 
				+5. Write ESP32 firmware — basic text display
			
 
				+6. Add speaker header zone and refresh logic to ESP32 firmware
			
 
				+7. Full end-to-end test on bench
			
 
				+8. Church trial
			
--- a/README.md
+++ b/README.md
@@ -1,94 +1,187 @@
 
				 # Church Live Transcription Display
			
 
				 
			
 
				-A live speech-to-text system for deaf and hard-of-hearing congregants, displaying real-time transcriptions on an e-ink screen driven by an ESP32 microcontroller.
			
 
				+A live speech-to-text system for deaf and hard-of-hearing congregants, displaying real-time transcriptions with speaker identification on an e-ink screen driven by an ESP32 microcontroller.
			
 
				 
			
 
				 ## Overview
			
 
				 
			
 
				-Audio from the church service is captured on a Windows PC, transcribed locally using a Whisper-based model, and the resulting text is pushed over WiFi/MQTT to an ESP32 that drives a large e-ink display. The display is readable in any lighting condition and requires no screen brightness — ideal for a church environment.
			
 
				+Audio from the church service is captured on a Windows PC, transcribed and speaker-diarized locally using WhisperLiveKit, and the resulting speaker-tagged text is pushed over WiFi/MQTT to an ESP32 that drives a large e-ink display. The display shows who is speaking alongside what they are saying, updates in real time, and requires no internet connection.
			
 
				 
			
 
				 ```
			
 
				-[Microphone / Mixer] → [Windows PC: Whisper transcription]
			
 
				-                              ↓ MQTT over WiFi
			
 
				-                       [ESP32 + e-ink display]
			
 
				+[Microphone / Mixer]
			
 
				+        ↓
			
 
				+[Windows PC]
			
 
				+  ├── WhisperLiveKit  (transcription + speaker diarization)
			
 
				+  ├── Mosquitto MQTT broker
			
 
				+  ├── bridge.py       (WebSocket → name mapping → MQTT)
			
 
				+  └── Speaker Admin UI (operator names speakers live)
			
 
				+        ↓ MQTT / WiFi
			
 
				+[ESP32 + e-ink display]
			
 
				 ```
			
 
				 
			
 
				 ## Goals
			
 
				 
			
 
				 - Real-time captions with minimal latency (target: < 3 seconds end-to-end)
			
 
				+- Speaker identification — display who is speaking when the speaker changes
			
 
				+- Named speakers — operator maps anonymous speaker IDs to real names during service
			
 
				+- Future: voice enrolment so names are matched automatically from pre-recorded samples
			
 
				 - Runs entirely on local network — no cloud dependency
			
 
				 - Readable at distance with large font (36–48pt equivalent)
			
 
				-- Displays 3–4 lines of rolling text, clearing as new content arrives
			
 
				 - Low cost, low complexity hardware
			
 
				 
			
 
				+---
			
 
				+
			
 
				+## Speaker Identification
			
 
				+
			
 
				+### How It Works
			
 
				+
			
 
				+WhisperLiveKit includes **Streaming Sortformer** (SOTA 2025), a real-time speaker diarization model developed by NVIDIA. It runs alongside Whisper transcription and tags each segment of speech with an anonymous speaker label (`SPEAKER_0`, `SPEAKER_1`, etc.).
			
 
				+
			
 
				+A name mapping layer in the bridge script translates these anonymous labels into real names, which are then included in the MQTT payload sent to the display.
			
 
				+
			
 
				+### Display Format
			
 
				+
			
 
				+When a speaker changes, their name is shown as a header line above their words. The name is not repeated on every line — only when the speaker changes.
			
 
				+
			
 
				+```
			
 
				+┌─────────────────────────────────┐
			
 
				+│ PASTOR JOHN                     │
			
 
				+│ ...and He said unto them, go    │
			
 
				+│ into all the world and preach   │
			
 
				+└─────────────────────────────────┘
			
 
				+
			
 
				+┌─────────────────────────────────┐
			
 
				+│ MARY (READER)                   │
			
 
				+│ A reading from Luke chapter 4,  │
			
 
				+│ verse 18...                     │
			
 
				+└─────────────────────────────────┘
			
 
				+```
			
 
				+
			
 
				+### Speaker Naming — Two Approaches
			
 
				+
			
 
				+**v1 — Operator-Assisted Naming (implemented first)**
			
 
				+
			
 
				+A simple admin UI runs on the PC alongside the bridge script. When a new unknown speaker is detected, the operator sees a prompt ("New speaker detected — who is this?") and types the name once. That name is stored for the session and used every time that speaker is detected again.
			
 
				+
			
 
				+- No setup required before the service
			
 
				+- Works from the very first Sunday
			
 
				+- Operator (e.g. sound desk volunteer) assigns names as speakers appear
			
 
				+
			
 
				+**v2 — Voice Enrolment (future upgrade)**
			
 
				+
			
 
				+Before the service, a short voice sample (10–30 seconds) is recorded for each expected speaker. The bridge script compares incoming speaker embeddings against enrolled voices and automatically assigns the correct name without operator input.
			
 
				+
			
 
				+- No operator intervention during the service
			
 
				+- More accurate for recurring speakers (pastor, regular readers)
			
 
				+- Enrolled voice profiles persist week to week
			
 
				+
			
 
				+### Typical Church Speakers
			
 
				+
			
 
				+| Role | Frequency | Notes |
			
 
				+|---|---|---|
			
 
				+| Pastor / Preacher | Every service | Primary speaker, longest segments |
			
 
				+| Worship leader | Most services | May overlap with congregation response |
			
 
				+| Reader / Scripture | Weekly | Short, distinct segments |
			
 
				+| Visiting speaker | Occasionally | New enrolment or operator naming needed |
			
 
				+| Announcements | Weekly | Often the same person each week |
			
 
				+
			
 
				+---
			
 
				+
			
 
				 ## System Components
			
 
				 
			
 
				 ### PC Side (Windows)
			
 
				-- **WhisperLiveKit** — local GPU-accelerated speech-to-text server with WebSocket output
			
 
				-- **Mosquitto** — lightweight MQTT broker running on the same PC
			
 
				-- **Python bridge script** — subscribes to Whisper WebSocket, buffers sentences, publishes to MQTT
			
 
				+- **WhisperLiveKit** — local GPU-accelerated transcription + diarization server
			
 
				+- **Mosquitto** — lightweight MQTT broker (same PC, port 1883)
			
 
				+- **bridge.py** — WebSocket subscriber, speaker name mapper, MQTT publisher
			
 
				+- **admin_ui.py** — lightweight operator interface for live speaker naming
			
 
				+- **speaker_registry.py** — manages speaker ID ↔ name mappings (v1) and, in future, voice enrolment (v2)
			
 
				 
			
 
				 ### ESP32 Side
			
 
				-- **ESP32 (WROOM or S3)** — WiFi-enabled microcontroller
			
 
				+- **ESP32-S3** — WiFi microcontroller (S3 preferred — PSRAM needed for large font bitmaps)
			
 
				 - **Waveshare e-ink display** — 7.5" V2 (800×480) or larger
			
 
				-- **GxEPD2 / Adafruit GFX** — display driver library
			
 
				-- **PubSubClient** — MQTT client library for Arduino
			
 
				+- **GxEPD2** — display driver library
			
 
				+- **PubSubClient** — MQTT client library
			
 
				+
			
 
				+---
			
 
				 
			
 
				 ## Hardware
			
 
				 
			
 
				 | Component | Model | Notes |
			
 
				 |---|---|---|
			
 
				-| Microcontroller | ESP32-WROOM-32 or ESP32-S3 | S3 preferred for more RAM |
			
 
				+| Microcontroller | ESP32-S3 | PSRAM required for large font bitmaps |
			
 
				 | Display | Waveshare 7.5" V2 e-Paper | 800×480, supports partial refresh |
			
 
				-| PC | Windows 10/11 with NVIDIA GPU | RTX series recommended for real-time Whisper |
			
 
				-| Microphone | USB condenser or mixer feed | Direct mixer feed preferred for clean audio |
			
 
				+| PC | Windows 10/11 with NVIDIA GPU | RTX series recommended |
			
 
				+| Microphone | USB condenser or direct mixer feed | Mixer feed preferred for clean diarization |
			
 
				+
			
 
				+---
			
 
				 
			
 
				 ## Key Design Decisions
			
 
				 
			
 
				-### Text Buffering Strategy
			
 
				-E-ink full refresh takes ~1–2 seconds. Rather than updating word-by-word, the bridge script accumulates text until a natural pause (sentence boundary or ~5 seconds of speech), then pushes a complete "screen's worth" as a single MQTT message. Partial refresh mode can be used for faster but ghosting-prone updates.
			
 
				+### Text & Speaker Buffering
			
 
				+The bridge script accumulates text until a sentence boundary or natural pause (~4s), then checks whether the speaker has changed. If the speaker is unchanged, only new text lines are pushed. If the speaker has changed, a full new payload is sent including the speaker name header, triggering a full display refresh.
			
 
				 
			
 
				 ### Display Layout
			
 
				-- 3–4 lines of large text
			
 
				-- Most recent line at bottom, scrolling upward
			
 
				-- Simple black-on-white, no graphics
			
 
				-- Font size prioritises readability at 3–5 metres
			
 
				+- **Line 1:** Speaker name in CAPS — printed only on speaker change
			
 
				+- **Lines 2–4:** Rolling transcription text, wrapping at ~40 chars per line
			
 
				+- On speaker change: full screen clear then redraw with new name header
			
 
				+- Font targets readability at 3–5 metres
			
 
				+
			
 
				+### E-ink Refresh Strategy
			
 
				+- Speaker change → **full refresh** (~1.5s flash — clean slate, acceptable at speaker transition)
			
 
				+- Same speaker, new text → **partial refresh** (~300ms, minor ghosting)
			
 
				+- Force full refresh every 10 partial refreshes to clear accumulated ghosting
			
 
				 
			
 
				 ### Network
			
 
				-- All traffic stays on local WiFi network
			
 
				-- MQTT broker on PC (port 1883)
			
 
				-- No internet required during operation
			
 
				+- All traffic on local WiFi (church LAN or dedicated hotspot)
			
 
				+- MQTT broker on Windows PC (port 1883)
			
 
				+- Static IP recommended for ESP32 to avoid reconnection delays
			
 
				+
			
 
				+---
			
 
				 
			
 
				 ## Repository Structure
			
 
				 
			
 
				 ```
			
 
				 /
			
 
				-├── README.md               — This file
			
 
				-├── CLAUDE.md               — AI assistant context for development sessions
			
 
				+├── README.md                     — This file
			
 
				+├── CLAUDE.md                     — AI assistant context for development sessions
			
 
				 ├── bridge/
			
 
				-│   └── bridge.py           — Python: Whisper WebSocket → MQTT publisher
			
 
				+│   ├── bridge.py                 — Main bridge: Whisper WS → name map → MQTT
			
 
				+│   ├── speaker_registry.py       — Speaker ID ↔ name mapping (voice enrolment in v2)
			
 
				+│   └── admin_ui.py               — Operator UI for live speaker naming (Tkinter)
			
 
				 ├── esp32/
			
 
				 │   ├── src/
			
 
				-│   │   └── main.cpp        — ESP32 Arduino firmware
			
 
				-│   └── platformio.ini      — PlatformIO build config
			
 
				+│   │   └── main.cpp              — ESP32 Arduino firmware
			
 
				+│   └── platformio.ini            — PlatformIO build config
			
 
				 └── docs/
			
 
				-    ├── hardware-wiring.md  — SPI pin connections for display
			
 
				-    └── setup.md            — Installation and configuration guide
			
 
				+    ├── hardware-wiring.md        — SPI pin connections for Waveshare display
			
 
				+    ├── setup.md                  — Installation and configuration guide
			
 
				+    └── speaker-enrolment.md      — Guide for recording and enrolling voice samples (v2)
			
 
				 ```
			
 
				 
			
 
				+---
			
 
				+
			
 
				 ## Reference Projects
			
 
				 
			
 
				-- [WhisperLiveKit](https://github.com/QuentinFuxa/WhisperLiveKit) — real-time Whisper server with WebSocket API
			
 
				-- [reriiasu/speech-to-text](https://github.com/reriiasu/speech-to-text) — faster-whisper with VAD and WebSocket output
			
 
				-- [denwilliams/mqtt-epaper](https://github.com/denwilliams/mqtt-epaper) — ESP32 e-paper display driven by MQTT JSON
			
 
				+- [WhisperLiveKit](https://github.com/QuentinFuxa/WhisperLiveKit) — real-time Whisper + Streaming Sortformer diarization
			
 
				+- [NVIDIA Streaming Sortformer](https://developer.nvidia.com/blog/identify-speakers-in-meetings-calls-and-voice-apps-in-real-time-with-nvidia-streaming-sortformer/) — the diarization model integrated into WhisperLiveKit
			
 
				+- [pyannote.audio](https://github.com/pyannote/pyannote-audio) — fallback diarization (Diart integration in WhisperLiveKit)
			
 
				+- [denwilliams/mqtt-epaper](https://github.com/denwilliams/mqtt-epaper) — ESP32 e-paper display driven by MQTT
			
 
				 - [cuci90/epaper_mqtt_esp32](https://github.com/cuci90/epaper_mqtt_esp32) — ESP32 Waveshare display MQTT template
			
 
				 
			
 
				+---
			
 
				+
			
 
				 ## Status
			
 
				 
			
 
				 🟡 **Planning / Research phase**
			
 
				 
			
 
				 - [x] Architecture defined
			
 
				-- [ ] Python bridge script
			
 
				-- [ ] ESP32 firmware
			
 
				-- [ ] Hardware wiring and test
			
 
				+- [x] Speaker diarization approach selected (WhisperLiveKit + Streaming Sortformer)
			
 
				+- [x] Speaker naming strategy defined (operator-assisted v1, voice enrolment v2)
			
 
				+- [ ] Python bridge script (transcription only)
			
 
				+- [ ] Speaker name mapping layer (`speaker_registry.py`)
			
 
				+- [ ] Operator admin UI (`admin_ui.py`)
			
 
				+- [ ] ESP32 firmware — basic text display
			
 
				+- [ ] ESP32 firmware — speaker header layout + refresh logic
			
 
				+- [ ] Hardware wiring and bench test
			
 
				 - [ ] End-to-end integration test
			
 
				-- [ ] Church deployment trial
			
 
				+- [ ] Voice enrolment system (v2)
			
 
				+- [ ] Church deployment trial
			
--- a/bridge/bridge.py
+++ b/bridge/bridge.py
@@ -0,0 +1,361 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+bridge.py — Church Live Transcription Bridge
			
 
				+
			
 
				+Streams microphone audio to WhisperLiveKit (ws://localhost:8000/asr),
			
 
				+receives transcription + speaker diarization, buffers sentences, and
			
 
				+publishes rolling 3-line JSON to Mosquitto MQTT for the e-ink display.
			
 
				+
			
 
				+Start WhisperLiveKit with:
			
 
				+    wlk --model large-v3 --language en --diarization
			
 
				+
			
 
				+Run this script:
			
 
				+    python bridge.py
			
 
				+"""
			
 
				+
			
 
				+import asyncio
			
 
				+import json
			
 
				+import re
			
 
				+import textwrap
			
 
				+import threading
			
 
				+import time
			
 
				+from collections import Counter
			
 
				+
			
 
				+import numpy as np
			
 
				+import paho.mqtt.client as mqtt
			
 
				+import sounddevice as sd
			
 
				+import websockets
			
 
				+import tkinter as tk
			
 
				+from tkinter import ttk
			
 
				+
			
 
				+# ── Configuration ─────────────────────────────────────────────────────────────
			
 
				+
			
 
				+MQTT_HOST        = "localhost"
			
 
				+MQTT_PORT        = 1883
			
 
				+MQTT_TOPIC_TEXT  = "display/text"
			
 
				+MQTT_TOPIC_CLEAR = "display/clear"
			
 
				+
			
 
				+WS_URL      = "ws://localhost:8000/asr"
			
 
				+SAMPLE_RATE = 16000
			
 
				+CHANNELS    = 1
			
 
				+BLOCKSIZE   = 4096          # ~256 ms per chunk at 16 kHz
			
 
				+
			
 
				+SENTENCE_TIMEOUT = 4.0      # seconds of silence before forcing a flush
			
 
				+MAX_LINE_CHARS   = 38       # characters per line (~24pt font at 800 px wide)
			
 
				+DISPLAY_LINES    = 3
			
 
				+
			
 
				+# ── State ─────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+class BridgeState:
			
 
				+    """All mutable state, protected by a single lock."""
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self._lock             = threading.Lock()
			
 
				+        self.speaker_names: dict[str, str] = {}   # "SPEAKER_00" → "Pastor"
			
 
				+        self._current_speaker: str | None  = None
			
 
				+        self._speaker_changed              = False
			
 
				+        self._text_buffer                  = ""
			
 
				+        self._display: list[str]           = [""] * DISPLAY_LINES
			
 
				+        self._last_final_time              = time.monotonic()
			
 
				+
			
 
				+    # ── Speaker name mapping ──────────────────────────────────────────────────
			
 
				+
			
 
				+    def set_speaker_name(self, speaker_id: str, name: str) -> None:
			
 
				+        with self._lock:
			
 
				+            self.speaker_names[speaker_id] = name.strip()
			
 
				+
			
 
				+    def _resolve(self, speaker_id: str | None) -> str | None:
			
 
				+        if not speaker_id:
			
 
				+            return None
			
 
				+        return self.speaker_names.get(speaker_id, speaker_id)
			
 
				+
			
 
				+    # ── Text ingestion ────────────────────────────────────────────────────────
			
 
				+
			
 
				+    def push_final(self, text: str, speaker_id: str | None, mqtt_client: mqtt.Client) -> None:
			
 
				+        """Accept a finalised segment; flush on sentence boundary or speaker change."""
			
 
				+        with self._lock:
			
 
				+            resolved = self._resolve(speaker_id)
			
 
				+
			
 
				+            if resolved != self._current_speaker:
			
 
				+                if self._text_buffer:
			
 
				+                    self._flush(mqtt_client)          # push previous speaker's words first
			
 
				+                self._current_speaker = resolved
			
 
				+                self._speaker_changed = True
			
 
				+
			
 
				+            sep = " " if self._text_buffer else ""
			
 
				+            self._text_buffer += sep + text.strip()
			
 
				+            self._last_final_time = time.monotonic()
			
 
				+
			
 
				+            if _is_sentence_end(text):
			
 
				+                self._flush(mqtt_client)
			
 
				+
			
 
				+    def maybe_timeout_flush(self, mqtt_client: mqtt.Client) -> None:
			
 
				+        with self._lock:
			
 
				+            if self._text_buffer and (time.monotonic() - self._last_final_time) > SENTENCE_TIMEOUT:
			
 
				+                self._flush(mqtt_client)
			
 
				+
			
 
				+    def _flush(self, mqtt_client: mqtt.Client) -> None:
			
 
				+        """Word-wrap buffer → rolling display → publish. Must hold lock."""
			
 
				+        text = self._text_buffer.strip()
			
 
				+        self._text_buffer = ""
			
 
				+        if not text:
			
 
				+            return
			
 
				+
			
 
				+        new_lines: list[str] = []
			
 
				+        if self._speaker_changed and self._current_speaker:
			
 
				+            new_lines.append(f"[{self._current_speaker.upper()}]")
			
 
				+            self._speaker_changed = False
			
 
				+
			
 
				+        new_lines.extend(textwrap.wrap(text, MAX_LINE_CHARS) or [""])
			
 
				+
			
 
				+        self._display.extend(new_lines)
			
 
				+        self._display = self._display[-DISPLAY_LINES:]
			
 
				+        while len(self._display) < DISPLAY_LINES:
			
 
				+            self._display.insert(0, "")
			
 
				+
			
 
				+        payload = json.dumps({"lines": list(self._display)})
			
 
				+        mqtt_client.publish(MQTT_TOPIC_TEXT, payload)
			
 
				+        print(f"[Display] {self._display}")
			
 
				+
			
 
				+    def clear(self, mqtt_client: mqtt.Client) -> None:
			
 
				+        with self._lock:
			
 
				+            self._display         = [""] * DISPLAY_LINES
			
 
				+            self._text_buffer     = ""
			
 
				+            self._current_speaker = None
			
 
				+            self._speaker_changed = False
			
 
				+        mqtt_client.publish(MQTT_TOPIC_CLEAR, "")
			
 
				+        print("[Display] Cleared")
			
 
				+
			
 
				+
			
 
				+# ── Helpers ───────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+def _is_sentence_end(text: str) -> bool:
			
 
				+    return bool(re.search(r'[.!?…]\s*$', text.strip()))
			
 
				+
			
 
				+
			
 
				+def _extract_speaker(data: dict) -> str | None:
			
 
				+    """
			
 
				+    Extract speaker ID from a WhisperLiveKit response dict.
			
 
				+    Handles segment-level {"speaker": "SPEAKER_00"} and word-level
			
 
				+    {"words": [{"speaker": "SPEAKER_00", ...}, ...]} formats.
			
 
				+    """
			
 
				+    if "speaker" in data:
			
 
				+        return data["speaker"] or None
			
 
				+
			
 
				+    words = data.get("words", [])
			
 
				+    if words:
			
 
				+        ids = [w.get("speaker") for w in words if w.get("speaker")]
			
 
				+        if ids:
			
 
				+            return Counter(ids).most_common(1)[0][0]
			
 
				+
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+# ── MQTT ──────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+def build_mqtt_client() -> mqtt.Client:
			
 
				+    client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION2)
			
 
				+
			
 
				+    def on_connect(client, userdata, flags, rc, props):
			
 
				+        print("[MQTT] Connected" if rc == 0 else f"[MQTT] Failed: {rc}")
			
 
				+
			
 
				+    def on_disconnect(client, userdata, flags, rc, props):
			
 
				+        print(f"[MQTT] Disconnected ({rc}), will reconnect...")
			
 
				+
			
 
				+    client.on_connect    = on_connect
			
 
				+    client.on_disconnect = on_disconnect
			
 
				+    client.reconnect_delay_set(min_delay=1, max_delay=30)
			
 
				+    client.connect_async(MQTT_HOST, MQTT_PORT)
			
 
				+    client.loop_start()
			
 
				+    return client
			
 
				+
			
 
				+
			
 
				+# ── WebSocket + audio pipeline ────────────────────────────────────────────────
			
 
				+
			
 
				+async def _sender(ws, queue: asyncio.Queue) -> None:
			
 
				+    while not queue.empty():        # drain stale audio before streaming
			
 
				+        queue.get_nowait()
			
 
				+    while True:
			
 
				+        chunk = await queue.get()
			
 
				+        await ws.send(chunk)
			
 
				+
			
 
				+
			
 
				+async def _receiver(ws, state: BridgeState, mqtt_client: mqtt.Client) -> None:
			
 
				+    async for message in ws:
			
 
				+        try:
			
 
				+            data = json.loads(message)
			
 
				+        except (json.JSONDecodeError, TypeError):
			
 
				+            continue
			
 
				+
			
 
				+        text     = (data.get("text") or data.get("buffer_transcription") or "").strip()
			
 
				+        is_final = data.get("is_final", False) or data.get("end_of_segment", False)
			
 
				+        speaker  = _extract_speaker(data)
			
 
				+
			
 
				+        if is_final and text:
			
 
				+            print(f"[Whisper] ({speaker or '?'}) {text}")
			
 
				+            state.push_final(text, speaker, mqtt_client)
			
 
				+
			
 
				+
			
 
				+async def _flusher(state: BridgeState, mqtt_client: mqtt.Client) -> None:
			
 
				+    while True:
			
 
				+        await asyncio.sleep(1.0)
			
 
				+        state.maybe_timeout_flush(mqtt_client)
			
 
				+
			
 
				+
			
 
				+async def audio_ws_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
			
 
				+    audio_queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=120)
			
 
				+    loop = asyncio.get_running_loop()
			
 
				+
			
 
				+    def audio_callback(indata: np.ndarray, frames: int, time_info, status) -> None:
			
 
				+        if status:
			
 
				+            print(f"[Audio] {status}")
			
 
				+        chunk = indata.tobytes()
			
 
				+        def _put():
			
 
				+            try:
			
 
				+                audio_queue.put_nowait(chunk)
			
 
				+            except asyncio.QueueFull:
			
 
				+                pass
			
 
				+        loop.call_soon_threadsafe(_put)
			
 
				+
			
 
				+    with sd.InputStream(
			
 
				+        samplerate=SAMPLE_RATE,
			
 
				+        channels=CHANNELS,
			
 
				+        dtype="int16",
			
 
				+        blocksize=BLOCKSIZE,
			
 
				+        callback=audio_callback,
			
 
				+    ):
			
 
				+        flusher = asyncio.create_task(_flusher(state, mqtt_client))
			
 
				+        try:
			
 
				+            while True:
			
 
				+                try:
			
 
				+                    print(f"[WS] Connecting to {WS_URL} ...")
			
 
				+                    async with websockets.connect(WS_URL, max_size=2**23) as ws:
			
 
				+                        print("[WS] Connected")
			
 
				+                        send_t = asyncio.create_task(_sender(ws, audio_queue))
			
 
				+                        recv_t = asyncio.create_task(_receiver(ws, state, mqtt_client))
			
 
				+                        done, pending = await asyncio.wait(
			
 
				+                            [send_t, recv_t], return_when=asyncio.FIRST_COMPLETED
			
 
				+                        )
			
 
				+                        for t in pending:
			
 
				+                            t.cancel()
			
 
				+                        for t in done:
			
 
				+                            if not t.cancelled() and (exc := t.exception()):
			
 
				+                                print(f"[WS] Task error: {exc}")
			
 
				+                except (websockets.ConnectionClosed, OSError, ConnectionRefusedError) as exc:
			
 
				+                    print(f"[WS] {exc}  — retrying in 3 s...")
			
 
				+                    await asyncio.sleep(3)
			
 
				+        finally:
			
 
				+            flusher.cancel()
			
 
				+
			
 
				+
			
 
				+def run_async_loop(state: BridgeState, mqtt_client: mqtt.Client) -> None:
			
 
				+    asyncio.run(audio_ws_loop(state, mqtt_client))
			
 
				+
			
 
				+
			
 
				+# ── Speaker name-mapping UI ───────────────────────────────────────────────────
			
 
				+
			
 
				+PRESET_SPEAKERS = [
			
 
				+    ("SPEAKER_00", "Pastor"),
			
 
				+    ("SPEAKER_01", "Reader"),
			
 
				+    ("SPEAKER_02", "Guest"),
			
 
				+    ("SPEAKER_03", "Choir"),
			
 
				+]
			
 
				+
			
 
				+
			
 
				+def run_speaker_ui(state: BridgeState, mqtt_client: mqtt.Client) -> None:
			
 
				+    root = tk.Tk()
			
 
				+    root.title("Transcription Bridge — Speaker Names")
			
 
				+    root.attributes("-topmost", True)
			
 
				+    root.resizable(False, False)
			
 
				+
			
 
				+    tk.Label(root, text="Speaker Name Mapping", font=("Helvetica", 12, "bold")).grid(
			
 
				+        row=0, column=0, columnspan=3, pady=(12, 2), padx=12
			
 
				+    )
			
 
				+    tk.Label(
			
 
				+        root,
			
 
				+        text="Diarization is automatic. Assign readable names to each speaker ID.",
			
 
				+        font=("Helvetica", 9), fg="gray", justify="center",
			
 
				+    ).grid(row=1, column=0, columnspan=3, pady=(0, 8))
			
 
				+
			
 
				+    tk.Label(root, text="Speaker ID",     font=("Helvetica", 10, "bold")).grid(row=2, column=0, padx=8)
			
 
				+    tk.Label(root, text="Friendly Name",  font=("Helvetica", 10, "bold")).grid(row=2, column=1, padx=8)
			
 
				+
			
 
				+    entries: list[tuple[str, tk.Entry]] = []
			
 
				+    for i, (sid, default) in enumerate(PRESET_SPEAKERS):
			
 
				+        tk.Label(root, text=sid, font=("Courier", 10)).grid(row=3+i, column=0, sticky="e", padx=8, pady=3)
			
 
				+        e = tk.Entry(root, width=16, font=("Helvetica", 10))
			
 
				+        e.insert(0, default)
			
 
				+        e.grid(row=3+i, column=1, padx=8, pady=3)
			
 
				+        entries.append((sid, e))
			
 
				+
			
 
				+        def _apply(s=sid, entry=e):
			
 
				+            state.set_speaker_name(s, entry.get())
			
 
				+            print(f"[UI] {s} → {entry.get()!r}")
			
 
				+
			
 
				+        tk.Button(root, text="Apply", command=_apply, width=6).grid(row=3+i, column=2, padx=6)
			
 
				+
			
 
				+    ttk.Separator(root, orient="horizontal").grid(
			
 
				+        row=7, column=0, columnspan=3, sticky="ew", padx=8, pady=8
			
 
				+    )
			
 
				+
			
 
				+    # Custom ID row
			
 
				+    tk.Label(root, text="Custom ID:").grid(row=8, column=0, sticky="e", padx=8)
			
 
				+    cid = tk.Entry(root, width=14, font=("Courier", 10))
			
 
				+    cid.insert(0, "SPEAKER_04")
			
 
				+    cid.grid(row=8, column=1, sticky="w", padx=8, pady=2)
			
 
				+
			
 
				+    tk.Label(root, text="Name:").grid(row=9, column=0, sticky="e", padx=8)
			
 
				+    cname = tk.Entry(root, width=14, font=("Helvetica", 10))
			
 
				+    cname.grid(row=9, column=1, sticky="w", padx=8, pady=2)
			
 
				+
			
 
				+    def _apply_custom():
			
 
				+        s, n = cid.get().strip(), cname.get().strip()
			
 
				+        if s and n:
			
 
				+            state.set_speaker_name(s, n)
			
 
				+            print(f"[UI] Custom: {s} → {n!r}")
			
 
				+
			
 
				+    tk.Button(root, text="Apply", command=_apply_custom, width=6).grid(row=9, column=2, padx=6)
			
 
				+
			
 
				+    ttk.Separator(root, orient="horizontal").grid(
			
 
				+        row=10, column=0, columnspan=3, sticky="ew", padx=8, pady=8
			
 
				+    )
			
 
				+
			
 
				+    def _apply_all():
			
 
				+        for sid, entry in entries:
			
 
				+            state.set_speaker_name(sid, entry.get())
			
 
				+        print("[UI] All names applied")
			
 
				+
			
 
				+    tk.Button(root, text="Apply All Names", width=18, command=_apply_all).grid(
			
 
				+        row=11, column=0, columnspan=2, padx=8, pady=4, sticky="w"
			
 
				+    )
			
 
				+    tk.Button(root, text="Clear Display", width=14, fg="red",
			
 
				+              command=lambda: state.clear(mqtt_client)).grid(
			
 
				+        row=11, column=2, padx=8, pady=4
			
 
				+    )
			
 
				+
			
 
				+    tk.Label(root, text="Speaker labels appear on the display when the speaker changes.",
			
 
				+             font=("Helvetica", 8), fg="gray").grid(
			
 
				+        row=12, column=0, columnspan=3, pady=(0, 10)
			
 
				+    )
			
 
				+
			
 
				+    _apply_all()   # activate defaults immediately
			
 
				+    root.mainloop()
			
 
				+
			
 
				+
			
 
				+# ── Entry point ───────────────────────────────────────────────────────────────
			
 
				+
			
 
				+def main() -> None:
			
 
				+    state       = BridgeState()
			
 
				+    mqtt_client = build_mqtt_client()
			
 
				+
			
 
				+    ws_thread = threading.Thread(
			
 
				+        target=run_async_loop, args=(state, mqtt_client), daemon=True
			
 
				+    )
			
 
				+    ws_thread.start()
			
 
				+    print("[Bridge] Audio pipeline running — close this window to quit")
			
 
				+
			
 
				+    run_speaker_ui(state, mqtt_client)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/bridge/requirements.txt
+++ b/bridge/requirements.txt
@@ -0,0 +1,4 @@
 
				+paho-mqtt>=2.0
			
 
				+websockets>=12.0
			
 
				+sounddevice>=0.4.6
			
 
				+numpy>=1.24
			
--- a/docs/hardware-wiring.md
+++ b/docs/hardware-wiring.md
@@ -0,0 +1,35 @@
 
				+# Hardware Wiring
			
 
				+
			
 
				+## Waveshare 7.5" V2 e-ink → ESP32
			
 
				+
			
 
				+| Display pin | ESP32 GPIO | Notes |
			
 
				+|---|---|---|
			
 
				+| BUSY | 4 | Input — display signals when busy |
			
 
				+| RST | 16 | Reset |
			
 
				+| DC | 17 | Data/command select |
			
 
				+| CS | 5 | SPI chip select |
			
 
				+| CLK | 18 | SPI clock (hardware SPI) |
			
 
				+| DIN | 23 | SPI MOSI |
			
 
				+| GND | GND | |
			
 
				+| VCC | 3.3 V | Do **not** use 5 V |
			
 
				+
			
 
				+> These pin assignments match `main.cpp`. If you need to remap them, change
			
 
				+> the `PIN_*` defines at the top of the file — the SPI CLK and DIN pins (18, 23)
			
 
				+> are hardware SPI and cannot be freely remapped without switching to software SPI.
			
 
				+
			
 
				+## Power
			
 
				+
			
 
				+- Display: powered from the ESP32 3.3 V rail. Current draw during refresh is
			
 
				+  ~30 mA peak — within ESP32 rail limits for a single display.
			
 
				+- ESP32: power from USB (5 V). For permanent installation use a 5 V USB wall
			
 
				+  adapter rated ≥ 1 A.
			
 
				+
			
 
				+## Audio input
			
 
				+
			
 
				+| Source | Connection | Notes |
			
 
				+|---|---|---|
			
 
				+| Mixing desk | Line-out → USB audio interface → PC USB | Cleanest signal; recommended |
			
 
				+| Microphone | USB condenser → PC USB | Use if mixer feed not available |
			
 
				+
			
 
				+Whisper performs best with a clean, low-noise signal. A direct line feed from
			
 
				+the mixing desk eliminates room echo and background noise.
			
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -0,0 +1,121 @@
 
				+# Setup Guide
			
 
				+
			
 
				+## Prerequisites
			
 
				+
			
 
				+| Component | Version | Notes |
			
 
				+|---|---|---|
			
 
				+| Python | 3.11+ | Windows install from python.org |
			
 
				+| NVIDIA GPU driver | Latest | RTX series recommended |
			
 
				+| CUDA toolkit | 12.x | Required by faster-whisper |
			
 
				+| Mosquitto | 2.x | MQTT broker |
			
 
				+| WhisperLiveKit | Latest | `pip install whisperlivekit` |
			
 
				+| PlatformIO | Latest | Via VS Code extension |
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 1 — Install Mosquitto (MQTT broker)
			
 
				+
			
 
				+Download from mosquitto.org and install with default settings.
			
 
				+Start the service:
			
 
				+
			
 
				+```
			
 
				+net start mosquitto
			
 
				+```
			
 
				+
			
 
				+Verify it's running:
			
 
				+
			
 
				+```
			
 
				+mosquitto_sub -h localhost -t "#" -v
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 2 — Install WhisperLiveKit
			
 
				+
			
 
				+```
			
 
				+pip install whisperlivekit
			
 
				+```
			
 
				+
			
 
				+Start the server with diarization enabled:
			
 
				+
			
 
				+```
			
 
				+wlk --model large-v3 --language en --diarization
			
 
				+```
			
 
				+
			
 
				+The first run downloads the model (~3 GB). The WebSocket will be available at
			
 
				+`ws://localhost:8000/asr`. Verify by opening `http://localhost:8000` in a browser.
			
 
				+
			
 
				+> **Latency note:** If `large-v3` is too slow on your GPU, try
			
 
				+> `--model distil-large-v3` for similar accuracy at lower latency.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 3 — Install the Python bridge
			
 
				+
			
 
				+```
			
 
				+cd bridge
			
 
				+pip install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+Run it:
			
 
				+
			
 
				+```
			
 
				+python bridge.py
			
 
				+```
			
 
				+
			
 
				+A small window opens for assigning friendly names to auto-detected speakers
			
 
				+(SPEAKER_00, SPEAKER_01, …). The defaults (Pastor, Reader, Guest, Choir) are
			
 
				+applied immediately — edit them if your service has different roles.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 4 — Flash the ESP32
			
 
				+
			
 
				+1. Open the `esp32/` folder in VS Code with the PlatformIO extension installed.
			
 
				+2. Edit `src/main.cpp` — fill in your WiFi credentials and the PC's IP address:
			
 
				+
			
 
				+   ```cpp
			
 
				+   #define WIFI_SSID     "YourNetwork"
			
 
				+   #define WIFI_PASSWORD "YourPassword"
			
 
				+   #define MQTT_HOST     "192.168.1.100"   // run `ipconfig` on the PC to find this
			
 
				+   ```
			
 
				+
			
 
				+3. Select the correct environment in PlatformIO:
			
 
				+   - `esp32dev` for ESP32-WROOM-32
			
 
				+   - `esp32-s3` for ESP32-S3 (recommended for larger RAM)
			
 
				+
			
 
				+4. Click **Upload**. Open Serial Monitor at 115200 baud to see boot messages.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 5 — End-to-end test
			
 
				+
			
 
				+Run these checks in order:
			
 
				+
			
 
				+1. **Whisper standalone** — speak into the mic, verify text appears at
			
 
				+   `http://localhost:8000`.
			
 
				+
			
 
				+2. **MQTT manually** — with the ESP32 connected, publish a test message:
			
 
				+
			
 
				+   ```
			
 
				+   mosquitto_pub -h localhost -t display/text -m "{\"lines\":[\"Line one\",\"Line two\",\"Line three\"]}"
			
 
				+   ```
			
 
				+
			
 
				+   The display should refresh within ~2 seconds.
			
 
				+
			
 
				+3. **Full pipeline** — start the bridge, speak naturally. Text should appear on
			
 
				+   the display within 3–5 seconds of speech.
			
 
				+
			
 
				+4. **Speaker labels** — if two people speak alternately, `[PASTOR]` / `[READER]`
			
 
				+   labels should appear as speaker changes are detected.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 6 — Deployment checklist
			
 
				+
			
 
				+- [ ] PC set to never sleep during services
			
 
				+- [ ] Mosquitto service set to start automatically (`sc config mosquitto start= auto` — the space after `start=` is required)
			
 
				+- [ ] WhisperLiveKit added to Windows startup (Task Scheduler or a `.bat` file)
			
 
				+- [ ] ESP32 powered from a USB wall adapter (not PC USB, to avoid dependency)
			
 
				+- [ ] Static IP assigned to ESP32 in router DHCP settings
			
 
				+- [ ] Audio input confirmed — direct mixer feed preferred over microphone
			
--- a/esp32/platformio.ini
+++ b/esp32/platformio.ini
@@ -0,0 +1,26 @@
 
				+[platformio]
			
 
				+default_envs = esp32dev
			
 
				+
			
 
				+; ── ESP32-WROOM-32 (standard, 520 KB RAM) ────────────────────────────────────
			
 
				+[env:esp32dev]
			
 
				+platform      = espressif32
			
 
				+board         = esp32dev
			
 
				+framework     = arduino
			
 
				+monitor_speed = 115200
			
 
				+lib_deps =
			
 
				+    knolleary/PubSubClient @ ^2.8
			
 
				+    bblanchon/ArduinoJson @ ^7.0
			
 
				+    zinggjm/GxEPD2 @ ^1.5
			
 
				+    adafruit/Adafruit GFX Library @ ^1.11
			
 
				+
			
 
				+; ── ESP32-S3 (preferred — more RAM, good for large font bitmaps) ──────────────
			
 
				+[env:esp32-s3]
			
 
				+platform      = espressif32
			
 
				+board         = esp32-s3-devkitc-1
			
 
				+framework     = arduino
			
 
				+monitor_speed = 115200
			
 
				+lib_deps =
			
 
				+    knolleary/PubSubClient @ ^2.8
			
 
				+    bblanchon/ArduinoJson @ ^7.0
			
 
				+    zinggjm/GxEPD2 @ ^1.5
			
 
				+    adafruit/Adafruit GFX Library @ ^1.11
			
--- a/esp32/src/main.cpp
+++ b/esp32/src/main.cpp
@@ -0,0 +1,230 @@
 
				+/*
			
 
				+ * main.cpp — ESP32 e-ink Display Firmware
			
 
				+ *
			
 
				+ * Connects to WiFi + MQTT broker, subscribes to display/text and display/clear,
			
 
				+ * renders rolling 3-line text on a Waveshare 7.5" V2 (800x480) e-ink display.
			
 
				+ *
			
 
				+ * Required libraries (platformio.ini):
			
 
				+ *   knolleary/PubSubClient
			
 
				+ *   bblanchon/ArduinoJson
			
 
				+ *   zinggjm/GxEPD2
			
 
				+ *   adafruit/Adafruit GFX Library
			
 
				+ */
			
 
				+
			
 
				+#include <Arduino.h>
			
 
				+#include <WiFi.h>
			
 
				+#include <PubSubClient.h>
			
 
				+#include <ArduinoJson.h>
			
 
				+#include <GxEPD2_BW.h>
			
 
				+#include <Fonts/FreeSansBold24pt7b.h>
			
 
				+#include <Fonts/FreeSans9pt7b.h>
			
 
				+
			
 
				+// ── User config — edit before flashing ───────────────────────────────────────
			
 
				+
			
 
				+#define WIFI_SSID      "YOUR_WIFI_SSID"
			
 
				+#define WIFI_PASSWORD  "YOUR_WIFI_PASSWORD"
			
 
				+#define MQTT_HOST      "192.168.1.100"   // Windows PC IP running Mosquitto
			
 
				+#define MQTT_PORT      1883
			
 
				+#define DEVICE_ID      "display-01"
			
 
				+
			
 
				+// ── Display pins (Waveshare 7.5" V2 → ESP32) ─────────────────────────────────
			
 
				+// CLK → GPIO 18  (SPI SCLK, wired directly — managed by library)
			
 
				+// DIN → GPIO 23  (SPI MOSI, wired directly — managed by library)
			
 
				+
			
 
				+#define PIN_CS    5
			
 
				+#define PIN_DC   17
			
 
				+#define PIN_RST  16
			
 
				+#define PIN_BUSY  4
			
 
				+
			
 
				+// ── MQTT topics ───────────────────────────────────────────────────────────────
			
 
				+
			
 
				+#define TOPIC_TEXT   "display/text"
			
 
				+#define TOPIC_CLEAR  "display/clear"
			
 
				+#define TOPIC_STATUS "display/status"
			
 
				+
			
 
				+// ── Display layout ────────────────────────────────────────────────────────────
			
 
				+
			
 
				+#define DISPLAY_W       800
			
 
				+#define DISPLAY_H       480
			
 
				+#define MARGIN_X         12
			
 
				+#define LINE_1_BASELINE 110    // y baseline of first text line
			
 
				+#define LINE_SPACING    145    // pixels between baselines (generous for readability)
			
 
				+
			
 
				+// ── Display object ────────────────────────────────────────────────────────────
			
 
				+
			
 
				+GxEPD2_BW<GxEPD2_750_T7, GxEPD2_750_T7::HEIGHT> display(
			
 
				+    GxEPD2_750_T7(PIN_CS, PIN_DC, PIN_RST, PIN_BUSY)
			
 
				+);
			
 
				+
			
 
				+// ── MQTT + WiFi ───────────────────────────────────────────────────────────────
			
 
				+
			
 
				+WiFiClient   wifiClient;
			
 
				+PubSubClient mqtt(wifiClient);
			
 
				+
			
 
				+// ── Pending render state ──────────────────────────────────────────────────────
			
 
				+
			
 
				+static String pendingLines[3];
			
 
				+static bool   renderPending = false;
			
 
				+
			
 
				+// ── Display helpers ───────────────────────────────────────────────────────────
			
 
				+
			
 
				+void renderLines(const String lines[3]) {
			
 
				+    display.setFullWindow();
			
 
				+    display.firstPage();
			
 
				+    do {
			
 
				+        display.fillScreen(GxEPD_WHITE);
			
 
				+
			
 
				+        for (int i = 0; i < 3; i++) {
			
 
				+            if (lines[i].length() == 0) continue;
			
 
				+
			
 
				+            // Speaker label lines (e.g. "[PASTOR]") use a smaller italic font
			
 
				+            bool isLabel = lines[i].startsWith("[") && lines[i].endsWith("]");
			
 
				+
			
 
				+            if (isLabel) {
			
 
				+                display.setFont(&FreeSans9pt7b);
			
 
				+            } else {
			
 
				+                display.setFont(&FreeSansBold24pt7b);
			
 
				+            }
			
 
				+
			
 
				+            display.setTextColor(GxEPD_BLACK);
			
 
				+            display.setCursor(MARGIN_X, LINE_1_BASELINE + i * LINE_SPACING);
			
 
				+            display.print(lines[i]);
			
 
				+        }
			
 
				+    } while (display.nextPage());
			
 
				+
			
 
				+    Serial.println("[Display] Refreshed");
			
 
				+}
			
 
				+
			
 
				+void clearDisplay() {
			
 
				+    display.setFullWindow();
			
 
				+    display.firstPage();
			
 
				+    do {
			
 
				+        display.fillScreen(GxEPD_WHITE);
			
 
				+    } while (display.nextPage());
			
 
				+    Serial.println("[Display] Cleared");
			
 
				+}
			
 
				+
			
 
				+void showBootMessage() {
			
 
				+    String boot[3] = {"", "", "  DISPLAY READY"};
			
 
				+    renderLines(boot);
			
 
				+}
			
 
				+
			
 
				+// ── MQTT callback ─────────────────────────────────────────────────────────────
			
 
				+
			
 
				+void onMqttMessage(char* topic, byte* payload, unsigned int len) {
			
 
				+    String topicStr = String(topic);
			
 
				+
			
 
				+    if (topicStr == TOPIC_CLEAR) {
			
 
				+        for (int i = 0; i < 3; i++) pendingLines[i] = "";
			
 
				+        renderPending = true;
			
 
				+        return;
			
 
				+    }
			
 
				+
			
 
				+    if (topicStr == TOPIC_TEXT) {
			
 
				+        // Null-terminate the payload
			
 
				+        char buf[600];
			
 
				+        if (len >= sizeof(buf)) {
			
 
				+            Serial.println("[MQTT] Payload too large, skipped");
			
 
				+            return;
			
 
				+        }
			
 
				+        memcpy(buf, payload, len);
			
 
				+        buf[len] = '\0';
			
 
				+
			
 
				+        JsonDocument doc;
			
 
				+        DeserializationError err = deserializeJson(doc, buf);
			
 
				+        if (err) {
			
 
				+            Serial.printf("[MQTT] JSON error: %s\n", err.c_str());
			
 
				+            return;
			
 
				+        }
			
 
				+
			
 
				+        JsonArray arr = doc["lines"].as<JsonArray>();
			
 
				+        if (!arr) return;
			
 
				+
			
 
				+        int i = 0;
			
 
				+        for (JsonVariant v : arr) {
			
 
				+            if (i >= 3) break;
			
 
				+            pendingLines[i++] = v.as<String>();
			
 
				+        }
			
 
				+        // Pad remaining lines
			
 
				+        for (; i < 3; i++) pendingLines[i] = "";
			
 
				+
			
 
				+        renderPending = true;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// ── WiFi ──────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+void connectWifi() {
			
 
				+    if (WiFi.status() == WL_CONNECTED) return;
			
 
				+
			
 
				+    Serial.printf("[WiFi] Connecting to %s", WIFI_SSID);
			
 
				+    WiFi.begin(WIFI_SSID, WIFI_PASSWORD);
			
 
				+
			
 
				+    unsigned long start = millis();
			
 
				+    while (WiFi.status() != WL_CONNECTED) {
			
 
				+        if (millis() - start > 30000) {
			
 
				+            Serial.println("\n[WiFi] Timeout — restarting");
			
 
				+            ESP.restart();
			
 
				+        }
			
 
				+        delay(500);
			
 
				+        Serial.print(".");
			
 
				+    }
			
 
				+    Serial.printf("\n[WiFi] Connected: %s\n", WiFi.localIP().toString().c_str());
			
 
				+}
			
 
				+
			
 
				+// ── MQTT ──────────────────────────────────────────────────────────────────────
			
 
				+
			
 
				+void connectMqtt() {
			
 
				+    while (!mqtt.connected()) {
			
 
				+        Serial.printf("[MQTT] Connecting to %s:%d ...\n", MQTT_HOST, MQTT_PORT);
			
 
				+
			
 
				+        if (mqtt.connect(DEVICE_ID)) {
			
 
				+            Serial.println("[MQTT] Connected");
			
 
				+            mqtt.subscribe(TOPIC_TEXT);
			
 
				+            mqtt.subscribe(TOPIC_CLEAR);
			
 
				+            mqtt.publish(TOPIC_STATUS, "{\"ready\":true}");
			
 
				+        } else {
			
 
				+            Serial.printf("[MQTT] Failed (state=%d), retry in 5 s\n", mqtt.state());
			
 
				+            delay(5000);
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// ── Arduino setup & loop ──────────────────────────────────────────────────────
			
 
				+
			
 
				+void setup() {
			
 
				+    Serial.begin(115200);
			
 
				+    Serial.println("\n[Boot] Church Live Transcription Display");
			
 
				+
			
 
				+    display.init(115200);
			
 
				+    display.setRotation(0);
			
 
				+    clearDisplay();
			
 
				+    showBootMessage();
			
 
				+
			
 
				+    connectWifi();
			
 
				+
			
 
				+    mqtt.setServer(MQTT_HOST, MQTT_PORT);
			
 
				+    mqtt.setCallback(onMqttMessage);
			
 
				+    mqtt.setBufferSize(600);
			
 
				+    mqtt.setKeepAlive(60);
			
 
				+
			
 
				+    connectMqtt();
			
 
				+}
			
 
				+
			
 
				+void loop() {
			
 
				+    // Maintain connectivity
			
 
				+    if (WiFi.status() != WL_CONNECTED) {
			
 
				+        Serial.println("[WiFi] Lost — reconnecting...");
			
 
				+        connectWifi();
			
 
				+    }
			
 
				+    if (!mqtt.connected()) {
			
 
				+        connectMqtt();
			
 
				+    }
			
 
				+    mqtt.loop();
			
 
				+
			
 
				+    // Render outside of MQTT callback to avoid blocking the broker heartbeat
			
 
				+    if (renderPending) {
			
 
				+        renderPending = false;
			
 
				+        renderLines(pendingLines);
			
 
				+    }
			
 
				+}