feat: add voice plugin with STT and TTS

Whisper STT: buffers incoming voice PCM per user, transcribes on silence gap via local whisper.cpp endpoint, posts results as actions. Piper TTS: !say fetches WAV from local Piper endpoint and plays via stream_audio(). 37 tests cover buffering, flush logic, transcription, WAV encoding, commands, and lifecycle.
2026-02-22 03:08:02 +01:00
parent 039f060b50
commit 9fbf45f67d
2 changed files with 843 additions and 0 deletions
@@ -0,0 +1,309 @@
+"""Plugin: voice STT/TTS for Mumble channels.
+
+Listens for voice audio via pymumble's sound callback, buffers PCM per
+user, transcribes via Whisper STT on silence, and provides TTS playback
+via Piper.  Commands: !listen, !say.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import io
+import json
+import logging
+import threading
+import time
+import wave
+from urllib.parse import quote_plus
+import urllib.request
+
+from derp.http import urlopen as _urlopen
+from derp.plugin import command
+
+log = logging.getLogger(__name__)
+
+# -- Constants ---------------------------------------------------------------
+
+_SAMPLE_RATE = 48000
+_CHANNELS = 1
+_SAMPLE_WIDTH = 2  # s16le = 2 bytes per sample
+
+_SILENCE_GAP = 1.5      # seconds of silence before flushing
+_MIN_DURATION = 0.5      # discard utterances shorter than this
+_MAX_DURATION = 30.0     # cap buffer at this many seconds
+_MIN_BYTES = int(_MIN_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH)
+_MAX_BYTES = int(_MAX_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH)
+_FLUSH_INTERVAL = 0.5    # flush monitor poll interval
+_MAX_SAY_LEN = 500       # max characters for !say
+
+_WHISPER_URL = "http://192.168.122.1:8080/inference"
+_PIPER_URL = "http://192.168.122.1:5000/"
+
+# -- Per-bot state -----------------------------------------------------------
+
+
+def _ps(bot):
+    """Per-bot plugin runtime state."""
+    cfg = getattr(bot, "config", {}).get("voice", {})
+    return bot._pstate.setdefault("voice", {
+        "listen": False,
+        "buffers": {},          # {username: bytearray}
+        "last_ts": {},          # {username: float monotonic}
+        "flush_task": None,
+        "lock": threading.Lock(),
+        "silence_gap": cfg.get("silence_gap", _SILENCE_GAP),
+        "whisper_url": cfg.get("whisper_url", _WHISPER_URL),
+        "piper_url": cfg.get("piper_url", _PIPER_URL),
+        "_listener_registered": False,
+    })
+
+
+# -- Helpers -----------------------------------------------------------------
+
+
+def _is_mumble(bot) -> bool:
+    """Check if bot supports voice streaming."""
+    return hasattr(bot, "stream_audio")
+
+
+def _pcm_to_wav(pcm: bytes) -> bytes:
+    """Wrap raw s16le 48kHz mono PCM in a WAV container."""
+    buf = io.BytesIO()
+    with wave.open(buf, "wb") as wf:
+        wf.setnchannels(_CHANNELS)
+        wf.setsampwidth(_SAMPLE_WIDTH)
+        wf.setframerate(_SAMPLE_RATE)
+        wf.writeframes(pcm)
+    return buf.getvalue()
+
+
+# -- STT: Sound listener (pymumble thread) ----------------------------------
+
+
+def _on_voice(bot, user, sound_chunk):
+    """Buffer incoming voice PCM per user.  Runs on pymumble thread."""
+    ps = _ps(bot)
+    if not ps["listen"]:
+        return
+    name = user["name"] if isinstance(user, dict) else None
+    if not name or name == bot.nick:
+        return
+    pcm = sound_chunk.pcm
+    if not pcm:
+        return
+    with ps["lock"]:
+        if name not in ps["buffers"]:
+            ps["buffers"][name] = bytearray()
+        buf = ps["buffers"][name]
+        buf.extend(pcm)
+        if len(buf) > _MAX_BYTES:
+            ps["buffers"][name] = bytearray(buf[-_MAX_BYTES:])
+        ps["last_ts"][name] = time.monotonic()
+
+
+# -- STT: Whisper transcription ---------------------------------------------
+
+
+def _transcribe(ps, pcm: bytes) -> str:
+    """POST PCM (as WAV) to Whisper and return transcribed text.  Blocking."""
+    wav_data = _pcm_to_wav(pcm)
+    boundary = "----derp_voice_boundary"
+    body = (
+        f"--{boundary}\r\n"
+        f'Content-Disposition: form-data; name="file"; filename="audio.wav"\r\n'
+        f"Content-Type: audio/wav\r\n\r\n"
+    ).encode() + wav_data + (
+        f"\r\n--{boundary}\r\n"
+        f'Content-Disposition: form-data; name="response_format"\r\n\r\n'
+        f"json\r\n--{boundary}--\r\n"
+    ).encode()
+    req = urllib.request.Request(ps["whisper_url"], data=body, method="POST")
+    req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
+    resp = _urlopen(req, timeout=30, proxy=False)
+    data = json.loads(resp.read())
+    resp.close()
+    return data.get("text", "").strip()
+
+
+# -- STT: Flush monitor (asyncio background task) ---------------------------
+
+
+async def _flush_monitor(bot):
+    """Poll for silence gaps and transcribe completed utterances."""
+    ps = _ps(bot)
+    loop = asyncio.get_running_loop()
+    try:
+        while ps["listen"]:
+            await asyncio.sleep(_FLUSH_INTERVAL)
+            now = time.monotonic()
+            to_flush: list[tuple[str, bytes]] = []
+
+            with ps["lock"]:
+                for name in list(ps["last_ts"]):
+                    elapsed = now - ps["last_ts"][name]
+                    if elapsed >= ps["silence_gap"] and name in ps["buffers"]:
+                        pcm = bytes(ps["buffers"].pop(name))
+                        del ps["last_ts"][name]
+                        to_flush.append((name, pcm))
+
+            for name, pcm in to_flush:
+                if len(pcm) < _MIN_BYTES:
+                    continue
+                try:
+                    text = await loop.run_in_executor(
+                        None, _transcribe, ps, pcm,
+                    )
+                except Exception:
+                    log.exception("voice: transcription failed for %s", name)
+                    continue
+                if not text or text.strip("., ") == "":
+                    continue
+                log.info("voice: %s said: %s", name, text)
+                await bot.action("0", f"heard {name} say: {text}")
+    except asyncio.CancelledError:
+        pass
+    except Exception:
+        log.exception("voice: flush monitor error")
+
+
+# -- TTS: Piper fetch + playback --------------------------------------------
+
+
+def _fetch_tts(url: str) -> str | None:
+    """Fetch TTS WAV from Piper to a temp file.  Blocking."""
+    import tempfile
+    try:
+        resp = _urlopen(url, timeout=30, proxy=False)
+        data = resp.read()
+        resp.close()
+        if not data:
+            return None
+        tmp = tempfile.NamedTemporaryFile(
+            suffix=".wav", prefix="derp_tts_", delete=False,
+        )
+        tmp.write(data)
+        tmp.close()
+        return tmp.name
+    except Exception:
+        log.exception("voice: TTS fetch failed")
+        return None
+
+
+async def _tts_play(bot, text: str):
+    """Fetch TTS audio and play it via stream_audio."""
+    from pathlib import Path
+
+    ps = _ps(bot)
+    url = ps["piper_url"] + "?text=" + quote_plus(text)
+    loop = asyncio.get_running_loop()
+    wav_path = await loop.run_in_executor(None, _fetch_tts, url)
+    if wav_path is None:
+        return
+    try:
+        done = asyncio.Event()
+        await bot.stream_audio(str(wav_path), volume=1.0, on_done=done)
+        await done.wait()
+    finally:
+        Path(wav_path).unlink(missing_ok=True)
+
+
+# -- Listener lifecycle -----------------------------------------------------
+
+
+def _ensure_listener(bot):
+    """Register the sound listener callback (idempotent)."""
+    ps = _ps(bot)
+    if ps["_listener_registered"]:
+        return
+    if not hasattr(bot, "_sound_listeners"):
+        return
+    bot._sound_listeners.append(lambda user, chunk: _on_voice(bot, user, chunk))
+    ps["_listener_registered"] = True
+    log.info("voice: registered sound listener")
+
+
+def _ensure_flush_task(bot):
+    """Start the flush monitor if not running."""
+    ps = _ps(bot)
+    task = ps.get("flush_task")
+    if task and not task.done():
+        return
+    ps["flush_task"] = bot._spawn(
+        _flush_monitor(bot), name="voice-flush-monitor",
+    )
+
+
+def _stop_flush_task(bot):
+    """Cancel the flush monitor."""
+    ps = _ps(bot)
+    task = ps.get("flush_task")
+    if task and not task.done():
+        task.cancel()
+    ps["flush_task"] = None
+
+
+# -- Commands ----------------------------------------------------------------
+
+
+@command("listen", help="Voice: !listen [on|off] -- toggle STT", tier="admin")
+async def cmd_listen(bot, message):
+    """Toggle voice-to-text transcription."""
+    if not _is_mumble(bot):
+        await bot.reply(message, "Voice is Mumble-only")
+        return
+
+    ps = _ps(bot)
+    parts = message.text.split()
+    if len(parts) < 2:
+        state = "on" if ps["listen"] else "off"
+        await bot.reply(message, f"Listen: {state}")
+        return
+
+    sub = parts[1].lower()
+    if sub == "on":
+        ps["listen"] = True
+        _ensure_listener(bot)
+        _ensure_flush_task(bot)
+        await bot.reply(message, "Listening for voice")
+    elif sub == "off":
+        ps["listen"] = False
+        with ps["lock"]:
+            ps["buffers"].clear()
+            ps["last_ts"].clear()
+        _stop_flush_task(bot)
+        await bot.reply(message, "Stopped listening")
+    else:
+        await bot.reply(message, "Usage: !listen [on|off]")
+
+
+@command("say", help="Voice: !say <text> -- text-to-speech")
+async def cmd_say(bot, message):
+    """Speak text aloud via Piper TTS."""
+    if not _is_mumble(bot):
+        await bot.reply(message, "Voice is Mumble-only")
+        return
+
+    parts = message.text.split(None, 1)
+    if len(parts) < 2:
+        await bot.reply(message, "Usage: !say <text>")
+        return
+
+    text = parts[1].strip()
+    if len(text) > _MAX_SAY_LEN:
+        await bot.reply(message, f"Text too long (max {_MAX_SAY_LEN} chars)")
+        return
+
+    bot._spawn(_tts_play(bot, text), name="voice-tts")
+
+
+# -- Plugin lifecycle --------------------------------------------------------
+
+
+async def on_connected(bot) -> None:
+    """Re-register listener after reconnect if listen was on."""
+    if not _is_mumble(bot):
+        return
+    ps = _ps(bot)
+    if ps["listen"]:
+        _ensure_listener(bot)
+        _ensure_flush_task(bot)