"""Plugin: voice STT/TTS for Mumble channels. Listens for voice audio via pymumble's sound callback, buffers PCM per user, transcribes via Whisper STT on silence, and provides TTS playback via Piper. Commands: !listen, !say. """ from __future__ import annotations import asyncio import io import json import logging import threading import time import urllib.request import wave from derp.http import urlopen as _urlopen from derp.plugin import command log = logging.getLogger(__name__) # -- Constants --------------------------------------------------------------- _SAMPLE_RATE = 48000 _CHANNELS = 1 _SAMPLE_WIDTH = 2 # s16le = 2 bytes per sample _SILENCE_GAP = 1.5 # seconds of silence before flushing _MIN_DURATION = 0.5 # discard utterances shorter than this _MAX_DURATION = 30.0 # cap buffer at this many seconds _MIN_BYTES = int(_MIN_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH) _MAX_BYTES = int(_MAX_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH) _FLUSH_INTERVAL = 0.5 # flush monitor poll interval _MAX_SAY_LEN = 500 # max characters for !say _WHISPER_URL = "http://192.168.129.9:8080/inference" _PIPER_URL = "http://192.168.129.9:5100/" # -- Per-bot state ----------------------------------------------------------- def _ps(bot): """Per-bot plugin runtime state.""" cfg = getattr(bot, "config", {}).get("voice", {}) return bot._pstate.setdefault("voice", { "listen": False, "trigger": cfg.get("trigger", ""), "buffers": {}, # {username: bytearray} "last_ts": {}, # {username: float monotonic} "flush_task": None, "lock": threading.Lock(), "silence_gap": cfg.get("silence_gap", _SILENCE_GAP), "whisper_url": cfg.get("whisper_url", _WHISPER_URL), "piper_url": cfg.get("piper_url", _PIPER_URL), "_listener_registered": False, }) # -- Helpers ----------------------------------------------------------------- def _is_mumble(bot) -> bool: """Check if bot supports voice streaming.""" return hasattr(bot, "stream_audio") def _pcm_to_wav(pcm: bytes) -> bytes: """Wrap raw s16le 48kHz mono PCM in a WAV container.""" buf = io.BytesIO() with wave.open(buf, "wb") as wf: wf.setnchannels(_CHANNELS) wf.setsampwidth(_SAMPLE_WIDTH) wf.setframerate(_SAMPLE_RATE) wf.writeframes(pcm) return buf.getvalue() # -- STT: Sound listener (pymumble thread) ---------------------------------- def _on_voice(bot, user, sound_chunk): """Buffer incoming voice PCM per user. Runs on pymumble thread.""" ps = _ps(bot) if not ps["listen"] and not ps["trigger"]: return try: name = user["name"] except (KeyError, TypeError): name = None if not name or name == bot.nick: return pcm = sound_chunk.pcm if not pcm: return with ps["lock"]: if name not in ps["buffers"]: ps["buffers"][name] = bytearray() buf = ps["buffers"][name] buf.extend(pcm) if len(buf) > _MAX_BYTES: ps["buffers"][name] = bytearray(buf[-_MAX_BYTES:]) ps["last_ts"][name] = time.monotonic() # -- STT: Whisper transcription --------------------------------------------- def _transcribe(ps, pcm: bytes) -> str: """POST PCM (as WAV) to Whisper and return transcribed text. Blocking.""" wav_data = _pcm_to_wav(pcm) boundary = "----derp_voice_boundary" body = ( f"--{boundary}\r\n" f'Content-Disposition: form-data; name="file"; filename="audio.wav"\r\n' f"Content-Type: audio/wav\r\n\r\n" ).encode() + wav_data + ( f"\r\n--{boundary}\r\n" f'Content-Disposition: form-data; name="response_format"\r\n\r\n' f"json\r\n--{boundary}--\r\n" ).encode() req = urllib.request.Request(ps["whisper_url"], data=body, method="POST") req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}") resp = _urlopen(req, timeout=30, proxy=False) data = json.loads(resp.read()) resp.close() return data.get("text", "").strip() # -- STT: Flush monitor (asyncio background task) --------------------------- async def _flush_monitor(bot): """Poll for silence gaps and transcribe completed utterances.""" ps = _ps(bot) loop = asyncio.get_running_loop() try: while ps["listen"] or ps["trigger"]: await asyncio.sleep(_FLUSH_INTERVAL) now = time.monotonic() to_flush: list[tuple[str, bytes]] = [] with ps["lock"]: for name in list(ps["last_ts"]): elapsed = now - ps["last_ts"][name] if elapsed >= ps["silence_gap"] and name in ps["buffers"]: pcm = bytes(ps["buffers"].pop(name)) del ps["last_ts"][name] to_flush.append((name, pcm)) for name, pcm in to_flush: if len(pcm) < _MIN_BYTES: continue try: text = await loop.run_in_executor( None, _transcribe, ps, pcm, ) except Exception: log.exception("voice: transcription failed for %s", name) continue if not text or text.strip("., ") == "": continue trigger = ps["trigger"] if trigger and text.lower().startswith(trigger.lower()): remainder = text[len(trigger):].strip() if remainder: log.info("voice: trigger from %s: %s", name, remainder) bot._spawn( _tts_play(bot, remainder), name="voice-tts", ) continue if ps["listen"]: log.info("voice: %s said: %s", name, text) await bot.action("0", f"heard {name} say: {text}") except asyncio.CancelledError: pass except Exception: log.exception("voice: flush monitor error") # -- TTS: Piper fetch + playback -------------------------------------------- def _fetch_tts(piper_url: str, text: str) -> str | None: """POST text to Piper TTS and save the WAV response. Blocking.""" import tempfile try: payload = json.dumps({"text": text}).encode() req = urllib.request.Request( piper_url, data=payload, method="POST", ) req.add_header("Content-Type", "application/json") resp = _urlopen(req, timeout=30, proxy=False) data = resp.read() resp.close() if not data: return None tmp = tempfile.NamedTemporaryFile( suffix=".wav", prefix="derp_tts_", delete=False, ) tmp.write(data) tmp.close() return tmp.name except Exception: log.exception("voice: TTS fetch failed") return None async def _tts_play(bot, text: str): """Fetch TTS audio and play it via stream_audio.""" from pathlib import Path ps = _ps(bot) loop = asyncio.get_running_loop() wav_path = await loop.run_in_executor( None, _fetch_tts, ps["piper_url"], text, ) if wav_path is None: return try: done = asyncio.Event() await bot.stream_audio(str(wav_path), volume=1.0, on_done=done) await done.wait() finally: Path(wav_path).unlink(missing_ok=True) # -- Listener lifecycle ----------------------------------------------------- def _ensure_listener(bot): """Register the sound listener callback (idempotent).""" ps = _ps(bot) if ps["_listener_registered"]: return if not hasattr(bot, "_sound_listeners"): return bot._sound_listeners.append(lambda user, chunk: _on_voice(bot, user, chunk)) ps["_listener_registered"] = True log.info("voice: registered sound listener") def _ensure_flush_task(bot): """Start the flush monitor if not running.""" ps = _ps(bot) task = ps.get("flush_task") if task and not task.done(): return ps["flush_task"] = bot._spawn( _flush_monitor(bot), name="voice-flush-monitor", ) def _stop_flush_task(bot): """Cancel the flush monitor.""" ps = _ps(bot) task = ps.get("flush_task") if task and not task.done(): task.cancel() ps["flush_task"] = None # -- Commands ---------------------------------------------------------------- @command("listen", help="Voice: !listen [on|off] -- toggle STT", tier="admin") async def cmd_listen(bot, message): """Toggle voice-to-text transcription.""" if not _is_mumble(bot): await bot.reply(message, "Voice is Mumble-only") return ps = _ps(bot) parts = message.text.split() if len(parts) < 2: state = "on" if ps["listen"] else "off" trigger = ps["trigger"] info = f"Listen: {state}" if trigger: info += f" | Trigger: {trigger}" await bot.reply(message, info) return sub = parts[1].lower() if sub == "on": ps["listen"] = True _ensure_listener(bot) _ensure_flush_task(bot) await bot.reply(message, "Listening for voice") elif sub == "off": ps["listen"] = False if not ps["trigger"]: with ps["lock"]: ps["buffers"].clear() ps["last_ts"].clear() _stop_flush_task(bot) await bot.reply(message, "Stopped listening") else: await bot.reply(message, "Usage: !listen [on|off]") @command("say", help="Voice: !say -- text-to-speech") async def cmd_say(bot, message): """Speak text aloud via Piper TTS.""" if not _is_mumble(bot): await bot.reply(message, "Voice is Mumble-only") return parts = message.text.split(None, 1) if len(parts) < 2: await bot.reply(message, "Usage: !say ") return text = parts[1].strip() if len(text) > _MAX_SAY_LEN: await bot.reply(message, f"Text too long (max {_MAX_SAY_LEN} chars)") return bot._spawn(_tts_play(bot, text), name="voice-tts") # -- Plugin lifecycle -------------------------------------------------------- async def on_connected(bot) -> None: """Re-register listener after reconnect; play TTS greeting on first join.""" if not _is_mumble(bot): return ps = _ps(bot) # TTS greeting on first connect greet = bot.config.get("mumble", {}).get("greet") if greet and not ps.get("_greeted"): ps["_greeted"] = True # Wait for audio subsystem to be ready for _ in range(20): if bot._is_audio_ready(): break await asyncio.sleep(0.5) bot._spawn(_tts_play(bot, greet), name="voice-greet") if ps["listen"] or ps["trigger"]: _ensure_listener(bot) _ensure_flush_task(bot)