"""Plugin: voice STT/TTS for Mumble channels. Listens for voice audio via pymumble's sound callback, buffers PCM per user, transcribes via Whisper STT on silence, and provides TTS playback via Piper. Commands: !listen, !say. """ from __future__ import annotations import asyncio import io import json import logging import math import struct import threading import time import urllib.request import wave from derp.http import urlopen as _urlopen from derp.plugin import command log = logging.getLogger(__name__) # -- Constants --------------------------------------------------------------- _SAMPLE_RATE = 48000 _CHANNELS = 1 _SAMPLE_WIDTH = 2 # s16le = 2 bytes per sample _SILENCE_GAP = 1.5 # seconds of silence before flushing _MIN_DURATION = 0.5 # discard utterances shorter than this _MAX_DURATION = 30.0 # cap buffer at this many seconds _MIN_BYTES = int(_MIN_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH) _MAX_BYTES = int(_MAX_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH) _FLUSH_INTERVAL = 0.5 # flush monitor poll interval _MAX_SAY_LEN = 500 # max characters for !say _WHISPER_URL = "http://192.168.129.9:8080/inference" _PIPER_URL = "http://192.168.129.9:5100/" # -- Per-bot state ----------------------------------------------------------- def _ps(bot): """Per-bot plugin runtime state.""" cfg = getattr(bot, "config", {}).get("voice", {}) trigger = cfg.get("trigger", "") # Bias Whisper toward the trigger word unless explicitly configured default_prompt = f"{trigger.capitalize()}, " if trigger else "" return bot._pstate.setdefault("voice", { "listen": False, "trigger": trigger, "buffers": {}, # {username: bytearray} "last_ts": {}, # {username: float monotonic} "flush_task": None, "lock": threading.Lock(), "silence_gap": cfg.get("silence_gap", _SILENCE_GAP), "whisper_url": cfg.get("whisper_url", _WHISPER_URL), "piper_url": cfg.get("piper_url", _PIPER_URL), "voice": cfg.get("voice", ""), "length_scale": cfg.get("length_scale", 1.0), "noise_scale": cfg.get("noise_scale", 0.667), "noise_w": cfg.get("noise_w", 0.8), "fx": cfg.get("fx", ""), "initial_prompt": cfg.get("initial_prompt", default_prompt), "_listener_registered": False, }) # -- Helpers ----------------------------------------------------------------- def _is_mumble(bot) -> bool: """Check if bot supports voice streaming.""" return hasattr(bot, "stream_audio") def _pcm_to_wav(pcm: bytes) -> bytes: """Wrap raw s16le 48kHz mono PCM in a WAV container.""" buf = io.BytesIO() with wave.open(buf, "wb") as wf: wf.setnchannels(_CHANNELS) wf.setsampwidth(_SAMPLE_WIDTH) wf.setframerate(_SAMPLE_RATE) wf.writeframes(pcm) return buf.getvalue() # -- Acknowledge tone -------------------------------------------------------- _ACK_FREQ = (880, 1320) # A5 -> E6 ascending _ACK_NOTE_DUR = 0.15 # seconds per note _ACK_AMP = 12000 # gentle amplitude _ACK_FRAME = 960 # 20ms at 48kHz, matches Mumble native async def _ack_tone(bot) -> None: """Play a short two-tone ascending chime via pymumble sound_output.""" mu = getattr(bot, "_mumble", None) if mu is None: return so = mu.sound_output if so is None: return # Unmute if self-muted (stream_audio handles re-mute later) if getattr(bot, "_self_mute", False): if bot._mute_task and not bot._mute_task.done(): bot._mute_task.cancel() bot._mute_task = None try: mu.users.myself.unmute() except Exception: pass frames_per_note = int(_ACK_NOTE_DUR / 0.02) # 0.02s per frame for freq in _ACK_FREQ: for i in range(frames_per_note): samples = [] for j in range(_ACK_FRAME): t = (i * _ACK_FRAME + j) / _SAMPLE_RATE samples.append(int(_ACK_AMP * math.sin(2 * math.pi * freq * t))) pcm = struct.pack(f"<{_ACK_FRAME}h", *samples) so.add_sound(pcm) while so.get_buffer_size() > 0.5: await asyncio.sleep(0.02) # Wait for tone to finish while so.get_buffer_size() > 0: await asyncio.sleep(0.05) # -- STT: Sound listener (pymumble thread) ---------------------------------- def _on_voice(bot, user, sound_chunk): """Buffer incoming voice PCM per user. Runs on pymumble thread.""" ps = _ps(bot) if not ps["listen"] and not ps["trigger"]: return try: name = user["name"] except (KeyError, TypeError): name = None if not name or name == bot.nick: return pcm = sound_chunk.pcm if not pcm: return with ps["lock"]: if name not in ps["buffers"]: ps["buffers"][name] = bytearray() buf = ps["buffers"][name] buf.extend(pcm) if len(buf) > _MAX_BYTES: ps["buffers"][name] = bytearray(buf[-_MAX_BYTES:]) ps["last_ts"][name] = time.monotonic() # -- STT: Whisper transcription --------------------------------------------- def _transcribe(ps, pcm: bytes) -> str: """POST PCM (as WAV) to Whisper and return transcribed text. Blocking.""" wav_data = _pcm_to_wav(pcm) boundary = "----derp_voice_boundary" body = ( f"--{boundary}\r\n" f'Content-Disposition: form-data; name="file"; filename="audio.wav"\r\n' f"Content-Type: audio/wav\r\n\r\n" ).encode() + wav_data + ( f"\r\n--{boundary}\r\n" f'Content-Disposition: form-data; name="response_format"\r\n\r\n' f"json" ).encode() # Bias Whisper toward the trigger word when configured prompt = ps.get("initial_prompt", "") if prompt: body += ( f"\r\n--{boundary}\r\n" f'Content-Disposition: form-data; name="initial_prompt"\r\n\r\n' f"{prompt}" ).encode() body += f"\r\n--{boundary}--\r\n".encode() req = urllib.request.Request(ps["whisper_url"], data=body, method="POST") req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}") resp = _urlopen(req, timeout=30, proxy=False) data = json.loads(resp.read()) resp.close() return data.get("text", "").strip() # -- STT: Flush monitor (asyncio background task) --------------------------- async def _flush_monitor(bot): """Poll for silence gaps and transcribe completed utterances.""" ps = _ps(bot) loop = asyncio.get_running_loop() try: while ps["listen"] or ps["trigger"]: await asyncio.sleep(_FLUSH_INTERVAL) now = time.monotonic() to_flush: list[tuple[str, bytes]] = [] with ps["lock"]: for name in list(ps["last_ts"]): elapsed = now - ps["last_ts"][name] if elapsed >= ps["silence_gap"] and name in ps["buffers"]: pcm = bytes(ps["buffers"].pop(name)) del ps["last_ts"][name] to_flush.append((name, pcm)) for name, pcm in to_flush: if len(pcm) < _MIN_BYTES: continue try: text = await loop.run_in_executor( None, _transcribe, ps, pcm, ) except Exception: log.exception("voice: transcription failed for %s", name) continue if not text or text.strip("., ") == "": continue trigger = ps["trigger"] if trigger and text.lower().startswith(trigger.lower()): remainder = text[len(trigger):].strip().lstrip(",.;:!?") if remainder: log.info("voice: trigger from %s: %s", name, remainder) bot._spawn( _tts_play(bot, remainder), name="voice-tts", ) continue if ps["listen"]: log.info("voice: %s said: %s", name, text) await bot.action("0", f"heard {name} say: {text}") except asyncio.CancelledError: pass except Exception: log.exception("voice: flush monitor error") # -- TTS: Piper fetch + playback -------------------------------------------- def _fetch_tts(piper_url: str, text: str) -> str | None: """POST text to Piper TTS and save the WAV response. Blocking.""" import tempfile try: payload = json.dumps({"text": text}).encode() req = urllib.request.Request( piper_url, data=payload, method="POST", ) req.add_header("Content-Type", "application/json") resp = _urlopen(req, timeout=30, proxy=False) data = resp.read() resp.close() if not data: return None tmp = tempfile.NamedTemporaryFile( suffix=".wav", prefix="derp_tts_", delete=False, ) tmp.write(data) tmp.close() return tmp.name except Exception: log.exception("voice: TTS fetch failed") return None async def _tts_play(bot, text: str): """Fetch TTS audio and play it via stream_audio. Uses the configured voice profile (voice, fx, piper params) when set, otherwise falls back to Piper's default voice. """ from pathlib import Path ps = _ps(bot) loop = asyncio.get_running_loop() if ps["voice"] or ps["fx"]: wav_path = await loop.run_in_executor( None, lambda: _fetch_tts_voice( ps["piper_url"], text, voice=ps["voice"], length_scale=ps["length_scale"], noise_scale=ps["noise_scale"], noise_w=ps["noise_w"], fx=ps["fx"], ), ) else: wav_path = await loop.run_in_executor( None, _fetch_tts, ps["piper_url"], text, ) if wav_path is None: return try: # Signal music plugin to duck, wait for it to take effect bot.registry._tts_active = True await asyncio.sleep(1.5) await _ack_tone(bot) done = asyncio.Event() await bot.stream_audio(str(wav_path), volume=1.0, on_done=done) await done.wait() finally: bot.registry._tts_active = False Path(wav_path).unlink(missing_ok=True) # -- Listener lifecycle ----------------------------------------------------- def _ensure_listener(bot): """Register the sound listener callback (idempotent).""" ps = _ps(bot) if ps["_listener_registered"]: return if not hasattr(bot, "_sound_listeners"): return bot._sound_listeners.append(lambda user, chunk: _on_voice(bot, user, chunk)) ps["_listener_registered"] = True log.info("voice: registered sound listener") def _ensure_flush_task(bot): """Start the flush monitor if not running.""" ps = _ps(bot) task = ps.get("flush_task") if task and not task.done(): return ps["flush_task"] = bot._spawn( _flush_monitor(bot), name="voice-flush-monitor", ) def _stop_flush_task(bot): """Cancel the flush monitor.""" ps = _ps(bot) task = ps.get("flush_task") if task and not task.done(): task.cancel() ps["flush_task"] = None # -- Commands ---------------------------------------------------------------- @command("listen", help="Voice: !listen [on|off] -- toggle STT", tier="admin") async def cmd_listen(bot, message): """Toggle voice-to-text transcription.""" if not _is_mumble(bot): await bot.reply(message, "Voice is Mumble-only") return ps = _ps(bot) parts = message.text.split() if len(parts) < 2: state = "on" if ps["listen"] else "off" trigger = ps["trigger"] info = f"Listen: {state}" if trigger: info += f" | Trigger: {trigger}" await bot.reply(message, info) return sub = parts[1].lower() if sub == "on": ps["listen"] = True _ensure_listener(bot) _ensure_flush_task(bot) await bot.reply(message, "Listening for voice") elif sub == "off": ps["listen"] = False if not ps["trigger"]: with ps["lock"]: ps["buffers"].clear() ps["last_ts"].clear() _stop_flush_task(bot) await bot.reply(message, "Stopped listening") else: await bot.reply(message, "Usage: !listen [on|off]") @command("say", help="Voice: !say -- text-to-speech") async def cmd_say(bot, message): """Speak text aloud via Piper TTS.""" if not _is_mumble(bot): await bot.reply(message, "Voice is Mumble-only") return parts = message.text.split(None, 1) if len(parts) < 2: await bot.reply(message, "Usage: !say ") return text = parts[1].strip() if len(text) > _MAX_SAY_LEN: await bot.reply(message, f"Text too long (max {_MAX_SAY_LEN} chars)") return bot._spawn(_tts_play(bot, text), name="voice-tts") def _split_fx(fx: str) -> tuple[list[str], str]: """Split FX chain into rubberband CLI args and ffmpeg filter string. Alpine's ffmpeg lacks librubberband, so pitch shifting is handled by the ``rubberband`` CLI tool and remaining filters by ffmpeg. """ import math parts = fx.split(",") rb_args: list[str] = [] ff_parts: list[str] = [] for part in parts: if part.startswith("rubberband="): opts: dict[str, str] = {} for kv in part[len("rubberband="):].split(":"): k, _, v = kv.partition("=") opts[k] = v if "pitch" in opts: semitones = 12 * math.log2(float(opts["pitch"])) rb_args += ["--pitch", f"{semitones:.2f}"] if opts.get("formant") == "1": rb_args.append("--formant") else: ff_parts.append(part) return rb_args, ",".join(ff_parts) def _fetch_tts_voice(piper_url: str, text: str, *, voice: str = "", speaker_id: int = 0, length_scale: float = 1.0, noise_scale: float = 0.667, noise_w: float = 0.8, fx: str = "") -> str | None: """Fetch TTS with explicit voice params and optional FX. Blocking. Pitch shifting uses the ``rubberband`` CLI (Alpine ffmpeg has no librubberband); remaining audio filters go through ffmpeg. """ import os import subprocess import tempfile payload = {"text": text} if voice: payload["voice"] = voice if speaker_id: payload["speaker_id"] = speaker_id payload["length_scale"] = length_scale payload["noise_scale"] = noise_scale payload["noise_w"] = noise_w data = json.dumps(payload).encode() req = urllib.request.Request(piper_url, data=data, method="POST") req.add_header("Content-Type", "application/json") resp = _urlopen(req, timeout=30, proxy=False) wav_data = resp.read() resp.close() if not wav_data: return None tmp = tempfile.NamedTemporaryFile(suffix=".wav", prefix="derp_aud_", delete=False) tmp.write(wav_data) tmp.close() if not fx: return tmp.name rb_args, ff_filters = _split_fx(fx) current = tmp.name # Pitch shift via rubberband CLI if rb_args: rb_out = tempfile.NamedTemporaryFile( suffix=".wav", prefix="derp_aud_", delete=False, ) rb_out.close() r = subprocess.run( ["rubberband"] + rb_args + [current, rb_out.name], capture_output=True, timeout=15, ) os.unlink(current) if r.returncode != 0: log.warning("voice: rubberband failed: %s", r.stderr[:200]) os.unlink(rb_out.name) return None current = rb_out.name # Remaining filters via ffmpeg if ff_filters: ff_out = tempfile.NamedTemporaryFile( suffix=".wav", prefix="derp_aud_", delete=False, ) ff_out.close() r = subprocess.run( ["ffmpeg", "-y", "-i", current, "-af", ff_filters, ff_out.name], capture_output=True, timeout=15, ) os.unlink(current) if r.returncode != 0: log.warning("voice: ffmpeg failed: %s", r.stderr[:200]) os.unlink(ff_out.name) return None current = ff_out.name return current @command("audition", help="Voice: !audition -- play voice samples", tier="admin") async def cmd_audition(bot, message): """Play voice samples through Mumble for comparison.""" if not _is_mumble(bot): return ps = _ps(bot) piper_url = ps["piper_url"] phrase = "The sorcerer has arrived. I have seen things beyond your understanding." # FX building blocks _deep = "rubberband=pitch=0.87:formant=1" _bass = "bass=g=6:f=110:w=0.6" _bass_heavy = "equalizer=f=80:t=h:w=150:g=8" _echo_subtle = "aecho=0.8:0.6:25|40:0.25|0.15" _echo_chamber = "aecho=0.8:0.88:60:0.35" _echo_cave = "aecho=0.8:0.7:40|70|100:0.3|0.2|0.1" samples = [ # -- Base voices (no FX) for reference ("ryan-high raw", "en_US-ryan-high", 0, ""), ("lessac-high raw", "en_US-lessac-high", 0, ""), # -- Deep pitch only ("ryan deep", "en_US-ryan-high", 0, _deep), ("lessac deep", "en_US-lessac-high", 0, _deep), # -- Deep + bass boost ("ryan deep+bass", "en_US-ryan-high", 0, f"{_deep},{_bass}"), ("lessac deep+bass", "en_US-lessac-high", 0, f"{_deep},{_bass}"), # -- Deep + heavy bass ("ryan deep+heavy bass", "en_US-ryan-high", 0, f"{_deep},{_bass_heavy}"), # -- Deep + bass + subtle echo ("ryan deep+bass+echo", "en_US-ryan-high", 0, f"{_deep},{_bass},{_echo_subtle}"), ("lessac deep+bass+echo", "en_US-lessac-high", 0, f"{_deep},{_bass},{_echo_subtle}"), # -- Deep + bass + chamber reverb ("ryan deep+bass+chamber", "en_US-ryan-high", 0, f"{_deep},{_bass},{_echo_chamber}"), ("lessac deep+bass+chamber", "en_US-lessac-high", 0, f"{_deep},{_bass},{_echo_chamber}"), # -- Deep + heavy bass + cave reverb ("ryan deep+heavybass+cave", "en_US-ryan-high", 0, f"{_deep},{_bass_heavy},{_echo_cave}"), # -- Libritts best candidates with full sorcerer chain ("libritts #20 deep+bass+echo", "en_US-libritts_r-medium", 20, f"{_deep},{_bass},{_echo_subtle}"), ("libritts #22 deep+bass+echo", "en_US-libritts_r-medium", 22, f"{_deep},{_bass},{_echo_subtle}"), ("libritts #79 deep+bass+chamber", "en_US-libritts_r-medium", 79, f"{_deep},{_bass},{_echo_chamber}"), ] await bot.reply(message, f"Auditioning {len(samples)} voice samples...") loop = asyncio.get_running_loop() from pathlib import Path for i, (label, voice, sid, fx) in enumerate(samples, 1): await bot.send("0", f"[{i}/{len(samples)}] {label}") await asyncio.sleep(1) sample_wav = await loop.run_in_executor( None, lambda v=voice, s=sid, f=fx: _fetch_tts_voice( piper_url, phrase, voice=v, speaker_id=s, length_scale=1.15, noise_scale=0.4, noise_w=0.5, fx=f, ), ) if sample_wav is None: await bot.send("0", " (failed)") continue try: done = asyncio.Event() await bot.stream_audio(sample_wav, volume=1.0, on_done=done) await done.wait() finally: Path(sample_wav).unlink(missing_ok=True) await asyncio.sleep(2) await bot.send("0", "Audition complete.") # -- Plugin lifecycle -------------------------------------------------------- async def on_connected(bot) -> None: """Re-register listener after reconnect; play TTS greeting on first connect.""" if not _is_mumble(bot): return ps = _ps(bot) if ps["listen"] or ps["trigger"]: _ensure_listener(bot) _ensure_flush_task(bot) # Greet via TTS on first connection only greet = getattr(bot, "config", {}).get("mumble", {}).get("greet") if greet and not ps.get("_greeted"): ps["_greeted"] = True ready = getattr(bot, "_is_audio_ready", None) if ready: for _ in range(20): if ready(): break await asyncio.sleep(0.5) bot._spawn(_tts_play(bot, greet), name="voice-greet")