Files
derp/plugins/voice.py
user eded764f6a fix: update Piper TTS endpoint and request format
Piper is on 192.168.129.9:5100, not :5000. It expects POST with
JSON body {"text": "..."}, not GET with query params. Also update
Whisper default to 192.168.129.9.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 04:56:24 +01:00

348 lines
11 KiB
Python

"""Plugin: voice STT/TTS for Mumble channels.
Listens for voice audio via pymumble's sound callback, buffers PCM per
user, transcribes via Whisper STT on silence, and provides TTS playback
via Piper. Commands: !listen, !say.
"""
from __future__ import annotations
import asyncio
import io
import json
import logging
import threading
import time
import wave
import urllib.request
from derp.http import urlopen as _urlopen
from derp.plugin import command
log = logging.getLogger(__name__)
# -- Constants ---------------------------------------------------------------
_SAMPLE_RATE = 48000   # Mumble/pymumble native rate (Hz)
_CHANNELS = 1          # mono capture
_SAMPLE_WIDTH = 2      # s16le = 2 bytes per sample
_SILENCE_GAP = 1.5     # seconds of silence before flushing an utterance
_MIN_DURATION = 0.5    # discard utterances shorter than this (seconds)
_MAX_DURATION = 30.0   # cap buffer at this many seconds
# Bytes per second = rate * width * channels; the original omitted
# _CHANNELS, which was only correct because _CHANNELS == 1.
_MIN_BYTES = int(_MIN_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH * _CHANNELS)
_MAX_BYTES = int(_MAX_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH * _CHANNELS)
_FLUSH_INTERVAL = 0.5  # flush monitor poll interval (seconds)
_MAX_SAY_LEN = 500     # max characters for !say
_WHISPER_URL = "http://192.168.129.9:8080/inference"
_PIPER_URL = "http://192.168.129.9:5100/"
# -- Per-bot state -----------------------------------------------------------
def _ps(bot):
    """Return this bot's voice-plugin state dict, creating it on first use."""
    cfg = getattr(bot, "config", {}).get("voice", {})
    fresh = dict(
        listen=False,
        trigger=cfg.get("trigger", ""),
        buffers={},        # {username: bytearray of pending PCM}
        last_ts={},        # {username: monotonic time of last audio}
        flush_task=None,
        lock=threading.Lock(),
        silence_gap=cfg.get("silence_gap", _SILENCE_GAP),
        whisper_url=cfg.get("whisper_url", _WHISPER_URL),
        piper_url=cfg.get("piper_url", _PIPER_URL),
        _listener_registered=False,
    )
    return bot._pstate.setdefault("voice", fresh)
# -- Helpers -----------------------------------------------------------------
def _is_mumble(bot) -> bool:
    """Whether *bot* exposes voice streaming (i.e. supports this plugin)."""
    supports_voice = hasattr(bot, "stream_audio")
    return supports_voice
def _pcm_to_wav(pcm: bytes) -> bytes:
    """Return *pcm* (raw s16le, 48 kHz, mono) wrapped in a WAV container."""
    out = io.BytesIO()
    writer = wave.open(out, "wb")
    try:
        writer.setnchannels(_CHANNELS)
        writer.setsampwidth(_SAMPLE_WIDTH)
        writer.setframerate(_SAMPLE_RATE)
        writer.writeframes(pcm)
    finally:
        writer.close()
    return out.getvalue()
# -- STT: Sound listener (pymumble thread) ----------------------------------
def _on_voice(bot, user, sound_chunk):
    """Buffer incoming voice PCM per user. Runs on pymumble thread.

    Appends the chunk's raw PCM to the speaking user's buffer and stamps
    the time of last activity; the asyncio flush monitor later collects
    buffers once a silence gap elapses. No-op when neither listen mode
    nor a voice trigger is active, and for the bot's own audio.
    """
    ps = _ps(bot)
    # Nothing will consume the buffers unless listening or a trigger is set.
    if not ps["listen"] and not ps["trigger"]:
        return
    try:
        name = user["name"]
    except (KeyError, TypeError):
        name = None
    if not name or name == bot.nick:
        return
    pcm = sound_chunk.pcm
    if not pcm:
        return
    # Buffers and timestamps are shared with the asyncio flush monitor,
    # so all mutation happens under the lock.
    with ps["lock"]:
        if name not in ps["buffers"]:
            ps["buffers"][name] = bytearray()
        buf = ps["buffers"][name]
        buf.extend(pcm)
        # Cap runaway buffers: keep only the most recent _MAX_BYTES.
        if len(buf) > _MAX_BYTES:
            ps["buffers"][name] = bytearray(buf[-_MAX_BYTES:])
        ps["last_ts"][name] = time.monotonic()
# -- STT: Whisper transcription ---------------------------------------------
def _transcribe(ps, pcm: bytes) -> str:
    """POST PCM (as WAV) to Whisper and return transcribed text. Blocking.

    Builds a multipart/form-data request containing the WAV payload and a
    response_format=json field, then extracts "text" from the JSON reply.
    Intended to run in an executor (see _flush_monitor).
    """
    wav_data = _pcm_to_wav(pcm)
    boundary = "----derp_voice_boundary"
    body = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="file"; filename="audio.wav"\r\n'
        f"Content-Type: audio/wav\r\n\r\n"
    ).encode() + wav_data + (
        f"\r\n--{boundary}\r\n"
        f'Content-Disposition: form-data; name="response_format"\r\n\r\n'
        f"json\r\n--{boundary}--\r\n"
    ).encode()
    req = urllib.request.Request(ps["whisper_url"], data=body, method="POST")
    req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
    resp = _urlopen(req, timeout=30, proxy=False)
    try:
        # Close the response even when read()/json parsing raises
        # (the original leaked the connection on that path).
        data = json.loads(resp.read())
    finally:
        resp.close()
    return data.get("text", "").strip()
# -- STT: Flush monitor (asyncio background task) ---------------------------
async def _flush_monitor(bot):
    """Poll for silence gaps and transcribe completed utterances.

    Background asyncio task. Every _FLUSH_INTERVAL seconds it collects —
    under the lock — each user buffer that has been silent for at least
    the configured gap, then transcribes the audio off-loop via an
    executor. Trigger-prefixed utterances are spoken back via TTS; plain
    utterances are announced when listen mode is on. The loop exits once
    both listen mode and the trigger are disabled.
    """
    ps = _ps(bot)
    loop = asyncio.get_running_loop()
    try:
        while ps["listen"] or ps["trigger"]:
            await asyncio.sleep(_FLUSH_INTERVAL)
            now = time.monotonic()
            to_flush: list[tuple[str, bytes]] = []
            # Snapshot-and-remove finished utterances while holding the
            # lock; the slow transcription happens outside it.
            with ps["lock"]:
                for name in list(ps["last_ts"]):
                    elapsed = now - ps["last_ts"][name]
                    if elapsed >= ps["silence_gap"] and name in ps["buffers"]:
                        pcm = bytes(ps["buffers"].pop(name))
                        del ps["last_ts"][name]
                        to_flush.append((name, pcm))
            for name, pcm in to_flush:
                # Skip blips shorter than _MIN_DURATION.
                if len(pcm) < _MIN_BYTES:
                    continue
                try:
                    text = await loop.run_in_executor(
                        None, _transcribe, ps, pcm,
                    )
                except Exception:
                    log.exception("voice: transcription failed for %s", name)
                    continue
                # Drop empty results and punctuation-only noise.
                if not text or text.strip("., ") == "":
                    continue
                trigger = ps["trigger"]
                if trigger and text.lower().startswith(trigger.lower()):
                    # Trigger phrase heard: speak the remainder back via
                    # TTS and never fall through to the listen announce.
                    remainder = text[len(trigger):].strip()
                    if remainder:
                        log.info("voice: trigger from %s: %s", name, remainder)
                        bot._spawn(
                            _tts_play(bot, remainder), name="voice-tts",
                        )
                    continue
                if ps["listen"]:
                    log.info("voice: %s said: %s", name, text)
                    await bot.action("0", f"heard {name} say: {text}")
    except asyncio.CancelledError:
        # Normal shutdown path via _stop_flush_task.
        pass
    except Exception:
        log.exception("voice: flush monitor error")
# -- TTS: Piper fetch + playback --------------------------------------------
def _fetch_tts(piper_url: str, text: str) -> str | None:
    """POST *text* to Piper TTS and save the WAV reply to a temp file.

    Blocking; intended to run in an executor. Returns the temp-file path,
    or None on any failure or empty response (best-effort: errors are
    logged, not raised). The caller is responsible for deleting the file.
    """
    import os
    import tempfile
    try:
        payload = json.dumps({"text": text}).encode()
        req = urllib.request.Request(
            piper_url, data=payload, method="POST",
        )
        req.add_header("Content-Type", "application/json")
        resp = _urlopen(req, timeout=30, proxy=False)
        try:
            # Close the response even if read() fails (original leaked it).
            data = resp.read()
        finally:
            resp.close()
        if not data:
            return None
        fd, path = tempfile.mkstemp(suffix=".wav", prefix="derp_tts_")
        try:
            with os.fdopen(fd, "wb") as fh:
                fh.write(data)
        except Exception:
            # Don't leak the temp file if the write fails.
            os.unlink(path)
            raise
        return path
    except Exception:
        log.exception("voice: TTS fetch failed")
        return None
async def _tts_play(bot, text: str):
    """Synthesize *text* via Piper and play the WAV through the bot.

    Fetches the audio in an executor, streams it via bot.stream_audio,
    waits for playback to finish, and always removes the temp file.
    """
    from pathlib import Path
    state = _ps(bot)
    loop = asyncio.get_running_loop()
    wav_path = await loop.run_in_executor(
        None, _fetch_tts, state["piper_url"], text,
    )
    if wav_path is None:
        return
    try:
        finished = asyncio.Event()
        await bot.stream_audio(str(wav_path), volume=1.0, on_done=finished)
        await finished.wait()
    finally:
        Path(wav_path).unlink(missing_ok=True)
# -- Listener lifecycle -----------------------------------------------------
def _ensure_listener(bot):
    """Attach the plugin's sound callback to the bot exactly once."""
    ps = _ps(bot)
    if ps["_listener_registered"] or not hasattr(bot, "_sound_listeners"):
        return

    def _relay(user, chunk):
        _on_voice(bot, user, chunk)

    bot._sound_listeners.append(_relay)
    ps["_listener_registered"] = True
    log.info("voice: registered sound listener")
def _ensure_flush_task(bot):
    """Spawn the flush monitor unless one is already running."""
    ps = _ps(bot)
    current = ps.get("flush_task")
    if current is not None and not current.done():
        return
    ps["flush_task"] = bot._spawn(
        _flush_monitor(bot), name="voice-flush-monitor",
    )
def _stop_flush_task(bot):
    """Cancel the flush monitor task, if any, and clear its slot."""
    ps = _ps(bot)
    running = ps.get("flush_task")
    if running is not None and not running.done():
        running.cancel()
    ps["flush_task"] = None
# -- Commands ----------------------------------------------------------------
@command("listen", help="Voice: !listen [on|off] -- toggle STT", tier="admin")
async def cmd_listen(bot, message):
    """Toggle voice-to-text transcription (admin tier)."""
    if not _is_mumble(bot):
        await bot.reply(message, "Voice is Mumble-only")
        return
    ps = _ps(bot)
    args = message.text.split()
    if len(args) < 2:
        # Bare "!listen": report the current state.
        pieces = ["Listen: " + ("on" if ps["listen"] else "off")]
        if ps["trigger"]:
            pieces.append(f"Trigger: {ps['trigger']}")
        await bot.reply(message, " | ".join(pieces))
        return
    mode = args[1].lower()
    if mode == "on":
        ps["listen"] = True
        _ensure_listener(bot)
        _ensure_flush_task(bot)
        await bot.reply(message, "Listening for voice")
    elif mode == "off":
        ps["listen"] = False
        # Tear down buffers and the monitor only if the voice trigger
        # isn't keeping the pipeline alive.
        if not ps["trigger"]:
            with ps["lock"]:
                ps["buffers"].clear()
                ps["last_ts"].clear()
            _stop_flush_task(bot)
        await bot.reply(message, "Stopped listening")
    else:
        await bot.reply(message, "Usage: !listen [on|off]")
@command("say", help="Voice: !say <text> -- text-to-speech")
async def cmd_say(bot, message):
    """Speak text aloud via Piper TTS."""
    if not _is_mumble(bot):
        await bot.reply(message, "Voice is Mumble-only")
        return
    pieces = message.text.split(None, 1)
    if len(pieces) < 2:
        await bot.reply(message, "Usage: !say <text>")
        return
    text = pieces[1].strip()
    if len(text) > _MAX_SAY_LEN:
        await bot.reply(message, f"Text too long (max {_MAX_SAY_LEN} chars)")
        return
    # Playback runs fire-and-forget on a background task.
    bot._spawn(_tts_play(bot, text), name="voice-tts")
# -- Plugin lifecycle --------------------------------------------------------
async def on_connected(bot) -> None:
    """Re-register listener after reconnect; play TTS greeting on first join."""
    if not _is_mumble(bot):
        return
    ps = _ps(bot)
    greeting = bot.config.get("mumble", {}).get("greet")
    # Greeting is played once per process, on the first connect only.
    if greeting and not ps.get("_greeted"):
        ps["_greeted"] = True
        # Poll (up to ~10 s) until the audio subsystem is ready.
        for _ in range(20):
            if bot._is_audio_ready():
                break
            await asyncio.sleep(0.5)
        bot._spawn(_tts_play(bot, greeting), name="voice-greet")
    if ps["listen"] or ps["trigger"]:
        # Restore the sound listener and flush monitor after (re)connect.
        _ensure_listener(bot)
        _ensure_flush_task(bot)