feat: add voice plugin with STT and TTS
Whisper STT: buffers incoming voice PCM per user, transcribes on silence gap via local whisper.cpp endpoint, posts results as actions. Piper TTS: !say fetches WAV from local Piper endpoint and plays via stream_audio(). 37 tests cover buffering, flush logic, transcription, WAV encoding, commands, and lifecycle.
This commit is contained in:
309
plugins/voice.py
Normal file
309
plugins/voice.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""Plugin: voice STT/TTS for Mumble channels.
|
||||
|
||||
Listens for voice audio via pymumble's sound callback, buffers PCM per
|
||||
user, transcribes via Whisper STT on silence, and provides TTS playback
|
||||
via Piper. Commands: !listen, !say.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import wave
|
||||
from urllib.parse import quote_plus
|
||||
import urllib.request
|
||||
|
||||
from derp.http import urlopen as _urlopen
|
||||
from derp.plugin import command
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# -- Constants ---------------------------------------------------------------
|
||||
|
||||
_SAMPLE_RATE = 48000
|
||||
_CHANNELS = 1
|
||||
_SAMPLE_WIDTH = 2 # s16le = 2 bytes per sample
|
||||
|
||||
_SILENCE_GAP = 1.5 # seconds of silence before flushing
|
||||
_MIN_DURATION = 0.5 # discard utterances shorter than this
|
||||
_MAX_DURATION = 30.0 # cap buffer at this many seconds
|
||||
_MIN_BYTES = int(_MIN_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH)
|
||||
_MAX_BYTES = int(_MAX_DURATION * _SAMPLE_RATE * _SAMPLE_WIDTH)
|
||||
_FLUSH_INTERVAL = 0.5 # flush monitor poll interval
|
||||
_MAX_SAY_LEN = 500 # max characters for !say
|
||||
|
||||
_WHISPER_URL = "http://192.168.122.1:8080/inference"
|
||||
_PIPER_URL = "http://192.168.122.1:5000/"
|
||||
|
||||
# -- Per-bot state -----------------------------------------------------------
|
||||
|
||||
|
||||
def _ps(bot):
|
||||
"""Per-bot plugin runtime state."""
|
||||
cfg = getattr(bot, "config", {}).get("voice", {})
|
||||
return bot._pstate.setdefault("voice", {
|
||||
"listen": False,
|
||||
"buffers": {}, # {username: bytearray}
|
||||
"last_ts": {}, # {username: float monotonic}
|
||||
"flush_task": None,
|
||||
"lock": threading.Lock(),
|
||||
"silence_gap": cfg.get("silence_gap", _SILENCE_GAP),
|
||||
"whisper_url": cfg.get("whisper_url", _WHISPER_URL),
|
||||
"piper_url": cfg.get("piper_url", _PIPER_URL),
|
||||
"_listener_registered": False,
|
||||
})
|
||||
|
||||
|
||||
# -- Helpers -----------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_mumble(bot) -> bool:
|
||||
"""Check if bot supports voice streaming."""
|
||||
return hasattr(bot, "stream_audio")
|
||||
|
||||
|
||||
def _pcm_to_wav(pcm: bytes) -> bytes:
|
||||
"""Wrap raw s16le 48kHz mono PCM in a WAV container."""
|
||||
buf = io.BytesIO()
|
||||
with wave.open(buf, "wb") as wf:
|
||||
wf.setnchannels(_CHANNELS)
|
||||
wf.setsampwidth(_SAMPLE_WIDTH)
|
||||
wf.setframerate(_SAMPLE_RATE)
|
||||
wf.writeframes(pcm)
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
# -- STT: Sound listener (pymumble thread) ----------------------------------
|
||||
|
||||
|
||||
def _on_voice(bot, user, sound_chunk):
|
||||
"""Buffer incoming voice PCM per user. Runs on pymumble thread."""
|
||||
ps = _ps(bot)
|
||||
if not ps["listen"]:
|
||||
return
|
||||
name = user["name"] if isinstance(user, dict) else None
|
||||
if not name or name == bot.nick:
|
||||
return
|
||||
pcm = sound_chunk.pcm
|
||||
if not pcm:
|
||||
return
|
||||
with ps["lock"]:
|
||||
if name not in ps["buffers"]:
|
||||
ps["buffers"][name] = bytearray()
|
||||
buf = ps["buffers"][name]
|
||||
buf.extend(pcm)
|
||||
if len(buf) > _MAX_BYTES:
|
||||
ps["buffers"][name] = bytearray(buf[-_MAX_BYTES:])
|
||||
ps["last_ts"][name] = time.monotonic()
|
||||
|
||||
|
||||
# -- STT: Whisper transcription ---------------------------------------------
|
||||
|
||||
|
||||
def _transcribe(ps, pcm: bytes) -> str:
|
||||
"""POST PCM (as WAV) to Whisper and return transcribed text. Blocking."""
|
||||
wav_data = _pcm_to_wav(pcm)
|
||||
boundary = "----derp_voice_boundary"
|
||||
body = (
|
||||
f"--{boundary}\r\n"
|
||||
f'Content-Disposition: form-data; name="file"; filename="audio.wav"\r\n'
|
||||
f"Content-Type: audio/wav\r\n\r\n"
|
||||
).encode() + wav_data + (
|
||||
f"\r\n--{boundary}\r\n"
|
||||
f'Content-Disposition: form-data; name="response_format"\r\n\r\n'
|
||||
f"json\r\n--{boundary}--\r\n"
|
||||
).encode()
|
||||
req = urllib.request.Request(ps["whisper_url"], data=body, method="POST")
|
||||
req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
|
||||
resp = _urlopen(req, timeout=30, proxy=False)
|
||||
data = json.loads(resp.read())
|
||||
resp.close()
|
||||
return data.get("text", "").strip()
|
||||
|
||||
|
||||
# -- STT: Flush monitor (asyncio background task) ---------------------------
|
||||
|
||||
|
||||
async def _flush_monitor(bot):
|
||||
"""Poll for silence gaps and transcribe completed utterances."""
|
||||
ps = _ps(bot)
|
||||
loop = asyncio.get_running_loop()
|
||||
try:
|
||||
while ps["listen"]:
|
||||
await asyncio.sleep(_FLUSH_INTERVAL)
|
||||
now = time.monotonic()
|
||||
to_flush: list[tuple[str, bytes]] = []
|
||||
|
||||
with ps["lock"]:
|
||||
for name in list(ps["last_ts"]):
|
||||
elapsed = now - ps["last_ts"][name]
|
||||
if elapsed >= ps["silence_gap"] and name in ps["buffers"]:
|
||||
pcm = bytes(ps["buffers"].pop(name))
|
||||
del ps["last_ts"][name]
|
||||
to_flush.append((name, pcm))
|
||||
|
||||
for name, pcm in to_flush:
|
||||
if len(pcm) < _MIN_BYTES:
|
||||
continue
|
||||
try:
|
||||
text = await loop.run_in_executor(
|
||||
None, _transcribe, ps, pcm,
|
||||
)
|
||||
except Exception:
|
||||
log.exception("voice: transcription failed for %s", name)
|
||||
continue
|
||||
if not text or text.strip("., ") == "":
|
||||
continue
|
||||
log.info("voice: %s said: %s", name, text)
|
||||
await bot.action("0", f"heard {name} say: {text}")
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception:
|
||||
log.exception("voice: flush monitor error")
|
||||
|
||||
|
||||
# -- TTS: Piper fetch + playback --------------------------------------------
|
||||
|
||||
|
||||
def _fetch_tts(url: str) -> str | None:
|
||||
"""Fetch TTS WAV from Piper to a temp file. Blocking."""
|
||||
import tempfile
|
||||
try:
|
||||
resp = _urlopen(url, timeout=30, proxy=False)
|
||||
data = resp.read()
|
||||
resp.close()
|
||||
if not data:
|
||||
return None
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
suffix=".wav", prefix="derp_tts_", delete=False,
|
||||
)
|
||||
tmp.write(data)
|
||||
tmp.close()
|
||||
return tmp.name
|
||||
except Exception:
|
||||
log.exception("voice: TTS fetch failed")
|
||||
return None
|
||||
|
||||
|
||||
async def _tts_play(bot, text: str):
|
||||
"""Fetch TTS audio and play it via stream_audio."""
|
||||
from pathlib import Path
|
||||
|
||||
ps = _ps(bot)
|
||||
url = ps["piper_url"] + "?text=" + quote_plus(text)
|
||||
loop = asyncio.get_running_loop()
|
||||
wav_path = await loop.run_in_executor(None, _fetch_tts, url)
|
||||
if wav_path is None:
|
||||
return
|
||||
try:
|
||||
done = asyncio.Event()
|
||||
await bot.stream_audio(str(wav_path), volume=1.0, on_done=done)
|
||||
await done.wait()
|
||||
finally:
|
||||
Path(wav_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
# -- Listener lifecycle -----------------------------------------------------
|
||||
|
||||
|
||||
def _ensure_listener(bot):
|
||||
"""Register the sound listener callback (idempotent)."""
|
||||
ps = _ps(bot)
|
||||
if ps["_listener_registered"]:
|
||||
return
|
||||
if not hasattr(bot, "_sound_listeners"):
|
||||
return
|
||||
bot._sound_listeners.append(lambda user, chunk: _on_voice(bot, user, chunk))
|
||||
ps["_listener_registered"] = True
|
||||
log.info("voice: registered sound listener")
|
||||
|
||||
|
||||
def _ensure_flush_task(bot):
|
||||
"""Start the flush monitor if not running."""
|
||||
ps = _ps(bot)
|
||||
task = ps.get("flush_task")
|
||||
if task and not task.done():
|
||||
return
|
||||
ps["flush_task"] = bot._spawn(
|
||||
_flush_monitor(bot), name="voice-flush-monitor",
|
||||
)
|
||||
|
||||
|
||||
def _stop_flush_task(bot):
|
||||
"""Cancel the flush monitor."""
|
||||
ps = _ps(bot)
|
||||
task = ps.get("flush_task")
|
||||
if task and not task.done():
|
||||
task.cancel()
|
||||
ps["flush_task"] = None
|
||||
|
||||
|
||||
# -- Commands ----------------------------------------------------------------
|
||||
|
||||
|
||||
@command("listen", help="Voice: !listen [on|off] -- toggle STT", tier="admin")
|
||||
async def cmd_listen(bot, message):
|
||||
"""Toggle voice-to-text transcription."""
|
||||
if not _is_mumble(bot):
|
||||
await bot.reply(message, "Voice is Mumble-only")
|
||||
return
|
||||
|
||||
ps = _ps(bot)
|
||||
parts = message.text.split()
|
||||
if len(parts) < 2:
|
||||
state = "on" if ps["listen"] else "off"
|
||||
await bot.reply(message, f"Listen: {state}")
|
||||
return
|
||||
|
||||
sub = parts[1].lower()
|
||||
if sub == "on":
|
||||
ps["listen"] = True
|
||||
_ensure_listener(bot)
|
||||
_ensure_flush_task(bot)
|
||||
await bot.reply(message, "Listening for voice")
|
||||
elif sub == "off":
|
||||
ps["listen"] = False
|
||||
with ps["lock"]:
|
||||
ps["buffers"].clear()
|
||||
ps["last_ts"].clear()
|
||||
_stop_flush_task(bot)
|
||||
await bot.reply(message, "Stopped listening")
|
||||
else:
|
||||
await bot.reply(message, "Usage: !listen [on|off]")
|
||||
|
||||
|
||||
@command("say", help="Voice: !say <text> -- text-to-speech")
|
||||
async def cmd_say(bot, message):
|
||||
"""Speak text aloud via Piper TTS."""
|
||||
if not _is_mumble(bot):
|
||||
await bot.reply(message, "Voice is Mumble-only")
|
||||
return
|
||||
|
||||
parts = message.text.split(None, 1)
|
||||
if len(parts) < 2:
|
||||
await bot.reply(message, "Usage: !say <text>")
|
||||
return
|
||||
|
||||
text = parts[1].strip()
|
||||
if len(text) > _MAX_SAY_LEN:
|
||||
await bot.reply(message, f"Text too long (max {_MAX_SAY_LEN} chars)")
|
||||
return
|
||||
|
||||
bot._spawn(_tts_play(bot, text), name="voice-tts")
|
||||
|
||||
|
||||
# -- Plugin lifecycle --------------------------------------------------------
|
||||
|
||||
|
||||
async def on_connected(bot) -> None:
|
||||
"""Re-register listener after reconnect if listen was on."""
|
||||
if not _is_mumble(bot):
|
||||
return
|
||||
ps = _ps(bot)
|
||||
if ps["listen"]:
|
||||
_ensure_listener(bot)
|
||||
_ensure_flush_task(bot)
|
||||
Reference in New Issue
Block a user