feat: voice profiles, rubberband FX, per-bot plugin filtering

- Add rubberband package to container for pitch-shifting FX
- Split FX chain: rubberband CLI for pitch, ffmpeg for filters
- Configurable voice profile (voice, fx, piper params) in [voice]
- Extra bots inherit voice config (minus trigger) for own TTS
- Greeting is voice-only, spoken directly by the greeting bot
- Per-bot only_plugins/except_plugins filtering on Mumble
- Alias plugin, core plugin tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-22 11:41:00 +01:00
parent 3afeace6e7
commit e9d17e8b00
13 changed files with 1398 additions and 111 deletions

View File

@@ -54,6 +54,11 @@ def _ps(bot):
"silence_gap": cfg.get("silence_gap", _SILENCE_GAP),
"whisper_url": cfg.get("whisper_url", _WHISPER_URL),
"piper_url": cfg.get("piper_url", _PIPER_URL),
"voice": cfg.get("voice", ""),
"length_scale": cfg.get("length_scale", 1.0),
"noise_scale": cfg.get("noise_scale", 0.667),
"noise_w": cfg.get("noise_w", 0.8),
"fx": cfg.get("fx", ""),
"_listener_registered": False,
})
@@ -210,14 +215,30 @@ def _fetch_tts(piper_url: str, text: str) -> str | None:
async def _tts_play(bot, text: str):
"""Fetch TTS audio and play it via stream_audio."""
"""Fetch TTS audio and play it via stream_audio.
Uses the configured voice profile (voice, fx, piper params) when set,
otherwise falls back to Piper's default voice.
"""
from pathlib import Path
ps = _ps(bot)
loop = asyncio.get_running_loop()
wav_path = await loop.run_in_executor(
None, _fetch_tts, ps["piper_url"], text,
)
if ps["voice"] or ps["fx"]:
wav_path = await loop.run_in_executor(
None, lambda: _fetch_tts_voice(
ps["piper_url"], text,
voice=ps["voice"],
length_scale=ps["length_scale"],
noise_scale=ps["noise_scale"],
noise_w=ps["noise_w"],
fx=ps["fx"],
),
)
else:
wav_path = await loop.run_in_executor(
None, _fetch_tts, ps["piper_url"], text,
)
if wav_path is None:
return
try:
@@ -322,26 +343,228 @@ async def cmd_say(bot, message):
bot._spawn(_tts_play(bot, text), name="voice-tts")
def _split_fx(fx: str) -> tuple[list[str], str]:
"""Split FX chain into rubberband CLI args and ffmpeg filter string.
Alpine's ffmpeg lacks librubberband, so pitch shifting is handled by
the ``rubberband`` CLI tool and remaining filters by ffmpeg.
"""
import math
parts = fx.split(",")
rb_args: list[str] = []
ff_parts: list[str] = []
for part in parts:
if part.startswith("rubberband="):
opts: dict[str, str] = {}
for kv in part[len("rubberband="):].split(":"):
k, _, v = kv.partition("=")
opts[k] = v
if "pitch" in opts:
semitones = 12 * math.log2(float(opts["pitch"]))
rb_args += ["--pitch", f"{semitones:.2f}"]
if opts.get("formant") == "1":
rb_args.append("--formant")
else:
ff_parts.append(part)
return rb_args, ",".join(ff_parts)
def _run_fx_stage(label: str, argv: list[str], src: str) -> str | None:
    """Run one FX tool over *src*, writing into a fresh temp WAV. Blocking.

    ``argv`` is the full command line minus the output path, which is
    appended as the last argument. *src* is always deleted afterwards; the
    output file is deleted too unless the tool exited with status 0.

    Returns the output path on success, ``None`` on any failure (non-zero
    exit, timeout, or the tool not being runnable).
    """
    import os
    import subprocess
    import tempfile

    out = tempfile.NamedTemporaryFile(
        suffix=".wav", prefix="derp_aud_", delete=False,
    )
    out.close()
    ok = False
    try:
        r = subprocess.run(
            argv + [out.name], capture_output=True, timeout=15,
        )
        ok = r.returncode == 0
        if not ok:
            log.warning("voice: %s failed: %s", label, r.stderr[:200])
    except (OSError, subprocess.TimeoutExpired) as exc:
        # Missing binary or a hung tool: treat like any other stage failure
        # instead of leaking the temp files through an exception.
        log.warning("voice: %s failed: %s", label, exc)
    finally:
        os.unlink(src)
        if not ok:
            os.unlink(out.name)
    return out.name if ok else None


def _fetch_tts_voice(piper_url: str, text: str, *, voice: str = "",
                     speaker_id: int = 0, length_scale: float = 1.0,
                     noise_scale: float = 0.667, noise_w: float = 0.8,
                     fx: str = "") -> str | None:
    """Fetch TTS with explicit voice params and optional FX. Blocking.

    Pitch shifting uses the ``rubberband`` CLI (Alpine ffmpeg has no
    librubberband); remaining audio filters go through ffmpeg.

    Args:
        piper_url: URL the JSON synthesis request is POSTed to.
        text: Text to synthesize.
        voice: Voice name; omitted from the request when empty.
        speaker_id: Speaker index; omitted from the request when 0.
        length_scale: Sent as ``length_scale`` in the request.
        noise_scale: Sent as ``noise_scale`` in the request.
        noise_w: Sent as ``noise_w`` in the request.
        fx: Comma-separated FX chain (see ``_split_fx``); empty for none.

    Returns:
        Path of a temporary WAV file the caller must delete, or ``None``
        when the server returned no data or an FX stage failed.

    Raises:
        ValueError: If *fx* contains an invalid rubberband pitch value.
    """
    import tempfile

    # Parse the FX chain first so a malformed chain fails before any
    # network round-trip and before a temp file exists to leak.
    rb_args, ff_filters = _split_fx(fx) if fx else ([], "")

    payload: dict[str, object] = {"text": text}
    if voice:
        payload["voice"] = voice
    if speaker_id:
        payload["speaker_id"] = speaker_id
    payload["length_scale"] = length_scale
    payload["noise_scale"] = noise_scale
    payload["noise_w"] = noise_w
    data = json.dumps(payload).encode()
    req = urllib.request.Request(piper_url, data=data, method="POST")
    req.add_header("Content-Type", "application/json")
    resp = _urlopen(req, timeout=30, proxy=False)
    try:
        wav_data = resp.read()
    finally:
        # Close the response even when the read fails midway.
        resp.close()
    if not wav_data:
        return None

    with tempfile.NamedTemporaryFile(
        suffix=".wav", prefix="derp_aud_", delete=False,
    ) as tmp:
        tmp.write(wav_data)
    current: str | None = tmp.name

    # Pitch shift via rubberband CLI
    if rb_args:
        current = _run_fx_stage(
            "rubberband", ["rubberband", *rb_args, current], current,
        )
        if current is None:
            return None
    # Remaining filters via ffmpeg
    if ff_filters:
        current = _run_fx_stage(
            "ffmpeg", ["ffmpeg", "-y", "-i", current, "-af", ff_filters],
            current,
        )
    return current
@command("audition", help="Voice: !audition -- play voice samples", tier="admin")
async def cmd_audition(bot, message):
    """Play voice samples through Mumble for comparison.

    Does nothing unless *bot* is a Mumble bot. For each candidate
    (voice, speaker id, FX chain) the sample is announced, synthesized in
    the default executor, streamed, and its temp file removed. When a
    listener bot ("merlin") is found in the registry it plays the
    candidate while this bot simultaneously plays its default voice;
    otherwise this bot plays the candidate alone.
    """
    from pathlib import Path

    if not _is_mumble(bot):
        return
    ps = _ps(bot)
    piper_url = ps["piper_url"]
    phrase = "The sorcerer has arrived. I have seen things beyond your understanding."

    # FX building blocks
    _deep = "rubberband=pitch=0.87:formant=1"
    _bass = "bass=g=6:f=110:w=0.6"
    _bass_heavy = "equalizer=f=80:t=h:w=150:g=8"
    _echo_subtle = "aecho=0.8:0.6:25|40:0.25|0.15"
    _echo_chamber = "aecho=0.8:0.88:60:0.35"
    _echo_cave = "aecho=0.8:0.7:40|70|100:0.3|0.2|0.1"

    # Each entry: (label, voice, speaker_id, fx chain)
    samples = [
        # -- Base voices (no FX) for reference
        ("ryan-high raw", "en_US-ryan-high", 0, ""),
        ("lessac-high raw", "en_US-lessac-high", 0, ""),
        # -- Deep pitch only
        ("ryan deep", "en_US-ryan-high", 0,
         _deep),
        ("lessac deep", "en_US-lessac-high", 0,
         _deep),
        # -- Deep + bass boost
        ("ryan deep+bass", "en_US-ryan-high", 0,
         f"{_deep},{_bass}"),
        ("lessac deep+bass", "en_US-lessac-high", 0,
         f"{_deep},{_bass}"),
        # -- Deep + heavy bass
        ("ryan deep+heavy bass", "en_US-ryan-high", 0,
         f"{_deep},{_bass_heavy}"),
        # -- Deep + bass + subtle echo
        ("ryan deep+bass+echo", "en_US-ryan-high", 0,
         f"{_deep},{_bass},{_echo_subtle}"),
        ("lessac deep+bass+echo", "en_US-lessac-high", 0,
         f"{_deep},{_bass},{_echo_subtle}"),
        # -- Deep + bass + chamber reverb
        ("ryan deep+bass+chamber", "en_US-ryan-high", 0,
         f"{_deep},{_bass},{_echo_chamber}"),
        ("lessac deep+bass+chamber", "en_US-lessac-high", 0,
         f"{_deep},{_bass},{_echo_chamber}"),
        # -- Deep + heavy bass + cave reverb
        ("ryan deep+heavybass+cave", "en_US-ryan-high", 0,
         f"{_deep},{_bass_heavy},{_echo_cave}"),
        # -- Libritts best candidates with full sorcerer chain
        ("libritts #20 deep+bass+echo", "en_US-libritts_r-medium", 20,
         f"{_deep},{_bass},{_echo_subtle}"),
        ("libritts #22 deep+bass+echo", "en_US-libritts_r-medium", 22,
         f"{_deep},{_bass},{_echo_subtle}"),
        ("libritts #79 deep+bass+chamber", "en_US-libritts_r-medium", 79,
         f"{_deep},{_bass},{_echo_chamber}"),
    ]

    # Find merlin (the listener bot) -- plays the audition samples
    merlin = None
    for peer in getattr(bot.registry, "_bots", {}).values():
        if getattr(peer, "_receive_sound", False):
            merlin = peer
            break
    announcer = merlin or bot

    await bot.reply(message, f"Auditioning {len(samples)} voice samples...")
    loop = asyncio.get_running_loop()

    # Pre-generate derp's default voice (same phrase, no FX).
    # May be None when synthesis returns nothing; handled below.
    derp_wav = await loop.run_in_executor(
        None, lambda: _fetch_tts_voice(piper_url, phrase),
    )
    try:
        for i, (label, voice, sid, fx) in enumerate(samples, 1):
            await announcer.send("0", f"[{i}/{len(samples)}] {label}")
            await asyncio.sleep(1)

            # Generate the audition sample (merlin's candidate voice).
            # Loop variables are bound as lambda defaults so the executor
            # thread sees this iteration's values.
            sample_wav = await loop.run_in_executor(
                None, lambda v=voice, s=sid, f=fx: _fetch_tts_voice(
                    piper_url, phrase, voice=v, speaker_id=s,
                    length_scale=1.15, noise_scale=0.4, noise_w=0.5, fx=f,
                ),
            )
            if sample_wav is None:
                await bot.send("0", " (failed)")
                continue

            try:
                merlin_done = asyncio.Event()
                derp_done = asyncio.Event()
                if merlin:
                    # Both bots speak simultaneously: merlin plays the
                    # audition sample, derp plays its default voice.
                    streams = [
                        merlin.stream_audio(sample_wav, volume=1.0,
                                            on_done=merlin_done),
                    ]
                    if derp_wav:
                        # Skip derp's half when its reference clip could
                        # not be generated rather than streaming None.
                        streams.append(
                            bot.stream_audio(derp_wav, volume=1.0,
                                             on_done=derp_done))
                    await asyncio.gather(*streams)
                else:
                    await bot.stream_audio(sample_wav, volume=1.0,
                                           on_done=merlin_done)
                    await merlin_done.wait()
            finally:
                Path(sample_wav).unlink(missing_ok=True)
            await asyncio.sleep(2)
    finally:
        # Remove the reference clip even if a sample raised mid-audition.
        if derp_wav:
            Path(derp_wav).unlink(missing_ok=True)

    await announcer.send("0", "Audition complete.")
# -- Plugin lifecycle --------------------------------------------------------
async def on_connected(bot) -> None:
"""Re-register listener after reconnect; play TTS greeting on first join."""
"""Re-register listener after reconnect."""
if not _is_mumble(bot):
return
ps = _ps(bot)
# TTS greeting on first connect
greet = bot.config.get("mumble", {}).get("greet")
if greet and not ps.get("_greeted"):
ps["_greeted"] = True
# Wait for audio subsystem to be ready
for _ in range(20):
if bot._is_audio_ready():
break
await asyncio.sleep(0.5)
bot._spawn(_tts_play(bot, greet), name="voice-greet")
if ps["listen"] or ps["trigger"]:
_ensure_listener(bot)
_ensure_flush_task(bot)