feat: voice profiles, rubberband FX, per-bot plugin filtering

- Add rubberband package to container for pitch-shifting FX
- Split FX chain: rubberband CLI for pitch, ffmpeg for filters
- Configurable voice profile (voice, fx, piper params) in [voice]
- Extra bots inherit voice config (minus trigger) for own TTS
- Greeting is voice-only, spoken directly by the greeting bot
- Per-bot only_plugins/except_plugins filtering on Mumble
- Alias plugin, core plugin tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 11:41:00 +01:00
parent 3afeace6e7
commit e9d17e8b00
13 changed files with 1398 additions and 111 deletions
--- a/plugins/voice.py
+++ b/plugins/voice.py
@@ -54,6 +54,11 @@ def _ps(bot):
        "silence_gap": cfg.get("silence_gap", _SILENCE_GAP),
        "whisper_url": cfg.get("whisper_url", _WHISPER_URL),
        "piper_url": cfg.get("piper_url", _PIPER_URL),
+        "voice": cfg.get("voice", ""),
+        "length_scale": cfg.get("length_scale", 1.0),
+        "noise_scale": cfg.get("noise_scale", 0.667),
+        "noise_w": cfg.get("noise_w", 0.8),
+        "fx": cfg.get("fx", ""),
        "_listener_registered": False,
    })

@@ -210,14 +215,30 @@ def _fetch_tts(piper_url: str, text: str) -> str | None:


 async def _tts_play(bot, text: str):
-    """Fetch TTS audio and play it via stream_audio."""
+    """Fetch TTS audio and play it via stream_audio.
+
+    Uses the configured voice profile (voice, fx, piper params) when set,
+    otherwise falls back to Piper's default voice.
+    """
    from pathlib import Path

    ps = _ps(bot)
    loop = asyncio.get_running_loop()
-    wav_path = await loop.run_in_executor(
-        None, _fetch_tts, ps["piper_url"], text,
-    )
+    if ps["voice"] or ps["fx"]:
+        wav_path = await loop.run_in_executor(
+            None, lambda: _fetch_tts_voice(
+                ps["piper_url"], text,
+                voice=ps["voice"],
+                length_scale=ps["length_scale"],
+                noise_scale=ps["noise_scale"],
+                noise_w=ps["noise_w"],
+                fx=ps["fx"],
+            ),
+        )
+    else:
+        wav_path = await loop.run_in_executor(
+            None, _fetch_tts, ps["piper_url"], text,
+        )
    if wav_path is None:
        return
    try:
@@ -322,26 +343,228 @@ async def cmd_say(bot, message):
    bot._spawn(_tts_play(bot, text), name="voice-tts")


+def _split_fx(fx: str) -> tuple[list[str], str]:
+    """Split FX chain into rubberband CLI args and ffmpeg filter string.
+
+    Alpine's ffmpeg lacks librubberband, so pitch shifting is handled by
+    the ``rubberband`` CLI tool and remaining filters by ffmpeg.
+    """
+    import math
+    parts = fx.split(",")
+    rb_args: list[str] = []
+    ff_parts: list[str] = []
+    for part in parts:
+        if part.startswith("rubberband="):
+            opts: dict[str, str] = {}
+            for kv in part[len("rubberband="):].split(":"):
+                k, _, v = kv.partition("=")
+                opts[k] = v
+            if "pitch" in opts:
+                semitones = 12 * math.log2(float(opts["pitch"]))
+                rb_args += ["--pitch", f"{semitones:.2f}"]
+            if opts.get("formant") == "1":
+                rb_args.append("--formant")
+        else:
+            ff_parts.append(part)
+    return rb_args, ",".join(ff_parts)
+
+
+def _fetch_tts_voice(piper_url: str, text: str, *, voice: str = "",
+                     speaker_id: int = 0, length_scale: float = 1.0,
+                     noise_scale: float = 0.667, noise_w: float = 0.8,
+                     fx: str = "") -> str | None:
+    """Fetch TTS with explicit voice params and optional FX.  Blocking.
+
+    Pitch shifting uses the ``rubberband`` CLI (Alpine ffmpeg has no
+    librubberband); remaining audio filters go through ffmpeg.
+    """
+    import os
+    import subprocess
+    import tempfile
+    payload = {"text": text}
+    if voice:
+        payload["voice"] = voice
+    if speaker_id:
+        payload["speaker_id"] = speaker_id
+    payload["length_scale"] = length_scale
+    payload["noise_scale"] = noise_scale
+    payload["noise_w"] = noise_w
+    data = json.dumps(payload).encode()
+    req = urllib.request.Request(piper_url, data=data, method="POST")
+    req.add_header("Content-Type", "application/json")
+    resp = _urlopen(req, timeout=30, proxy=False)
+    wav_data = resp.read()
+    resp.close()
+    if not wav_data:
+        return None
+    tmp = tempfile.NamedTemporaryFile(suffix=".wav", prefix="derp_aud_", delete=False)
+    tmp.write(wav_data)
+    tmp.close()
+    if not fx:
+        return tmp.name
+
+    rb_args, ff_filters = _split_fx(fx)
+    current = tmp.name
+
+    # Pitch shift via rubberband CLI
+    if rb_args:
+        rb_out = tempfile.NamedTemporaryFile(
+            suffix=".wav", prefix="derp_aud_", delete=False,
+        )
+        rb_out.close()
+        r = subprocess.run(
+            ["rubberband"] + rb_args + [current, rb_out.name],
+            capture_output=True, timeout=15,
+        )
+        os.unlink(current)
+        if r.returncode != 0:
+            log.warning("voice: rubberband failed: %s", r.stderr[:200])
+            os.unlink(rb_out.name)
+            return None
+        current = rb_out.name
+
+    # Remaining filters via ffmpeg
+    if ff_filters:
+        ff_out = tempfile.NamedTemporaryFile(
+            suffix=".wav", prefix="derp_aud_", delete=False,
+        )
+        ff_out.close()
+        r = subprocess.run(
+            ["ffmpeg", "-y", "-i", current, "-af", ff_filters, ff_out.name],
+            capture_output=True, timeout=15,
+        )
+        os.unlink(current)
+        if r.returncode != 0:
+            log.warning("voice: ffmpeg failed: %s", r.stderr[:200])
+            os.unlink(ff_out.name)
+            return None
+        current = ff_out.name
+
+    return current
+
+
+@command("audition", help="Voice: !audition -- play voice samples", tier="admin")
+async def cmd_audition(bot, message):
+    """Play voice samples through Mumble for comparison."""
+    if not _is_mumble(bot):
+        return
+
+    ps = _ps(bot)
+    piper_url = ps["piper_url"]
+    phrase = "The sorcerer has arrived. I have seen things beyond your understanding."
+
+    # FX building blocks
+    _deep = "rubberband=pitch=0.87:formant=1"
+    _bass = "bass=g=6:f=110:w=0.6"
+    _bass_heavy = "equalizer=f=80:t=h:w=150:g=8"
+    _echo_subtle = "aecho=0.8:0.6:25|40:0.25|0.15"
+    _echo_chamber = "aecho=0.8:0.88:60:0.35"
+    _echo_cave = "aecho=0.8:0.7:40|70|100:0.3|0.2|0.1"
+
+    samples = [
+        # -- Base voices (no FX) for reference
+        ("ryan-high raw", "en_US-ryan-high", 0, ""),
+        ("lessac-high raw", "en_US-lessac-high", 0, ""),
+        # -- Deep pitch only
+        ("ryan deep", "en_US-ryan-high", 0,
+         _deep),
+        ("lessac deep", "en_US-lessac-high", 0,
+         _deep),
+        # -- Deep + bass boost
+        ("ryan deep+bass", "en_US-ryan-high", 0,
+         f"{_deep},{_bass}"),
+        ("lessac deep+bass", "en_US-lessac-high", 0,
+         f"{_deep},{_bass}"),
+        # -- Deep + heavy bass
+        ("ryan deep+heavy bass", "en_US-ryan-high", 0,
+         f"{_deep},{_bass_heavy}"),
+        # -- Deep + bass + subtle echo
+        ("ryan deep+bass+echo", "en_US-ryan-high", 0,
+         f"{_deep},{_bass},{_echo_subtle}"),
+        ("lessac deep+bass+echo", "en_US-lessac-high", 0,
+         f"{_deep},{_bass},{_echo_subtle}"),
+        # -- Deep + bass + chamber reverb
+        ("ryan deep+bass+chamber", "en_US-ryan-high", 0,
+         f"{_deep},{_bass},{_echo_chamber}"),
+        ("lessac deep+bass+chamber", "en_US-lessac-high", 0,
+         f"{_deep},{_bass},{_echo_chamber}"),
+        # -- Deep + heavy bass + cave reverb
+        ("ryan deep+heavybass+cave", "en_US-ryan-high", 0,
+         f"{_deep},{_bass_heavy},{_echo_cave}"),
+        # -- Libritts best candidates with full sorcerer chain
+        ("libritts #20 deep+bass+echo", "en_US-libritts_r-medium", 20,
+         f"{_deep},{_bass},{_echo_subtle}"),
+        ("libritts #22 deep+bass+echo", "en_US-libritts_r-medium", 22,
+         f"{_deep},{_bass},{_echo_subtle}"),
+        ("libritts #79 deep+bass+chamber", "en_US-libritts_r-medium", 79,
+         f"{_deep},{_bass},{_echo_chamber}"),
+    ]
+
+    # Find merlin (the listener bot) -- plays the audition samples
+    merlin = None
+    for peer in getattr(bot.registry, "_bots", {}).values():
+        if getattr(peer, "_receive_sound", False):
+            merlin = peer
+            break
+
+    await bot.reply(message, f"Auditioning {len(samples)} voice samples...")
+    loop = asyncio.get_running_loop()
+    from pathlib import Path
+
+    # Pre-generate derp's default voice (same phrase, no FX)
+    derp_wav = await loop.run_in_executor(
+        None, lambda: _fetch_tts_voice(piper_url, phrase),
+    )
+
+    for i, (label, voice, sid, fx) in enumerate(samples, 1):
+        announcer = merlin or bot
+        await announcer.send("0", f"[{i}/{len(samples)}] {label}")
+        await asyncio.sleep(1)
+        # Generate the audition sample (merlin's candidate voice)
+        sample_wav = await loop.run_in_executor(
+            None, lambda v=voice, s=sid, f=fx: _fetch_tts_voice(
+                piper_url, phrase, voice=v, speaker_id=s,
+                length_scale=1.15, noise_scale=0.4, noise_w=0.5, fx=f,
+            ),
+        )
+        if sample_wav is None:
+            await bot.send("0", "  (failed)")
+            continue
+        try:
+            # Both bots speak simultaneously:
+            # merlin plays the audition sample, derp plays its default voice
+            merlin_done = asyncio.Event()
+            derp_done = asyncio.Event()
+            if merlin:
+                merlin_task = asyncio.create_task(
+                    merlin.stream_audio(sample_wav, volume=1.0,
+                                        on_done=merlin_done))
+                derp_task = asyncio.create_task(
+                    bot.stream_audio(derp_wav, volume=1.0,
+                                     on_done=derp_done))
+                await asyncio.gather(merlin_task, derp_task)
+            else:
+                await bot.stream_audio(sample_wav, volume=1.0,
+                                       on_done=merlin_done)
+                await merlin_done.wait()
+        finally:
+            Path(sample_wav).unlink(missing_ok=True)
+        await asyncio.sleep(2)
+
+    if derp_wav:
+        Path(derp_wav).unlink(missing_ok=True)
+    announcer = merlin or bot
+    await announcer.send("0", "Audition complete.")
+
+
 # -- Plugin lifecycle --------------------------------------------------------


 async def on_connected(bot) -> None:
-    """Re-register listener after reconnect; play TTS greeting on first join."""
+    """Re-register listener after reconnect."""
    if not _is_mumble(bot):
        return
    ps = _ps(bot)
-
-    # TTS greeting on first connect
-    greet = bot.config.get("mumble", {}).get("greet")
-    if greet and not ps.get("_greeted"):
-        ps["_greeted"] = True
-        # Wait for audio subsystem to be ready
-        for _ in range(20):
-            if bot._is_audio_ready():
-                break
-            await asyncio.sleep(0.5)
-        bot._spawn(_tts_play(bot, greet), name="voice-greet")
-
    if ps["listen"] or ps["trigger"]:
        _ensure_listener(bot)
        _ensure_flush_task(bot)