app: replace regex html stripping with stdlib parser

Handles malformed tags, nested markup, and CDATA that the naive regex missed.
2026-02-24 16:23:03 +01:00
parent be6574ae79
commit 8be475f23f
1 changed file with 18 additions and 4 deletions
@@ -5,6 +5,7 @@ from __future__ import annotations
 import html
 import logging
 import time
 from html.parser import HTMLParser
 from textual import events, on, work
 from textual.app import App, ComposeResult
@@ -991,9 +992,22 @@ class TuimbleApp(App):
        self.exit()
 class _HTMLStripper(HTMLParser):
    """Extract text content from HTML, discarding all tags."""
    def __init__(self):
        super().__init__()
        self._parts: list[str] = []
    def handle_data(self, data: str):
        self._parts.append(data)
    def get_text(self) -> str:
        return "".join(self._parts)
 def _strip_html(text: str) -> str:
    """Remove HTML tags and unescape entities from Mumble messages.

    Args:
        text: Raw message markup.

    Returns:
        The text content of ``text`` with all tags removed and character
        references decoded exactly once.
    """
    stripper = _HTMLStripper()
    stripper.feed(text)
    # close() flushes text the parser is still buffering, e.g. a trailing
    # "AT&T" that looks like the start of an unfinished character reference
    # and would otherwise be dropped from the result.
    stripper.close()
    # HTMLParser decodes character references itself (convert_charrefs is
    # True by default), so running html.unescape() on the result would decode
    # a second time and turn a literal "&amp;lt;" into "<".
    return stripper.get_text()