app: replace regex html stripping with stdlib parser

Handles malformed tags, nested markup, and CDATA that the naive regex missed.
2026-02-24 16:23:03 +01:00
parent be6574ae79
commit 8be475f23f
1 changed files with 18 additions and 4 deletions
--- a/src/tuimble/app.py
+++ b/src/tuimble/app.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import html
 import logging
 import time
+from html.parser import HTMLParser

 from textual import events, on, work
 from textual.app import App, ComposeResult
@@ -991,9 +992,22 @@ class TuimbleApp(App):
        self.exit()


+class _HTMLStripper(HTMLParser):
+    """Collect the text content of an HTML fragment and drop every tag.
+
+    Feed markup with ``feed()``, then read the accumulated text back
+    with ``get_text()``.
+    """
+
+    def __init__(self):
+        # Base-class defaults are kept: HTMLParser's convert_charrefs
+        # defaults to True, so handle_data() receives text whose character
+        # references have already been decoded.
+        super().__init__()
+        self._chunks: list[str] = []
+
+    def handle_data(self, data: str):
+        # Parser callback for text content; tags never reach this method.
+        self._chunks.append(data)
+
+    def get_text(self) -> str:
+        """Return every piece of text seen so far, joined in input order."""
+        return "".join(self._chunks)
+
+
 def _strip_html(text: str) -> str:
     """Remove HTML tags and unescape entities from Mumble messages."""
-    import re
-
-    clean = re.sub(r"<[^>]+>", "", text)
-    return html.unescape(clean)
+    stripper = _HTMLStripper()
+    stripper.feed(text)
+    # feed() may hold back trailing text (e.g. "AT&T") while it waits for
+    # more input; close() flushes it so the end of the message is kept.
+    stripper.close()
+    # HTMLParser decodes character references by default, so the text is
+    # already unescaped once. A second html.unescape() here would turn
+    # "&amp;lt;" into "<" instead of the intended "&lt;".
+    return stripper.get_text()