app: replace regex html stripping with stdlib parser

Handles malformed tags, nested markup, and CDATA that the
naive regex missed.
This commit is contained in:
Username
2026-02-24 16:23:03 +01:00
parent be6574ae79
commit 8be475f23f

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import html
import logging
import time
from html.parser import HTMLParser
from textual import events, on, work
from textual.app import App, ComposeResult
@@ -991,9 +992,22 @@ class TuimbleApp(App):
self.exit()
class _HTMLStripper(HTMLParser):
"""Extract text content from HTML, discarding all tags."""
def __init__(self):
super().__init__()
self._parts: list[str] = []
def handle_data(self, data: str):
self._parts.append(data)
def get_text(self) -> str:
return "".join(self._parts)
def _strip_html(text: str) -> str:
    """Remove HTML tags and unescape entities from Mumble messages.

    Args:
        text: Message body, possibly containing HTML markup.

    Returns:
        The text content of ``text`` with all tags removed and
        character references decoded exactly once.
    """
    stripper = _HTMLStripper()
    stripper.feed(text)
    # Without close() the parser keeps trailing text buffered when it
    # might still be an incomplete character reference (e.g. "AT&T"),
    # and that text would be missing from the result.
    stripper.close()
    # The parser already decodes entities (convert_charrefs defaults to
    # True), so no html.unescape() here: a second pass would turn an
    # escaped literal such as "&amp;lt;" into "<" instead of "&lt;".
    return stripper.get_text()