app: replace regex html stripping with stdlib parser

Handles malformed tags, nested markup, and CDATA that the
naive regex missed.
This commit is contained in:
Username
2026-02-24 16:23:03 +01:00
parent be6574ae79
commit 8be475f23f

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import html import html
import logging import logging
import time import time
from html.parser import HTMLParser
from textual import events, on, work from textual import events, on, work
from textual.app import App, ComposeResult from textual.app import App, ComposeResult
@@ -991,9 +992,22 @@ class TuimbleApp(App):
self.exit() self.exit()
class _HTMLStripper(HTMLParser):
"""Extract text content from HTML, discarding all tags."""
def __init__(self):
super().__init__()
self._parts: list[str] = []
def handle_data(self, data: str):
self._parts.append(data)
def get_text(self) -> str:
return "".join(self._parts)
def _strip_html(text: str) -> str:
    """Remove HTML tags and unescape entities from Mumble messages.

    Args:
        text: Raw message markup.

    Returns:
        The text content with all tags dropped and character references
        decoded exactly once.
    """
    stripper = _HTMLStripper()
    stripper.feed(text)
    # Without close() the parser keeps trailing text that contains an
    # unterminated "&" (e.g. "AT&T") in its buffer and it would be lost.
    stripper.close()
    # The parser already decodes entities (convert_charrefs defaults to
    # True), so no html.unescape() here: a second pass would turn the
    # literal text "&amp;lt;" into "<" instead of "&lt;".
    return stripper.get_text()