soup_parser: add stdlib HTMLParser fallback
This commit is contained in:
109
soup_parser.py
109
soup_parser.py
@@ -1,8 +1,105 @@
|
||||
from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
|
||||
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
import sys
|
||||
|
||||
# Track if bs4 is available
|
||||
_bs4_available = False
|
||||
_use_bs4 = True
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup, FeatureNotFound
|
||||
_bs4_available = True
|
||||
except ImportError:
|
||||
_bs4_available = False
|
||||
|
||||
|
||||
class Tag(object):
    """Minimal BeautifulSoup Tag interface.

    Carries a tag name plus its attributes as a plain dict; missing
    attributes read as None rather than raising.
    """

    def __init__(self, name, attrs):
        self.name = name
        # HTMLParser hands attrs over as (name, value) pairs; keep a dict.
        self.attrs = dict(attrs)

    def __getitem__(self, key):
        # Subscript access is just attribute lookup with a None default.
        return self.get(key)

    def get(self, key, default=None):
        """Return the attribute value for *key*, or *default* if absent."""
        return self.attrs.get(key, default)
|
||||
|
||||
|
||||
class SoupResult(object):
    """Minimal BeautifulSoup result interface.

    Wraps the flat tag list produced by the stdlib parser and mimics the
    small subset of the BeautifulSoup API this module's callers use.
    """

    def __init__(self, tags):
        # Flat list of Tag objects in document order.
        self._tags = tags
        # BeautifulSoup exposes soup.body; a self-reference keeps
        # soup.body.find_all(...) working against this flat list.
        self.body = self

    def find_all(self, tag_name, **kwargs):
        """Find all tags named *tag_name* matching the attribute filters.

        Keyword filters follow BeautifulSoup semantics for any attribute
        (not just href): attr=True keeps tags that have the attribute,
        attr=False keeps tags that lack it, and any other value must
        compare equal to the attribute's value.
        """
        results = []
        for tag in self._tags:
            if tag.name != tag_name:
                continue
            if all(self._matches(tag, attr, want)
                   for attr, want in kwargs.items()):
                results.append(tag)
        return results

    @staticmethod
    def _matches(tag, attr, want):
        """Return True when *tag* satisfies a single attribute filter."""
        if want is True:
            # attr=True means "has this attribute", per bs4.
            return attr in tag.attrs
        if want is False:
            # attr=False means "lacks this attribute", per bs4.
            return attr not in tag.attrs
        return tag.attrs.get(attr) == want
|
||||
|
||||
|
||||
class LinkExtractor(HTMLParser):
    """HTMLParser-based link extractor.

    Records every start tag it sees (including self-closing ones) as a
    Tag so the stdlib fallback can answer find_all() queries.
    """

    def __init__(self):
        # Old-style base class on Python 2, so no super() here.
        HTMLParser.__init__(self)
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(Tag(tag, attrs))

    # Self-closing tags (<br/>) are recorded exactly like open tags.
    handle_startendtag = handle_starttag
|
||||
|
||||
|
||||
def _parse_stdlib(html):
    """Parse HTML using stdlib HTMLParser.

    Best-effort: parse errors on malformed HTML are swallowed and
    whatever tags were collected before the failure are returned.
    """
    parser = LinkExtractor()
    try:
        parser.feed(html)
    # Narrowed from a bare except: don't trap SystemExit/KeyboardInterrupt.
    except Exception:
        pass  # tolerate malformed HTML
    return SoupResult(parser.tags)
|
||||
|
||||
|
||||
def _parse_bs4(html):
    """Parse HTML using BeautifulSoup.

    Prefers the faster lxml backend, falling back to the bundled
    html.parser when lxml is not installed (FeatureNotFound). Other
    errors propagate to the caller; the previous
    `except (FeatureNotFound, Exception)` was redundant (Exception
    already subsumes FeatureNotFound) and hid real parsing failures.
    """
    try:
        return BeautifulSoup(html, 'lxml')
    except FeatureNotFound:
        return BeautifulSoup(html, 'html.parser')
|
||||
|
||||
|
||||
def set_nobs(enabled):
    """Disable BeautifulSoup usage."""
    global _use_bs4
    _use_bs4 = not enabled
    # Decide on a notice first, then emit it once.
    notice = None
    if enabled and _bs4_available:
        notice = 'info: --nobs: using stdlib HTMLParser\n'
    elif not _bs4_available:
        notice = 'info: bs4 not available, using stdlib HTMLParser\n'
    if notice is not None:
        sys.stderr.write(notice)
|
||||
|
||||
|
||||
def soupify(html, nohtml=False):
    """Parse HTML and return soup-like object.

    Wraps *html* in an <html><body> shell unless *nohtml* is set, then
    dispatches to BeautifulSoup when it is available and enabled, or to
    the stdlib HTMLParser fallback otherwise.

    Note: the previous body contained a stale pre-merge copy of the old
    bs4-only implementation above the dispatch logic, which returned
    early and made the fallback path unreachable; it has been removed.
    """
    htm = html if nohtml else '<html><body>%s</body></html>' % (html)

    if _use_bs4 and _bs4_available:
        return _parse_bs4(htm)
    else:
        return _parse_stdlib(htm)
|
||||
|
||||
|
||||
def is_available():
    """Report whether the bs4 package could be imported at load time."""
    return _bs4_available
|
||||
|
||||
Reference in New Issue
Block a user