style: normalize indentation and improve code style
- convert tabs to 4-space indentation
- add docstrings to modules and classes
- remove unused import (copy)
- use explicit object inheritance
- use 'while True' over 'while 1'
- use 'while args' over 'while len(args)'
- use '{}' over 'dict()'
- consistent string formatting
- Python 2/3 compatible Queue import
This commit is contained in:
126
soup_parser.py
126
soup_parser.py
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python2
|
||||
# HTML parsing with optional BeautifulSoup or stdlib fallback
|
||||
# -*- coding: utf-8 -*-
|
||||
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
import sys
|
||||
@@ -8,87 +9,98 @@ _bs4_available = False
|
||||
_use_bs4 = True
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup, FeatureNotFound
|
||||
_bs4_available = True
|
||||
from bs4 import BeautifulSoup, FeatureNotFound
|
||||
_bs4_available = True
|
||||
except ImportError:
|
||||
_bs4_available = False
|
||||
_bs4_available = False
|
||||
|
||||
|
||||
class Tag(object):
    """Minimal stand-in for a bs4 Tag, used by the stdlib fallback parser."""

    def __init__(self, name, attrs):
        # HTMLParser hands attrs over as a list of (key, value) pairs.
        self.name = name
        self.attrs = dict(attrs)

    def __getitem__(self, key):
        # bs4-style subscripting: a missing attribute yields None, never KeyError.
        return self.get(key)

    def get(self, key, default=None):
        """Return the attribute value for *key*, or *default* when absent."""
        return self.attrs.get(key, default)
|
||||
|
||||
|
||||
class SoupResult(object):
    """BeautifulSoup-like wrapper over the tags collected by the stdlib parser."""

    def __init__(self, tags):
        self._tags = tags
        # Mimic soup.body: this object stands in for the whole document body.
        self.body = self

    def find_all(self, tag_name, **kwargs):
        """Return all tags named *tag_name* that satisfy the href filter.

        ``href=True`` requires the attribute to be present; any other href
        value must compare equal to the tag's href attribute.
        """
        def _matches(tag):
            if tag.name != tag_name:
                return False
            if 'href' in kwargs:
                wanted = kwargs['href']
                if wanted is True:
                    return 'href' in tag.attrs
                return tag.attrs.get('href') == wanted
            return True

        return [tag for tag in self._tags if _matches(tag)]
|
||||
|
||||
|
||||
class LinkExtractor(HTMLParser):
    """Collect every opening (and self-closing) tag seen in an HTML stream."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Tags are recorded in document order as they are encountered.
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(Tag(tag, attrs))

    # Self-closing tags (e.g. <br/>) are recorded exactly like opening tags.
    handle_startendtag = handle_starttag
|
||||
|
||||
|
||||
def _parse_stdlib(html):
    """Parse *html* with the stdlib HTMLParser and wrap it in a SoupResult."""
    extractor = LinkExtractor()
    try:
        extractor.feed(html)
    except Exception:
        # Malformed markup: deliberately best-effort — keep whatever tags
        # were collected before the parser gave up.
        pass
    return SoupResult(extractor.tags)
|
||||
|
||||
|
||||
def _parse_bs4(html):
    """Parse *html* with BeautifulSoup, preferring the faster lxml backend.

    Falls back to the built-in 'html.parser' backend when lxml is missing
    (bs4 raises FeatureNotFound) or when lxml fails for any other reason.
    """
    try:
        return BeautifulSoup(html, 'lxml')
    except Exception:
        # FeatureNotFound subclasses Exception, so the original
        # `except (FeatureNotFound, Exception)` tuple was redundant; a single
        # broad catch preserves the intended catch-anything fallback.
        return BeautifulSoup(html, 'html.parser')
|
||||
|
||||
|
||||
def set_nobs(enabled):
    """Toggle the --nobs flag: when *enabled*, force the stdlib parser.

    Emits an informational note on stderr when the stdlib fallback will be
    used, either by request or because bs4 is not importable.
    """
    global _use_bs4
    _use_bs4 = not enabled

    note = None
    if enabled and _bs4_available:
        note = 'info: --nobs: using stdlib HTMLParser\n'
    elif not _bs4_available:
        note = 'info: bs4 not available, using stdlib HTMLParser\n'
    if note is not None:
        sys.stderr.write(note)
|
||||
|
||||
|
||||
def soupify(html, nohtml=False):
    """Parse *html* and return a BeautifulSoup-like document object.

    Unless *nohtml* is set, the fragment is first wrapped in a minimal
    <html><body>...</body></html> skeleton so partial markup parses cleanly.
    """
    if nohtml:
        markup = html
    else:
        markup = '<html><body>%s</body></html>' % html
    if _use_bs4 and _bs4_available:
        return _parse_bs4(markup)
    return _parse_stdlib(markup)
|
||||
|
||||
|
||||
def is_available():
    """Report whether the BeautifulSoup (bs4) backend can be used."""
    return _bs4_available
|
||||
|
||||
Reference in New Issue
Block a user