# ppf/soup_parser.py

"""HTML parsing with optional BeautifulSoup or stdlib fallback."""

from HTMLParser import HTMLParser
import sys

# Backend selection state.
# _use_bs4: preference flag, on by default; set_nobs() rewrites it.
# _bs4_available: True only when the optional bs4 package imported cleanly.
_use_bs4 = True

try:
	from bs4 import BeautifulSoup, FeatureNotFound
except ImportError:
	# bs4 is optional; record that only the stdlib parser can be used.
	_bs4_available = False
else:
	_bs4_available = True


class Tag(object):
	"""Lightweight stand-in for a BeautifulSoup tag.

	Stores the element name plus its attributes as a dict. Attribute
	lookups never raise: a missing attribute yields None (or the default
	passed to get()).
	"""

	def __init__(self, name, attrs):
		# attrs may be anything dict() accepts, e.g. (key, value) pairs.
		self.attrs = dict(attrs)
		self.name = name

	def get(self, key, default=None):
		"""Return the value of attribute key, or default when absent."""
		return self.attrs.get(key, default)

	def __getitem__(self, key):
		"""Return the value of attribute key, or None when absent."""
		return self.attrs.get(key)


class SoupResult(object):
	"""Soup-like wrapper around a flat list of parsed tags."""

	def __init__(self, tags):
		# Aliasing body to the result itself lets soup.body.find_all() work.
		self.body = self
		self._tags = tags

	def find_all(self, tag_name, **kwargs):
		"""Return the stored tags named tag_name, in stored order.

		Only the ``href`` keyword is examined; any other keyword is ignored.
		href=True keeps tags that have an href attribute at all. Any other
		value keeps tags whose href equals that value (a missing href
		compares as None).
		"""
		filter_href = 'href' in kwargs
		wanted = kwargs.get('href')

		def _accept(tag):
			if tag.name != tag_name:
				return False
			if not filter_href:
				return True
			if wanted is True:
				return 'href' in tag.attrs
			if tag.attrs.get('href') != wanted:
				return False
			return True

		return [tag for tag in self._tags if _accept(tag)]


class LinkExtractor(HTMLParser):
	"""HTMLParser subclass that records each opening tag as a Tag.

	Every element is recorded, not only links; both ordinary start tags
	and self-closing tags end up in ``tags`` in the order they are seen.
	"""

	def __init__(self):
		HTMLParser.__init__(self)
		# Collected Tag objects, in document order.
		self.tags = []

	def _record(self, tag, attrs):
		# Shared by both handlers: wrap the element and remember it.
		self.tags.append(Tag(tag, attrs))

	def handle_starttag(self, tag, attrs):
		self._record(tag, attrs)

	def handle_startendtag(self, tag, attrs):
		self._record(tag, attrs)


def _parse_stdlib(html):
	"""Parse HTML with the stdlib HTMLParser and return a SoupResult.

	Parsing is best-effort: if the parser raises part-way through, the
	tags collected up to that point are still returned.
	"""
	parser = LinkExtractor()
	try:
		parser.feed(html)
		# Flush whatever feed() left buffered while waiting for more input.
		parser.close()
	except Exception:
		# Tolerate malformed HTML, but let KeyboardInterrupt/SystemExit
		# propagate rather than swallowing them as a bare except would.
		pass
	return SoupResult(parser.tags)


def _parse_bs4(html):
	"""Build a BeautifulSoup tree, preferring the 'lxml' builder.

	If constructing the soup with 'lxml' raises FeatureNotFound or any
	other Exception, the parse is retried with 'html.parser'; an error
	from that second attempt propagates to the caller.
	"""
	try:
		soup = BeautifulSoup(html, 'lxml')
	except (FeatureNotFound, Exception):
		soup = BeautifulSoup(html, 'html.parser')
	return soup


def set_nobs(enabled):
	"""Turn the "no BeautifulSoup" mode on or off.

	Stores the inverse of enabled in the module-level _use_bs4 flag and
	writes one informational line to stderr when the stdlib parser will
	be used: either because bs4 is missing (reported regardless of
	enabled) or because the mode was switched on while bs4 is installed.
	"""
	global _use_bs4
	_use_bs4 = not enabled

	if not _bs4_available:
		note = 'info: bs4 not available, using stdlib HTMLParser\n'
	elif enabled:
		note = 'info: --nobs: using stdlib HTMLParser\n'
	else:
		# bs4 is installed and stays in use: nothing to report.
		return
	sys.stderr.write(note)


def soupify(html, nohtml=False):
	"""Parse html and return a soup-like object.

	Unless nohtml is true, the input is first wrapped in
	<html><body>...</body></html>. BeautifulSoup is used when it is both
	enabled and importable; otherwise the stdlib-based parser is used.
	"""
	if nohtml:
		markup = html
	else:
		markup = '<html><body>%s</body></html>' % html

	if not (_use_bs4 and _bs4_available):
		return _parse_stdlib(markup)
	return _parse_bs4(markup)


def is_available():
	"""Report whether the bs4 import at module load succeeded.

	This reflects importability only; it does not consider the _use_bs4
	preference flag.
	"""
	available = _bs4_available
	return available