"""HTML parsing with optional BeautifulSoup or stdlib fallback.""" from HTMLParser import HTMLParser import sys # Track if bs4 is available _bs4_available = False _use_bs4 = True try: from bs4 import BeautifulSoup, FeatureNotFound _bs4_available = True except ImportError: _bs4_available = False class Tag(object): """Minimal BeautifulSoup Tag interface.""" def __init__(self, name, attrs): self.name = name self.attrs = dict(attrs) def __getitem__(self, key): return self.attrs.get(key) def get(self, key, default=None): return self.attrs.get(key, default) class SoupResult(object): """Minimal BeautifulSoup result interface.""" def __init__(self, tags): self._tags = tags self.body = self # self-reference for soup.body.find_all() def find_all(self, tag_name, **kwargs): """Find all tags matching criteria.""" results = [] for tag in self._tags: if tag.name != tag_name: continue # check href=True means "has href attribute" if 'href' in kwargs: if kwargs['href'] is True and 'href' not in tag.attrs: continue elif kwargs['href'] is not True and tag.attrs.get('href') != kwargs['href']: continue results.append(tag) return results class LinkExtractor(HTMLParser): """HTMLParser-based link extractor.""" def __init__(self): HTMLParser.__init__(self) self.tags = [] def handle_starttag(self, tag, attrs): self.tags.append(Tag(tag, attrs)) def handle_startendtag(self, tag, attrs): self.tags.append(Tag(tag, attrs)) def _parse_stdlib(html): """Parse HTML using stdlib HTMLParser.""" parser = LinkExtractor() try: parser.feed(html) except: pass # tolerate malformed HTML return SoupResult(parser.tags) def _parse_bs4(html): """Parse HTML using BeautifulSoup.""" try: return BeautifulSoup(html, 'lxml') except (FeatureNotFound, Exception): return BeautifulSoup(html, 'html.parser') def set_nobs(enabled): """Disable BeautifulSoup usage.""" global _use_bs4 _use_bs4 = not enabled if enabled and _bs4_available: sys.stderr.write('info: --nobs: using stdlib HTMLParser\n') elif not _bs4_available: sys.stderr.write('info: bs4 not available, using stdlib HTMLParser\n') def soupify(html, nohtml=False): """Parse HTML and return soup-like object.""" htm = html if nohtml else '
%s' % (html) if _use_bs4 and _bs4_available: return _parse_bs4(htm) else: return _parse_stdlib(htm) def is_available(): """Check if BeautifulSoup is available.""" return _bs4_available