#!/usr/bin/env python2
"""HTML parsing with optional BeautifulSoup, falling back to the stdlib.

Public surface: Tag, SoupResult, LinkExtractor, set_nobs, soupify,
is_available. When bs4 is installed and enabled, soupify returns a real
BeautifulSoup object; otherwise it returns a minimal SoupResult that
supports the subset of the bs4 API used by callers (find_all / .body).
"""
import sys

# Python 2/3 compatible import of the stdlib HTML parser
# (module was renamed to html.parser in Python 3).
try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

_bs4_available = False
_use_bs4 = True
try:
    from bs4 import BeautifulSoup, FeatureNotFound
    _bs4_available = True
except ImportError:
    _bs4_available = False


class Tag(object):
    """Minimal stand-in for a bs4 Tag: a name plus attribute access."""

    def __init__(self, name, attrs):
        self.name = name
        self.attrs = dict(attrs)

    def __getitem__(self, key):
        # NOTE(review): returns None for a missing key, whereas real bs4
        # raises KeyError here -- kept for backward compatibility.
        return self.attrs.get(key)

    def get(self, key, default=None):
        return self.attrs.get(key, default)


class SoupResult(object):
    """Minimal stand-in for a bs4 soup: supports find_all and .body."""

    def __init__(self, tags):
        self._tags = tags
        # Mimic soup.body.find_all(...) by pointing .body back at self.
        self.body = self

    def find_all(self, tag_name, **kwargs):
        """Return all tags named tag_name.

        href=True requires the 'href' attribute to be present; any other
        href value must match the attribute exactly (bs4 semantics).
        """
        results = []
        for tag in self._tags:
            if tag.name != tag_name:
                continue
            if 'href' in kwargs:
                want = kwargs['href']
                if want is True:
                    if 'href' not in tag.attrs:
                        continue
                elif tag.attrs.get('href') != want:
                    continue
            results.append(tag)
        return results


class LinkExtractor(HTMLParser):
    """Collect every start (and self-closing) tag as a Tag object."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(Tag(tag, attrs))

    def handle_startendtag(self, tag, attrs):
        # Overridden so self-closing tags (<img .../>) are recorded exactly
        # once; the base-class default would dispatch to handle_starttag.
        self.tags.append(Tag(tag, attrs))


def _parse_stdlib(html):
    """Parse with the stdlib HTMLParser; best-effort on malformed input."""
    parser = LinkExtractor()
    try:
        parser.feed(html)
    except Exception:
        # Was a bare `except:` -- narrowed so KeyboardInterrupt/SystemExit
        # propagate, while keeping the deliberate best-effort behavior of
        # returning whatever tags were collected before the failure.
        pass
    return SoupResult(parser.tags)


def _parse_bs4(html):
    """Parse with bs4, preferring lxml, falling back to html.parser."""
    try:
        return BeautifulSoup(html, 'lxml')
    except Exception:
        # Covers FeatureNotFound (lxml not installed) and any other parser
        # failure; the old `(FeatureNotFound, Exception)` tuple was redundant.
        return BeautifulSoup(html, 'html.parser')


def set_nobs(enabled):
    """Enable/disable the stdlib fallback ('nobs' = no BeautifulSoup).

    When enabled, soupify() uses the stdlib parser even if bs4 is
    installed. Informational notices are written to stderr.
    """
    global _use_bs4
    _use_bs4 = not enabled
    if enabled and _bs4_available:
        sys.stderr.write('info: --nobs: using stdlib HTMLParser\n')
    elif not _bs4_available:
        sys.stderr.write('info: bs4 not available, using stdlib HTMLParser\n')


def soupify(html, nohtml=False):
    """Parse html and return a soup-like object.

    Unless nohtml is set, a newline is prepended to the input (historical
    behavior, preserved). Returns a BeautifulSoup object when bs4 is
    enabled and available, otherwise a SoupResult.
    """
    htm = html if nohtml else '\n%s' % html
    if _use_bs4 and _bs4_available:
        return _parse_bs4(htm)
    return _parse_stdlib(htm)


def is_available():
    """Return True when bs4 could be imported."""
    return _bs4_available