#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""HTML parsing with optional BeautifulSoup or stdlib fallback.

``soupify`` returns a BeautifulSoup document when bs4 is importable and has
not been disabled through ``set_nobs``; otherwise it returns a small
``SoupResult`` object that supports ``.body`` and ``.find_all(name, href=...)``.
"""

try:
    from HTMLParser import HTMLParser  # Python 2 module name
except ImportError:
    # The module was renamed in Python 3; fall back so the import succeeds.
    from html.parser import HTMLParser
import sys

# Runtime switch flipped by set_nobs(); bs4 is preferred while this is True.
_use_bs4 = True

try:
    from bs4 import BeautifulSoup, FeatureNotFound
    _bs4_available = True
except ImportError:
    _bs4_available = False


class Tag(object):
    """Simple tag representation for the stdlib parser.

    ``name`` is the tag name reported by the parser and ``attrs`` is a dict
    built from the parser's (name, value) pairs; when an attribute is
    repeated, the last occurrence wins.
    """

    def __init__(self, name, attrs):
        self.name = name
        self.attrs = dict(attrs)

    def __getitem__(self, key):
        """Return the attribute value, or None when the attribute is absent."""
        return self.attrs.get(key)

    def get(self, key, default=None):
        """Return the attribute value, or *default* when it is absent."""
        return self.attrs.get(key, default)


class SoupResult(object):
    """BeautifulSoup-like result wrapper for the stdlib parser."""

    def __init__(self, tags):
        self._tags = tags
        # The tag list is flat, so ``result.body`` is simply the result itself.
        self.body = self

    def find_all(self, tag_name, **kwargs):
        """Return the collected tags named *tag_name*, in document order.

        Only the ``href`` keyword is interpreted; any other keyword is
        ignored:

        * ``href=True`` keeps tags that carry an ``href`` attribute.
        * ``href=<value>`` keeps tags whose ``href`` equals that value (a
          missing attribute compares as ``None``).
        """
        results = []
        for tag in self._tags:
            if tag.name != tag_name:
                continue
            if 'href' in kwargs:
                wanted = kwargs['href']
                if wanted is True:
                    if 'href' not in tag.attrs:
                        continue
                elif tag.attrs.get('href') != wanted:
                    continue
            results.append(tag)
        return results


class LinkExtractor(HTMLParser):
    """Collect every start tag (and self-closing tag) as a ``Tag``."""

    def __init__(self):
        # Explicit base-class call rather than super(): this also works where
        # HTMLParser is an old-style class (Python 2).
        HTMLParser.__init__(self)
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(Tag(tag, attrs))

    def handle_startendtag(self, tag, attrs):
        # Overridden so a self-closing tag is recorded directly, exactly once.
        self.tags.append(Tag(tag, attrs))


def _parse_stdlib(html):
    """Parse *html* with the stdlib HTMLParser and return a ``SoupResult``.

    Parsing is best effort: if the parser raises, the tags collected up to
    that point are still returned.
    """
    parser = LinkExtractor()
    try:
        parser.feed(html)
    except Exception:
        # Deliberate: malformed HTML must not abort the caller; keep the
        # partial results gathered so far.
        pass
    return SoupResult(parser.tags)


def _parse_bs4(html):
    """Parse *html* with BeautifulSoup, preferring the 'lxml' builder.

    Any failure while building with 'lxml' (including ``FeatureNotFound``
    when lxml is not installed) falls back to the 'html.parser' builder.
    """
    try:
        return BeautifulSoup(html, 'lxml')
    except Exception:  # FeatureNotFound is an Exception subclass as well
        return BeautifulSoup(html, 'html.parser')


def set_nobs(enabled):
    """Disable (``enabled`` true) or re-enable BeautifulSoup parsing.

    Writes an informational line to stderr when BeautifulSoup is being
    bypassed on request, or when bs4 is not importable at all.
    """
    global _use_bs4
    _use_bs4 = not enabled
    if enabled and _bs4_available:
        sys.stderr.write('info: --nobs: using stdlib HTMLParser\n')
    elif not _bs4_available:
        sys.stderr.write('info: bs4 not available, using stdlib HTMLParser\n')


def soupify(html, nohtml=False):
    """Parse HTML content, returning a BeautifulSoup-like object.

    With ``nohtml`` false (the default) *html* is first passed through
    ``'%s'`` formatting; with ``nohtml`` true it is handed to the parser
    unchanged. BeautifulSoup is used when it is importable and has not been
    disabled via ``set_nobs``; otherwise the stdlib parser is used.
    """
    htm = html if nohtml else '%s' % html
    if _use_bs4 and _bs4_available:
        return _parse_bs4(htm)
    return _parse_stdlib(htm)


def is_available():
    """Return True when the bs4 package could be imported."""
    return _bs4_available