- convert tabs to 4-space indentation
- add docstrings to modules and classes
- remove unused import (copy)
- use explicit object inheritance
- use 'while True' over 'while 1'
- use 'while args' over 'while len(args)'
- use '{}' over 'dict()'
- consistent string formatting
- Python 2/3 compatible Queue import
107 lines
2.8 KiB
Python
#!/usr/bin/env python2
|
|
# -*- coding: utf-8 -*-
|
|
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
|
|
|
import sys

# The HTMLParser module was renamed to html.parser in Python 3. Try the
# legacy name first (matches the python2 shebang), then fall back so the
# module also imports cleanly under Python 3.
try:
    from HTMLParser import HTMLParser
except ImportError:  # Python 3
    from html.parser import HTMLParser
|
|
|
|
# True once the bs4 import below succeeds; module-wide availability flag.
_bs4_available = False
# Caller preference: use bs4 when available (toggled via set_nobs()).
_use_bs4 = True

try:
    # Optional third-party dependency; FeatureNotFound is needed by
    # _parse_bs4 for the lxml -> html.parser fallback.
    from bs4 import BeautifulSoup, FeatureNotFound
    _bs4_available = True
except ImportError:
    # bs4 is not installed; the stdlib HTMLParser path is used instead.
    _bs4_available = False
|
|
|
|
|
|
class Tag(object):
    """Lightweight stand-in for a parsed HTML tag (stdlib backend)."""

    def __init__(self, name, attrs):
        # attrs arrives as a sequence of (key, value) pairs from HTMLParser.
        self.name = name
        self.attrs = dict(attrs)

    def __getitem__(self, key):
        # Subscript access mirrors .get(): a missing attribute yields None
        # rather than raising KeyError.
        return self.get(key)

    def get(self, key, default=None):
        """Return the attribute value for *key*, or *default* if absent."""
        if key in self.attrs:
            return self.attrs[key]
        return default
|
|
|
|
|
|
class SoupResult(object):
    """Minimal BeautifulSoup-compatible wrapper over a flat tag list."""

    def __init__(self, tags):
        self._tags = tags
        # bs4 results expose a .body attribute; point it back at ourselves
        # so callers can write result.body.find_all(...).
        self.body = self

    def find_all(self, tag_name, **kwargs):
        """Find all tags matching criteria."""

        def _matches(tag):
            # Name must match; the optional href filter either requires
            # the attribute to exist (href=True) or to equal a value.
            if tag.name != tag_name:
                return False
            if 'href' not in kwargs:
                return True
            wanted = kwargs['href']
            if wanted is True:
                return 'href' in tag.attrs
            return tag.attrs.get('href') == wanted

        return [tag for tag in self._tags if _matches(tag)]
|
|
|
|
|
|
class LinkExtractor(HTMLParser):
    """Collect every start and self-closing tag seen in an HTML stream."""

    def __init__(self):
        # Explicit base call: HTMLParser is an old-style class on Python 2,
        # so super() cannot be used here.
        HTMLParser.__init__(self)
        self.tags = []  # Tag objects, in document order

    def _record(self, tag, attrs):
        # Shared sink for both open and self-closing tags.
        self.tags.append(Tag(tag, attrs))

    def handle_starttag(self, tag, attrs):
        self._record(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        self._record(tag, attrs)
|
|
|
|
|
|
def _parse_stdlib(html):
    """Parse *html* with the stdlib-backed LinkExtractor.

    Returns a SoupResult wrapping whatever tags were extracted; malformed
    input is tolerated and yields a partial result.
    """
    extractor = LinkExtractor()
    try:
        extractor.feed(html)
    except Exception:
        # Best-effort parse: keep the tags collected before the failure.
        pass
    return SoupResult(extractor.tags)
|
|
|
|
|
|
def _parse_bs4(html):
    """Parse *html* with BeautifulSoup, preferring the lxml backend.

    Falls back to the built-in 'html.parser' backend when lxml is
    unavailable (bs4 raises FeatureNotFound) or otherwise fails.
    """
    try:
        return BeautifulSoup(html, 'lxml')
    except Exception:
        # Original caught (FeatureNotFound, Exception) — a redundant tuple,
        # since Exception already subsumes FeatureNotFound. FeatureNotFound
        # (lxml not installed) is the expected case; any other lxml failure
        # also drops to the pure-Python parser.
        return BeautifulSoup(html, 'html.parser')
|
|
|
|
|
|
def set_nobs(enabled):
    """Toggle the BeautifulSoup opt-out; True forces the stdlib parser."""
    global _use_bs4
    _use_bs4 = not enabled
    if _bs4_available:
        # bs4 is present; only report when the caller explicitly opted out.
        if enabled:
            sys.stderr.write('info: --nobs: using stdlib HTMLParser\n')
    else:
        # No bs4 at all: the stdlib fallback is the only option either way.
        sys.stderr.write('info: bs4 not available, using stdlib HTMLParser\n')
|
|
|
|
|
|
def soupify(html, nohtml=False):
    """Parse HTML content, returning a BeautifulSoup-like object.

    Unless *nohtml* is set, the fragment is wrapped in <html><body> tags
    so both backends see a complete document.
    """
    if nohtml:
        document = html
    else:
        document = '<html><body>%s</body></html>' % html
    if _use_bs4 and _bs4_available:
        return _parse_bs4(document)
    return _parse_stdlib(document)
|
|
|
|
|
|
def is_available():
    """Report whether the BeautifulSoup backend can be used."""
    # Reflects the import probe performed at module load time.
    return bool(_bs4_available)
|