style: normalize indentation and improve code style
- convert tabs to 4-space indentation
- add docstrings to modules and classes
- remove unused import (copy)
- use explicit object inheritance
- use 'while True' over 'while 1'
- use 'while args' over 'while len(args)'
- use '{}' over 'dict()'
- consistent string formatting
- Python 2/3 compatible Queue import
This commit is contained in:
126
soup_parser.py
126
soup_parser.py
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python2
|
||||
# HTML parsing with optional BeautifulSoup or stdlib fallback
|
||||
# -*- coding: utf-8 -*-
|
||||
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
import sys
|
||||
@@ -8,87 +9,98 @@ _bs4_available = False
|
||||
_use_bs4 = True
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup, FeatureNotFound
|
||||
_bs4_available = True
|
||||
from bs4 import BeautifulSoup, FeatureNotFound
|
||||
_bs4_available = True
|
||||
except ImportError:
|
||||
_bs4_available = False
|
||||
_bs4_available = False
|
||||
|
||||
|
||||
class Tag(object):
    """Minimal stand-in for a bs4 Tag, used by the stdlib fallback parser."""

    def __init__(self, name, attrs):
        # HTMLParser hands attrs over as a list of (key, value) pairs.
        self.name = name
        self.attrs = dict(attrs)

    def __getitem__(self, key):
        # bs4-style subscripting: a missing attribute yields None, never KeyError.
        return self.get(key)

    def get(self, key, default=None):
        """Return the attribute value for *key*, or *default* when absent."""
        return self.attrs.get(key, default)
|
||||
|
||||
|
||||
class SoupResult(object):
    """BeautifulSoup-like wrapper over the tags collected by the stdlib parser."""

    def __init__(self, tags):
        self._tags = tags
        # Mimic soup.body: this object stands in for the whole document body.
        self.body = self

    def find_all(self, tag_name, **kwargs):
        """Return all tags named *tag_name* that satisfy the href filter.

        ``href=True`` requires the attribute to be present; any other href
        value must compare equal to the tag's href attribute.
        """
        def _matches(tag):
            if tag.name != tag_name:
                return False
            if 'href' in kwargs:
                wanted = kwargs['href']
                if wanted is True:
                    return 'href' in tag.attrs
                return tag.attrs.get('href') == wanted
            return True

        return [tag for tag in self._tags if _matches(tag)]
|
||||
|
||||
|
||||
class LinkExtractor(HTMLParser):
    """Collect every opening (and self-closing) tag seen in an HTML stream."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Tags are recorded in document order as they are encountered.
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(Tag(tag, attrs))

    # Self-closing tags (e.g. <br/>) are recorded exactly like opening tags.
    handle_startendtag = handle_starttag
|
||||
|
||||
|
||||
def _parse_stdlib(html):
    """Parse *html* with the stdlib HTMLParser and wrap it in a SoupResult."""
    extractor = LinkExtractor()
    try:
        extractor.feed(html)
    except Exception:
        # Malformed markup: deliberately best-effort — keep whatever tags
        # were collected before the parser gave up.
        pass
    return SoupResult(extractor.tags)
|
||||
|
||||
|
||||
def _parse_bs4(html):
    """Parse *html* with BeautifulSoup, preferring the faster lxml backend.

    Falls back to the built-in 'html.parser' backend when lxml is missing
    (bs4 raises FeatureNotFound) or when lxml fails for any other reason.
    """
    try:
        return BeautifulSoup(html, 'lxml')
    except Exception:
        # FeatureNotFound subclasses Exception, so the original
        # `except (FeatureNotFound, Exception)` tuple was redundant; a single
        # broad catch preserves the intended catch-anything fallback.
        return BeautifulSoup(html, 'html.parser')
|
||||
|
||||
|
||||
def set_nobs(enabled):
    """Toggle the --nobs flag: when *enabled*, force the stdlib parser.

    Emits an informational note on stderr when the stdlib fallback will be
    used, either by request or because bs4 is not importable.
    """
    global _use_bs4
    _use_bs4 = not enabled

    note = None
    if enabled and _bs4_available:
        note = 'info: --nobs: using stdlib HTMLParser\n'
    elif not _bs4_available:
        note = 'info: bs4 not available, using stdlib HTMLParser\n'
    if note is not None:
        sys.stderr.write(note)
|
||||
|
||||
|
||||
def soupify(html, nohtml=False):
    """Parse *html* and return a BeautifulSoup-like document object.

    Unless *nohtml* is set, the fragment is first wrapped in a minimal
    <html><body>...</body></html> skeleton so partial markup parses cleanly.
    """
    if nohtml:
        markup = html
    else:
        markup = '<html><body>%s</body></html>' % html
    if _use_bs4 and _bs4_available:
        return _parse_bs4(markup)
    return _parse_stdlib(markup)
|
||||
|
||||
|
||||
def is_available():
    """Report whether the BeautifulSoup (bs4) backend can be used."""
    return _bs4_available
|
||||
|
||||
Reference in New Issue
Block a user