diff --git a/soup_parser.py b/soup_parser.py
index ee93d2b..daeae0a 100644
--- a/soup_parser.py
+++ b/soup_parser.py
@@ -1,8 +1,119 @@
-from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
+"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
+
+import sys
+
+try:
+    from HTMLParser import HTMLParser  # Python 2
+except ImportError:
+    from html.parser import HTMLParser  # Python 3
+
+# Set to True below when bs4 can be imported.
+_bs4_available = False
+# Cleared by set_nobs(True) to force the stdlib parser.
+_use_bs4 = True
+
+try:
+    from bs4 import BeautifulSoup, FeatureNotFound
+    _bs4_available = True
+except ImportError:
+    _bs4_available = False
+
+
+class Tag(object):
+    """Minimal BeautifulSoup Tag interface."""
+    def __init__(self, name, attrs):
+        self.name = name
+        # attrs arrives as (name, value) pairs from HTMLParser callbacks.
+        self.attrs = dict(attrs)
+
+    def __getitem__(self, key):
+        # Missing attributes yield None rather than raising KeyError.
+        return self.attrs.get(key)
+
+    def get(self, key, default=None):
+        return self.attrs.get(key, default)
+
+
+class SoupResult(object):
+    """Minimal BeautifulSoup result interface."""
+    def __init__(self, tags):
+        self._tags = tags
+        self.body = self  # self-reference for soup.body.find_all()
+
+    def find_all(self, tag_name, **kwargs):
+        """Find all tags matching criteria.
+
+        Only the href keyword is examined: href=True keeps tags that
+        carry an href attribute, any other value keeps tags whose href
+        equals it. Other keyword arguments are ignored.
+        """
+        results = []
+        for tag in self._tags:
+            if tag.name != tag_name:
+                continue
+            # check href=True means "has href attribute"
+            if 'href' in kwargs:
+                if kwargs['href'] is True and 'href' not in tag.attrs:
+                    continue
+                elif kwargs['href'] is not True and tag.attrs.get('href') != kwargs['href']:
+                    continue
+            results.append(tag)
+        return results
+
+
+class LinkExtractor(HTMLParser):
+    """HTMLParser-based link extractor."""
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.tags = []
+
+    def handle_starttag(self, tag, attrs):
+        self.tags.append(Tag(tag, attrs))
+
+    def handle_startendtag(self, tag, attrs):
+        self.tags.append(Tag(tag, attrs))
+
+
+def _parse_stdlib(html):
+    """Parse HTML using stdlib HTMLParser."""
+    parser = LinkExtractor()
+    try:
+        parser.feed(html)
+    except Exception:
+        # Tolerate malformed HTML: keep the tags collected so far.
+        pass
+    return SoupResult(parser.tags)
+
+
+def _parse_bs4(html):
+    """Parse HTML using BeautifulSoup."""
+    try:
+        return BeautifulSoup(html, 'lxml')
+    except Exception:
+        # Covers FeatureNotFound (lxml missing) and any lxml failure.
+        return BeautifulSoup(html, 'html.parser')
+
+
+def set_nobs(enabled):
+    """Disable BeautifulSoup usage."""
+    global _use_bs4
+    _use_bs4 = not enabled
+    if enabled and _bs4_available:
+        sys.stderr.write('info: --nobs: using stdlib HTMLParser\n')
+    elif not _bs4_available:
+        sys.stderr.write('info: bs4 not available, using stdlib HTMLParser\n')
+
 
 def soupify(html, nohtml=False):
-    htm = html if nohtml else '%s'%(html)
-    try:
-        return BeautifulSoup(htm, 'lxml')
-    except FeatureNotFound:
-        return BeautifulSoup(htm, 'html.parser')
+    """Parse HTML and return soup-like object."""
+    htm = html if nohtml else '%s' % (html)
+
+    if _use_bs4 and _bs4_available:
+        return _parse_bs4(htm)
+    else:
+        return _parse_stdlib(htm)
+
+
+def is_available():
+    """Check if BeautifulSoup is available."""
+    return _bs4_available