diff --git a/misc.py b/misc.py index 87da059..38e7e3e 100644 --- a/misc.py +++ b/misc.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 import time, sys diff --git a/proxywatchd.py b/proxywatchd.py index b33db0d..b5cd105 100644 --- a/proxywatchd.py +++ b/proxywatchd.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 import threading import time, random, string, re, copy diff --git a/scraper.py b/scraper.py index 36d5dd1..f50ecd7 100755 --- a/scraper.py +++ b/scraper.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 import dbs import random, time diff --git a/soup_parser.py b/soup_parser.py index daeae0a..ead73f0 100644 --- a/soup_parser.py +++ b/soup_parser.py @@ -1,9 +1,9 @@ -"""HTML parsing with optional BeautifulSoup or stdlib fallback.""" +#!/usr/bin/env python2 +# HTML parsing with optional BeautifulSoup or stdlib fallback from HTMLParser import HTMLParser import sys -# Track if bs4 is available _bs4_available = False _use_bs4 = True @@ -14,8 +14,7 @@ except ImportError: _bs4_available = False -class Tag(object): - """Minimal BeautifulSoup Tag interface.""" +class Tag(): def __init__(self, name, attrs): self.name = name self.attrs = dict(attrs) @@ -27,19 +26,16 @@ class Tag(object): return self.attrs.get(key, default) -class SoupResult(object): - """Minimal BeautifulSoup result interface.""" +class SoupResult(): def __init__(self, tags): self._tags = tags - self.body = self # self-reference for soup.body.find_all() + self.body = self def find_all(self, tag_name, **kwargs): - """Find all tags matching criteria.""" results = [] for tag in self._tags: if tag.name != tag_name: continue - # check href=True means "has href attribute" if 'href' in kwargs: if kwargs['href'] is True and 'href' not in tag.attrs: continue @@ -50,7 +46,6 @@ class SoupResult(object): class LinkExtractor(HTMLParser): - """HTMLParser-based link extractor.""" def __init__(self): HTMLParser.__init__(self) self.tags = [] @@ -63,17 +58,15 @@ class LinkExtractor(HTMLParser): def _parse_stdlib(html): - """Parse HTML using stdlib HTMLParser.""" parser = LinkExtractor() try: parser.feed(html) except: - pass # tolerate malformed HTML + pass return SoupResult(parser.tags) def _parse_bs4(html): - """Parse HTML using BeautifulSoup.""" try: return BeautifulSoup(html, 'lxml') except (FeatureNotFound, Exception): @@ -81,7 +74,6 @@ def _parse_bs4(html): def set_nobs(enabled): - """Disable BeautifulSoup usage.""" global _use_bs4 _use_bs4 = not enabled if enabled and _bs4_available: @@ -91,9 +83,7 @@ def set_nobs(enabled): def soupify(html, nohtml=False): - """Parse HTML and return soup-like object.""" htm = html if nohtml else '%s' % (html) - if _use_bs4 and _bs4_available: return _parse_bs4(htm) else: @@ -101,5 +91,4 @@ def soupify(html, nohtml=False): def is_available(): - """Check if BeautifulSoup is available.""" return _bs4_available diff --git a/test_nobs.py b/test_nobs.py index 2d169be..77f2b27 100644 --- a/test_nobs.py +++ b/test_nobs.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""Test --nobs functionality (stdlib HTML parsing without BeautifulSoup).""" +#!/usr/bin/env python2 +# test --nobs functionality (stdlib HTML parsing without BeautifulSoup) import sys