standardize code style: shebangs, class definitions, comments
This commit is contained in:
2
misc.py
2
misc.py
@@ -1,4 +1,4 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
import time, sys
|
import time, sys
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
import threading
|
import threading
|
||||||
import time, random, string, re, copy
|
import time, random, string, re, copy
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
import dbs
|
import dbs
|
||||||
import random, time
|
import random, time
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
#!/usr/bin/env python2
|
||||||
|
# HTML parsing with optional BeautifulSoup or stdlib fallback
|
||||||
|
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
# Track if bs4 is available
|
|
||||||
_bs4_available = False
|
_bs4_available = False
|
||||||
_use_bs4 = True
|
_use_bs4 = True
|
||||||
|
|
||||||
@@ -14,8 +14,7 @@ except ImportError:
|
|||||||
_bs4_available = False
|
_bs4_available = False
|
||||||
|
|
||||||
|
|
||||||
class Tag(object):
|
class Tag():
|
||||||
"""Minimal BeautifulSoup Tag interface."""
|
|
||||||
def __init__(self, name, attrs):
|
def __init__(self, name, attrs):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.attrs = dict(attrs)
|
self.attrs = dict(attrs)
|
||||||
@@ -27,19 +26,16 @@ class Tag(object):
|
|||||||
return self.attrs.get(key, default)
|
return self.attrs.get(key, default)
|
||||||
|
|
||||||
|
|
||||||
class SoupResult(object):
|
class SoupResult():
|
||||||
"""Minimal BeautifulSoup result interface."""
|
|
||||||
def __init__(self, tags):
|
def __init__(self, tags):
|
||||||
self._tags = tags
|
self._tags = tags
|
||||||
self.body = self # self-reference for soup.body.find_all()
|
self.body = self
|
||||||
|
|
||||||
def find_all(self, tag_name, **kwargs):
|
def find_all(self, tag_name, **kwargs):
|
||||||
"""Find all tags matching criteria."""
|
|
||||||
results = []
|
results = []
|
||||||
for tag in self._tags:
|
for tag in self._tags:
|
||||||
if tag.name != tag_name:
|
if tag.name != tag_name:
|
||||||
continue
|
continue
|
||||||
# check href=True means "has href attribute"
|
|
||||||
if 'href' in kwargs:
|
if 'href' in kwargs:
|
||||||
if kwargs['href'] is True and 'href' not in tag.attrs:
|
if kwargs['href'] is True and 'href' not in tag.attrs:
|
||||||
continue
|
continue
|
||||||
@@ -50,7 +46,6 @@ class SoupResult(object):
|
|||||||
|
|
||||||
|
|
||||||
class LinkExtractor(HTMLParser):
|
class LinkExtractor(HTMLParser):
|
||||||
"""HTMLParser-based link extractor."""
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
self.tags = []
|
self.tags = []
|
||||||
@@ -63,17 +58,15 @@ class LinkExtractor(HTMLParser):
|
|||||||
|
|
||||||
|
|
||||||
def _parse_stdlib(html):
|
def _parse_stdlib(html):
|
||||||
"""Parse HTML using stdlib HTMLParser."""
|
|
||||||
parser = LinkExtractor()
|
parser = LinkExtractor()
|
||||||
try:
|
try:
|
||||||
parser.feed(html)
|
parser.feed(html)
|
||||||
except:
|
except:
|
||||||
pass # tolerate malformed HTML
|
pass
|
||||||
return SoupResult(parser.tags)
|
return SoupResult(parser.tags)
|
||||||
|
|
||||||
|
|
||||||
def _parse_bs4(html):
|
def _parse_bs4(html):
|
||||||
"""Parse HTML using BeautifulSoup."""
|
|
||||||
try:
|
try:
|
||||||
return BeautifulSoup(html, 'lxml')
|
return BeautifulSoup(html, 'lxml')
|
||||||
except (FeatureNotFound, Exception):
|
except (FeatureNotFound, Exception):
|
||||||
@@ -81,7 +74,6 @@ def _parse_bs4(html):
|
|||||||
|
|
||||||
|
|
||||||
def set_nobs(enabled):
|
def set_nobs(enabled):
|
||||||
"""Disable BeautifulSoup usage."""
|
|
||||||
global _use_bs4
|
global _use_bs4
|
||||||
_use_bs4 = not enabled
|
_use_bs4 = not enabled
|
||||||
if enabled and _bs4_available:
|
if enabled and _bs4_available:
|
||||||
@@ -91,9 +83,7 @@ def set_nobs(enabled):
|
|||||||
|
|
||||||
|
|
||||||
def soupify(html, nohtml=False):
|
def soupify(html, nohtml=False):
|
||||||
"""Parse HTML and return soup-like object."""
|
|
||||||
htm = html if nohtml else '<html><body>%s</body></html>' % (html)
|
htm = html if nohtml else '<html><body>%s</body></html>' % (html)
|
||||||
|
|
||||||
if _use_bs4 and _bs4_available:
|
if _use_bs4 and _bs4_available:
|
||||||
return _parse_bs4(htm)
|
return _parse_bs4(htm)
|
||||||
else:
|
else:
|
||||||
@@ -101,5 +91,4 @@ def soupify(html, nohtml=False):
|
|||||||
|
|
||||||
|
|
||||||
def is_available():
|
def is_available():
|
||||||
"""Check if BeautifulSoup is available."""
|
|
||||||
return _bs4_available
|
return _bs4_available
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
# -*- coding: utf-8 -*-
|
# test --nobs functionality (stdlib HTML parsing without BeautifulSoup)
|
||||||
"""Test --nobs functionality (stdlib HTML parsing without BeautifulSoup)."""
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user