style: normalize indentation and improve code style
- convert tabs to 4-space indentation
- add docstrings to modules, classes, and functions
- remove unused import (copy)
- use explicit object inheritance
- use 'while True' over 'while 1'
- use 'while args' over 'while len(args)'
- use '{}' over 'dict()'
- consistent '%'-style string formatting (spaces around the operator, long messages wrapped)
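- Python 2/3 compatible Queue import (try 'Queue', fall back to 'queue' on ImportError)
- note: not purely cosmetic; str2bool is now case-insensitive, the "database is locked" check matches a substring of str(e), lock/error diagnostics go to stderr instead of print, and parsed arguments are kept on self.args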
This commit is contained in:
@@ -1,22 +1,36 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Combined config file and argument parser."""
|
||||||
|
|
||||||
from ConfigParser import SafeConfigParser, NoOptionError
|
from ConfigParser import SafeConfigParser, NoOptionError
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
class _Dummy():
|
|
||||||
|
class _Dummy(object):
|
||||||
|
"""Placeholder for config sections."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ComboParser(object):
|
class ComboParser(object):
|
||||||
|
"""Parse configuration from INI file and command-line arguments.
|
||||||
|
|
||||||
|
Command-line arguments override INI file values.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, ini):
|
def __init__(self, ini):
|
||||||
self.items = []
|
self.items = []
|
||||||
self.cparser = SafeConfigParser()
|
self.cparser = SafeConfigParser()
|
||||||
self.aparser = ArgumentParser()
|
self.aparser = ArgumentParser()
|
||||||
self.ini = ini
|
self.ini = ini
|
||||||
self.items = []
|
|
||||||
self.loaded = False
|
self.loaded = False
|
||||||
|
self.args = None
|
||||||
|
|
||||||
def add_item(self, section, name, type, default, desc, required):
|
def add_item(self, section, name, type, default, desc, required):
|
||||||
|
"""Add a configuration item."""
|
||||||
def str2bool(val):
|
def str2bool(val):
|
||||||
return val in ['True', 'true', '1', 'yes']
|
return val.lower() in ('true', '1', 'yes')
|
||||||
|
|
||||||
self.items.append({
|
self.items.append({
|
||||||
'section': section,
|
'section': section,
|
||||||
'name': name,
|
'name': name,
|
||||||
@@ -26,49 +40,71 @@ class ComboParser(object):
|
|||||||
})
|
})
|
||||||
self.aparser.add_argument(
|
self.aparser.add_argument(
|
||||||
'--%s.%s' % (section, name),
|
'--%s.%s' % (section, name),
|
||||||
help='%s, default: (%s)'%(desc, str(default)),
|
help='%s (default: %s)' % (desc, default),
|
||||||
type=type if type is not bool else str2bool,
|
type=type if type is not bool else str2bool,
|
||||||
default=None,
|
default=None,
|
||||||
required=False
|
required=False
|
||||||
)
|
)
|
||||||
|
|
||||||
def load(self):
|
def load(self):
|
||||||
if self.loaded: return
|
"""Load configuration from file and command-line."""
|
||||||
|
if self.loaded:
|
||||||
|
return
|
||||||
self.loaded = True
|
self.loaded = True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.cparser.read(self.ini)
|
self.cparser.read(self.ini)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # config file missing or unreadable, use defaults
|
pass # Config file missing or unreadable, use defaults
|
||||||
args = self.aparser.parse_args()
|
|
||||||
|
self.args = self.aparser.parse_args()
|
||||||
|
|
||||||
for item in self.items:
|
for item in self.items:
|
||||||
try:
|
section = item['section']
|
||||||
obj = getattr(self, item['section'])
|
name = item['name']
|
||||||
except AttributeError:
|
|
||||||
setattr(self, item['section'], _Dummy())
|
|
||||||
obj = getattr(self, item['section'])
|
|
||||||
|
|
||||||
setattr(obj, item['name'], item['default'])
|
# Ensure section object exists
|
||||||
inner = getattr(obj, item['name'])
|
if not hasattr(self, section):
|
||||||
|
setattr(self, section, _Dummy())
|
||||||
|
obj = getattr(self, section)
|
||||||
|
|
||||||
item['found'] = True
|
# Start with default value
|
||||||
|
value = item['default']
|
||||||
|
found = False
|
||||||
|
|
||||||
|
# Try to read from config file
|
||||||
try:
|
try:
|
||||||
if item['type'] is bool : inner = self.cparser.getboolean(item['section'], item['name'])
|
if item['type'] is bool:
|
||||||
elif item['type'] is float: inner = self.cparser.getfloat(item['section'], item['name'])
|
value = self.cparser.getboolean(section, name)
|
||||||
elif item['type'] is int : inner = self.cparser.getint(item['section'], item['name'])
|
elif item['type'] is float:
|
||||||
elif item['type'] is str : inner = self.cparser.get(item['section'], item['name'])
|
value = self.cparser.getfloat(section, name)
|
||||||
|
elif item['type'] is int:
|
||||||
|
value = self.cparser.getint(section, name)
|
||||||
|
elif item['type'] is str:
|
||||||
|
value = self.cparser.get(section, name)
|
||||||
|
found = True
|
||||||
except NoOptionError:
|
except NoOptionError:
|
||||||
item['found'] = False
|
pass
|
||||||
try:
|
|
||||||
arg = getattr(args, '%s.%s'%(item['section'], item['name']))
|
# Command-line overrides config file
|
||||||
if arg is not None:
|
arg_name = '%s.%s' % (section, name)
|
||||||
inner = arg
|
arg_value = getattr(self.args, arg_name, None)
|
||||||
item['found'] = True
|
if arg_value is not None:
|
||||||
except AttributeError:
|
value = arg_value
|
||||||
pass # arg not provided on command line
|
found = True
|
||||||
if not item['found']:
|
|
||||||
|
# Handle missing required items
|
||||||
|
if not found:
|
||||||
if item['required']:
|
if item['required']:
|
||||||
sys.stderr.write('error: required config item "%s" not found in section "%s" of "%s"!\n'%(item['name'], item['section'], self.ini))
|
sys.stderr.write(
|
||||||
|
'error: required config item "%s" not found in section "%s" of "%s"\n'
|
||||||
|
% (name, section, self.ini)
|
||||||
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
sys.stderr.write('warning: assigned default value of "%s" to "%s.%s"\n'%(str(item['default']), item['section'], item['name']))
|
sys.stderr.write(
|
||||||
setattr(obj, item['name'], inner)
|
'warning: assigned default value of "%s" to "%s.%s"\n'
|
||||||
|
% (item['default'], section, name)
|
||||||
|
)
|
||||||
|
|
||||||
|
setattr(obj, name, value)
|
||||||
|
|||||||
41
dbs.py
41
dbs.py
@@ -1,7 +1,13 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Database table creation and insertion utilities."""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
from misc import _log
|
from misc import _log
|
||||||
|
|
||||||
|
|
||||||
def create_table_if_not_exists(sqlite, dbname):
|
def create_table_if_not_exists(sqlite, dbname):
|
||||||
|
"""Create database table with indexes if it doesn't exist."""
|
||||||
if dbname == 'proxylist':
|
if dbname == 'proxylist':
|
||||||
sqlite.execute("""CREATE TABLE IF NOT EXISTS proxylist (
|
sqlite.execute("""CREATE TABLE IF NOT EXISTS proxylist (
|
||||||
proxy BLOB UNIQUE,
|
proxy BLOB UNIQUE,
|
||||||
@@ -17,7 +23,7 @@ def create_table_if_not_exists(sqlite, dbname):
|
|||||||
port INT,
|
port INT,
|
||||||
consecutive_success INT,
|
consecutive_success INT,
|
||||||
total_duration INT)""")
|
total_duration INT)""")
|
||||||
# indexes for common query patterns
|
# Indexes for common query patterns
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_proto ON proxylist(proto)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_proto ON proxylist(proto)')
|
||||||
@@ -31,31 +37,44 @@ def create_table_if_not_exists(sqlite, dbname):
|
|||||||
stale_count INT,
|
stale_count INT,
|
||||||
retrievals INT,
|
retrievals INT,
|
||||||
proxies_added INT,
|
proxies_added INT,
|
||||||
added INT
|
added INT)""")
|
||||||
)""")
|
# Indexes for common query patterns
|
||||||
# indexes for common query patterns
|
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
|
||||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')
|
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')
|
||||||
|
|
||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
|
|
||||||
|
|
||||||
def insert_proxies(proxydb, proxies, url):
|
def insert_proxies(proxydb, proxies, url):
|
||||||
if not proxies: return
|
"""Insert new proxies into database."""
|
||||||
|
if not proxies:
|
||||||
|
return
|
||||||
timestamp = int(time.time())
|
timestamp = int(time.time())
|
||||||
rows = []
|
rows = []
|
||||||
for p in proxies:
|
for p in proxies:
|
||||||
ip, port = p.split(':')
|
ip, port = p.split(':')
|
||||||
rows.append((timestamp, p, ip, port, 3, 0, 0, 0, 0, 0))
|
rows.append((timestamp, p, ip, port, 3, 0, 0, 0, 0, 0))
|
||||||
proxydb.executemany('INSERT OR IGNORE INTO proxylist (added,proxy,ip,port,failed,tested,success_count,total_duration,mitm,consecutive_success) VALUES (?,?,?,?,?,?,?,?,?,?)', rows)
|
proxydb.executemany(
|
||||||
|
'INSERT OR IGNORE INTO proxylist '
|
||||||
|
'(added,proxy,ip,port,failed,tested,success_count,total_duration,mitm,consecutive_success) '
|
||||||
|
'VALUES (?,?,?,?,?,?,?,?,?,?)',
|
||||||
|
rows
|
||||||
|
)
|
||||||
proxydb.commit()
|
proxydb.commit()
|
||||||
_log('+%d proxy/ies from %s' % (len(proxies), url), 'added')
|
_log('+%d proxy/ies from %s' % (len(proxies), url), 'added')
|
||||||
|
|
||||||
|
|
||||||
def insert_urls(urls, search, sqlite):
|
def insert_urls(urls, search, sqlite):
|
||||||
if not urls: return
|
"""Insert new URLs into database."""
|
||||||
time_now = int(time.time())
|
if not urls:
|
||||||
rows = [ (time_now,u,0,1,0,0,0) for u in urls ]
|
return
|
||||||
sqlite.executemany('INSERT OR IGNORE INTO uris (added,url,check_time,error,stale_count,retrievals,proxies_added) values(?,?,?,?,?,?,?)', rows)
|
timestamp = int(time.time())
|
||||||
|
rows = [(timestamp, u, 0, 1, 0, 0, 0) for u in urls]
|
||||||
|
sqlite.executemany(
|
||||||
|
'INSERT OR IGNORE INTO uris '
|
||||||
|
'(added,url,check_time,error,stale_count,retrievals,proxies_added) '
|
||||||
|
'VALUES (?,?,?,?,?,?,?)',
|
||||||
|
rows
|
||||||
|
)
|
||||||
sqlite.commit()
|
sqlite.commit()
|
||||||
_log('+%d url(s) from %s' % (len(urls), search), 'added')
|
_log('+%d url(s) from %s' % (len(urls), search), 'added')
|
||||||
|
|
||||||
|
|||||||
54
mysqlite.py
54
mysqlite.py
@@ -1,9 +1,29 @@
|
|||||||
import time, random, sys
|
#!/usr/bin/env python2
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""SQLite wrapper with retry logic and WAL mode."""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
||||||
class mysqlite:
|
|
||||||
|
class mysqlite(object):
|
||||||
|
"""SQLite connection wrapper with automatic retry on lock."""
|
||||||
|
|
||||||
|
def __init__(self, database, factory=None):
|
||||||
|
self.handle = sqlite3.connect(database)
|
||||||
|
if factory is not None:
|
||||||
|
self.handle.text_factory = factory
|
||||||
|
self.cursor = self.handle.cursor()
|
||||||
|
self.dbname = database
|
||||||
|
# Enable WAL mode for better concurrency
|
||||||
|
self.cursor.execute('PRAGMA journal_mode=WAL')
|
||||||
|
self.cursor.execute('PRAGMA synchronous=NORMAL')
|
||||||
|
|
||||||
def _try_op(self, op, query, args=None, rmin=1.5, rmax=7.0):
|
def _try_op(self, op, query, args=None, rmin=1.5, rmax=7.0):
|
||||||
while 1:
|
"""Execute operation with retry on database lock."""
|
||||||
|
while True:
|
||||||
try:
|
try:
|
||||||
if query is None:
|
if query is None:
|
||||||
return op()
|
return op()
|
||||||
@@ -12,33 +32,31 @@ class mysqlite:
|
|||||||
else:
|
else:
|
||||||
return op(query, args)
|
return op(query, args)
|
||||||
except sqlite3.OperationalError as e:
|
except sqlite3.OperationalError as e:
|
||||||
if e.message == 'database is locked':
|
err_msg = str(e)
|
||||||
print "zzZzzZZ: db is locked (%s)"%self.dbname
|
if 'database is locked' in err_msg:
|
||||||
|
sys.stderr.write('zzZzzZZ: db is locked (%s)\n' % self.dbname)
|
||||||
time.sleep(random.uniform(rmin, rmax))
|
time.sleep(random.uniform(rmin, rmax))
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
print '%s\nquery: %s\nargs: %s' % (str(sys.exc_info()), str(query), str(args))
|
sys.stderr.write('%s\nquery: %s\nargs: %s\n' % (
|
||||||
raise e
|
str(sys.exc_info()), str(query), str(args)))
|
||||||
|
raise
|
||||||
|
|
||||||
def execute(self, query, args=None, rmin=1.5, rmax=7.0):
|
def execute(self, query, args=None, rmin=1.5, rmax=7.0):
|
||||||
|
"""Execute a single query with retry."""
|
||||||
return self._try_op(self.cursor.execute, query, args, rmin, rmax)
|
return self._try_op(self.cursor.execute, query, args, rmin, rmax)
|
||||||
|
|
||||||
def executemany(self, query, args, rmin=1.5, rmax=7.0):
|
def executemany(self, query, args, rmin=1.5, rmax=7.0):
|
||||||
while len(args):
|
"""Execute query for multiple argument sets, batched."""
|
||||||
self._try_op(self.cursor.executemany, query, args[:500], rmin, rmax)
|
while args:
|
||||||
|
batch = args[:500]
|
||||||
|
self._try_op(self.cursor.executemany, query, batch, rmin, rmax)
|
||||||
args = args[500:]
|
args = args[500:]
|
||||||
|
|
||||||
def commit(self, rmin=1.5, rmax=7.0):
|
def commit(self, rmin=1.5, rmax=7.0):
|
||||||
|
"""Commit transaction with retry."""
|
||||||
return self._try_op(self.handle.commit, None, None, rmin, rmax)
|
return self._try_op(self.handle.commit, None, None, rmin, rmax)
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
|
"""Close database connection."""
|
||||||
self.handle.close()
|
self.handle.close()
|
||||||
|
|
||||||
def __init__(self, database, factory = None):
|
|
||||||
self.handle = sqlite3.connect(database)
|
|
||||||
if factory: self.handle.text_factory = factory
|
|
||||||
self.cursor = self.handle.cursor()
|
|
||||||
self.dbname = database
|
|
||||||
# enable WAL mode for better concurrency
|
|
||||||
self.cursor.execute('PRAGMA journal_mode=WAL')
|
|
||||||
self.cursor.execute('PRAGMA synchronous=NORMAL')
|
|
||||||
|
|||||||
@@ -1,9 +1,16 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
import threading
|
import threading
|
||||||
import time, random, string, re, copy
|
import time
|
||||||
import Queue
|
import random
|
||||||
|
import string
|
||||||
|
import re
|
||||||
import heapq
|
import heapq
|
||||||
|
|
||||||
|
try:
|
||||||
|
import Queue
|
||||||
|
except ImportError:
|
||||||
|
import queue as Queue
|
||||||
try:
|
try:
|
||||||
import IP2Location
|
import IP2Location
|
||||||
import os
|
import os
|
||||||
@@ -22,7 +29,7 @@ import connection_pool
|
|||||||
config = Config()
|
config = Config()
|
||||||
|
|
||||||
_run_standalone = False
|
_run_standalone = False
|
||||||
cached_dns = dict()
|
cached_dns = {}
|
||||||
|
|
||||||
regexes = {
|
regexes = {
|
||||||
'www.facebook.com': 'X-FB-Debug',
|
'www.facebook.com': 'X-FB-Debug',
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
# HTML parsing with optional BeautifulSoup or stdlib fallback
|
# -*- coding: utf-8 -*-
|
||||||
|
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
||||||
|
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
import sys
|
import sys
|
||||||
@@ -14,7 +15,9 @@ except ImportError:
|
|||||||
_bs4_available = False
|
_bs4_available = False
|
||||||
|
|
||||||
|
|
||||||
class Tag():
|
class Tag(object):
|
||||||
|
"""Simple tag representation for stdlib parser."""
|
||||||
|
|
||||||
def __init__(self, name, attrs):
|
def __init__(self, name, attrs):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.attrs = dict(attrs)
|
self.attrs = dict(attrs)
|
||||||
@@ -26,12 +29,15 @@ class Tag():
|
|||||||
return self.attrs.get(key, default)
|
return self.attrs.get(key, default)
|
||||||
|
|
||||||
|
|
||||||
class SoupResult():
|
class SoupResult(object):
|
||||||
|
"""BeautifulSoup-like result wrapper for stdlib parser."""
|
||||||
|
|
||||||
def __init__(self, tags):
|
def __init__(self, tags):
|
||||||
self._tags = tags
|
self._tags = tags
|
||||||
self.body = self
|
self.body = self
|
||||||
|
|
||||||
def find_all(self, tag_name, **kwargs):
|
def find_all(self, tag_name, **kwargs):
|
||||||
|
"""Find all tags matching criteria."""
|
||||||
results = []
|
results = []
|
||||||
for tag in self._tags:
|
for tag in self._tags:
|
||||||
if tag.name != tag_name:
|
if tag.name != tag_name:
|
||||||
@@ -46,6 +52,8 @@ class SoupResult():
|
|||||||
|
|
||||||
|
|
||||||
class LinkExtractor(HTMLParser):
|
class LinkExtractor(HTMLParser):
|
||||||
|
"""Extract tags from HTML using stdlib."""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
self.tags = []
|
self.tags = []
|
||||||
@@ -58,15 +66,17 @@ class LinkExtractor(HTMLParser):
|
|||||||
|
|
||||||
|
|
||||||
def _parse_stdlib(html):
|
def _parse_stdlib(html):
|
||||||
|
"""Parse HTML using stdlib HTMLParser."""
|
||||||
parser = LinkExtractor()
|
parser = LinkExtractor()
|
||||||
try:
|
try:
|
||||||
parser.feed(html)
|
parser.feed(html)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # malformed HTML, return partial results
|
pass # Malformed HTML, return partial results
|
||||||
return SoupResult(parser.tags)
|
return SoupResult(parser.tags)
|
||||||
|
|
||||||
|
|
||||||
def _parse_bs4(html):
|
def _parse_bs4(html):
|
||||||
|
"""Parse HTML using BeautifulSoup."""
|
||||||
try:
|
try:
|
||||||
return BeautifulSoup(html, 'lxml')
|
return BeautifulSoup(html, 'lxml')
|
||||||
except (FeatureNotFound, Exception):
|
except (FeatureNotFound, Exception):
|
||||||
@@ -74,6 +84,7 @@ def _parse_bs4(html):
|
|||||||
|
|
||||||
|
|
||||||
def set_nobs(enabled):
|
def set_nobs(enabled):
|
||||||
|
"""Disable BeautifulSoup and use stdlib instead."""
|
||||||
global _use_bs4
|
global _use_bs4
|
||||||
_use_bs4 = not enabled
|
_use_bs4 = not enabled
|
||||||
if enabled and _bs4_available:
|
if enabled and _bs4_available:
|
||||||
@@ -83,12 +94,13 @@ def set_nobs(enabled):
|
|||||||
|
|
||||||
|
|
||||||
def soupify(html, nohtml=False):
|
def soupify(html, nohtml=False):
|
||||||
htm = html if nohtml else '<html><body>%s</body></html>' % (html)
|
"""Parse HTML content, returning BeautifulSoup-like object."""
|
||||||
|
htm = html if nohtml else '<html><body>%s</body></html>' % html
|
||||||
if _use_bs4 and _bs4_available:
|
if _use_bs4 and _bs4_available:
|
||||||
return _parse_bs4(htm)
|
return _parse_bs4(htm)
|
||||||
else:
|
|
||||||
return _parse_stdlib(htm)
|
return _parse_stdlib(htm)
|
||||||
|
|
||||||
|
|
||||||
def is_available():
|
def is_available():
|
||||||
|
"""Check if BeautifulSoup is available."""
|
||||||
return _bs4_available
|
return _bs4_available
|
||||||
|
|||||||
Reference in New Issue
Block a user