style: normalize indentation and improve code style
- convert tabs to 4-space indentation
- add docstrings to modules and classes
- remove unused import (copy)
- use explicit object inheritance
- use 'while True' over 'while 1'
- use 'while args' over 'while len(args)'
- use '{}' over 'dict()'
- consistent string formatting
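- Python 2/3 compatible Queue import
- note: the following changes in this commit are behavioural, not purely stylistic:
  - str2bool now lower-cases its input, so 'TRUE', 'Yes', etc. are accepted
  - mysqlite detects a locked database via "'database is locked' in str(e)" instead of comparing e.message
  - mysqlite diagnostics are written to sys.stderr instead of being printed to stdout
  - mysqlite applies text_factory whenever factory is not None (previously only when truthy)
  - ComboParser keeps the parsed command-line arguments in self.args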
This commit is contained in:
110
comboparse.py
110
comboparse.py
@@ -1,74 +1,110 @@
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Combined config file and argument parser."""
|
||||
|
||||
from ConfigParser import SafeConfigParser, NoOptionError
|
||||
from argparse import ArgumentParser
|
||||
import sys
|
||||
|
||||
class _Dummy():
|
||||
|
||||
class _Dummy(object):
|
||||
"""Placeholder for config sections."""
|
||||
pass
|
||||
|
||||
|
||||
class ComboParser(object):
|
||||
"""Parse configuration from INI file and command-line arguments.
|
||||
|
||||
Command-line arguments override INI file values.
|
||||
"""
|
||||
|
||||
def __init__(self, ini):
|
||||
self.items = []
|
||||
self.cparser = SafeConfigParser()
|
||||
self.aparser = ArgumentParser()
|
||||
self.ini = ini
|
||||
self.items = []
|
||||
self.loaded = False
|
||||
self.args = None
|
||||
|
||||
def add_item(self, section, name, type, default, desc, required):
|
||||
"""Add a configuration item."""
|
||||
def str2bool(val):
|
||||
return val in ['True', 'true', '1', 'yes']
|
||||
return val.lower() in ('true', '1', 'yes')
|
||||
|
||||
self.items.append({
|
||||
'section':section,
|
||||
'name':name,
|
||||
'type':type,
|
||||
'default':default,
|
||||
'required':required,
|
||||
'section': section,
|
||||
'name': name,
|
||||
'type': type,
|
||||
'default': default,
|
||||
'required': required,
|
||||
})
|
||||
self.aparser.add_argument(
|
||||
'--%s.%s'%(section, name),
|
||||
help='%s, default: (%s)'%(desc, str(default)),
|
||||
'--%s.%s' % (section, name),
|
||||
help='%s (default: %s)' % (desc, default),
|
||||
type=type if type is not bool else str2bool,
|
||||
default=None,
|
||||
required=False
|
||||
)
|
||||
|
||||
def load(self):
|
||||
if self.loaded: return
|
||||
"""Load configuration from file and command-line."""
|
||||
if self.loaded:
|
||||
return
|
||||
self.loaded = True
|
||||
|
||||
try:
|
||||
self.cparser.read(self.ini)
|
||||
except Exception:
|
||||
pass # config file missing or unreadable, use defaults
|
||||
args = self.aparser.parse_args()
|
||||
pass # Config file missing or unreadable, use defaults
|
||||
|
||||
self.args = self.aparser.parse_args()
|
||||
|
||||
for item in self.items:
|
||||
try:
|
||||
obj = getattr(self, item['section'])
|
||||
except AttributeError:
|
||||
setattr(self, item['section'], _Dummy())
|
||||
obj = getattr(self, item['section'])
|
||||
section = item['section']
|
||||
name = item['name']
|
||||
|
||||
setattr(obj, item['name'], item['default'])
|
||||
inner = getattr(obj, item['name'])
|
||||
# Ensure section object exists
|
||||
if not hasattr(self, section):
|
||||
setattr(self, section, _Dummy())
|
||||
obj = getattr(self, section)
|
||||
|
||||
item['found'] = True
|
||||
# Start with default value
|
||||
value = item['default']
|
||||
found = False
|
||||
|
||||
# Try to read from config file
|
||||
try:
|
||||
if item['type'] is bool : inner = self.cparser.getboolean(item['section'], item['name'])
|
||||
elif item['type'] is float: inner = self.cparser.getfloat(item['section'], item['name'])
|
||||
elif item['type'] is int : inner = self.cparser.getint(item['section'], item['name'])
|
||||
elif item['type'] is str : inner = self.cparser.get(item['section'], item['name'])
|
||||
if item['type'] is bool:
|
||||
value = self.cparser.getboolean(section, name)
|
||||
elif item['type'] is float:
|
||||
value = self.cparser.getfloat(section, name)
|
||||
elif item['type'] is int:
|
||||
value = self.cparser.getint(section, name)
|
||||
elif item['type'] is str:
|
||||
value = self.cparser.get(section, name)
|
||||
found = True
|
||||
except NoOptionError:
|
||||
item['found'] = False
|
||||
try:
|
||||
arg = getattr(args, '%s.%s'%(item['section'], item['name']))
|
||||
if arg is not None:
|
||||
inner = arg
|
||||
item['found'] = True
|
||||
except AttributeError:
|
||||
pass # arg not provided on command line
|
||||
if not item['found']:
|
||||
pass
|
||||
|
||||
# Command-line overrides config file
|
||||
arg_name = '%s.%s' % (section, name)
|
||||
arg_value = getattr(self.args, arg_name, None)
|
||||
if arg_value is not None:
|
||||
value = arg_value
|
||||
found = True
|
||||
|
||||
# Handle missing required items
|
||||
if not found:
|
||||
if item['required']:
|
||||
sys.stderr.write('error: required config item "%s" not found in section "%s" of "%s"!\n'%(item['name'], item['section'], self.ini))
|
||||
sys.stderr.write(
|
||||
'error: required config item "%s" not found in section "%s" of "%s"\n'
|
||||
% (name, section, self.ini)
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
sys.stderr.write('warning: assigned default value of "%s" to "%s.%s"\n'%(str(item['default']), item['section'], item['name']))
|
||||
setattr(obj, item['name'], inner)
|
||||
sys.stderr.write(
|
||||
'warning: assigned default value of "%s" to "%s.%s"\n'
|
||||
% (item['default'], section, name)
|
||||
)
|
||||
|
||||
setattr(obj, name, value)
|
||||
|
||||
43
dbs.py
43
dbs.py
@@ -1,7 +1,13 @@
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Database table creation and insertion utilities."""
|
||||
|
||||
import time
|
||||
from misc import _log
|
||||
|
||||
|
||||
def create_table_if_not_exists(sqlite, dbname):
|
||||
"""Create database table with indexes if it doesn't exist."""
|
||||
if dbname == 'proxylist':
|
||||
sqlite.execute("""CREATE TABLE IF NOT EXISTS proxylist (
|
||||
proxy BLOB UNIQUE,
|
||||
@@ -17,7 +23,7 @@ def create_table_if_not_exists(sqlite, dbname):
|
||||
port INT,
|
||||
consecutive_success INT,
|
||||
total_duration INT)""")
|
||||
# indexes for common query patterns
|
||||
# Indexes for common query patterns
|
||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
|
||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
|
||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_proto ON proxylist(proto)')
|
||||
@@ -31,31 +37,44 @@ def create_table_if_not_exists(sqlite, dbname):
|
||||
stale_count INT,
|
||||
retrievals INT,
|
||||
proxies_added INT,
|
||||
added INT
|
||||
)""")
|
||||
# indexes for common query patterns
|
||||
added INT)""")
|
||||
# Indexes for common query patterns
|
||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
|
||||
sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')
|
||||
|
||||
sqlite.commit()
|
||||
|
||||
|
||||
def insert_proxies(proxydb, proxies, url):
|
||||
if not proxies: return
|
||||
"""Insert new proxies into database."""
|
||||
if not proxies:
|
||||
return
|
||||
timestamp = int(time.time())
|
||||
rows = []
|
||||
for p in proxies:
|
||||
ip, port = p.split(':')
|
||||
rows.append((timestamp,p,ip,port,3,0,0,0,0,0))
|
||||
proxydb.executemany('INSERT OR IGNORE INTO proxylist (added,proxy,ip,port,failed,tested,success_count,total_duration,mitm,consecutive_success) VALUES (?,?,?,?,?,?,?,?,?,?)', rows)
|
||||
rows.append((timestamp, p, ip, port, 3, 0, 0, 0, 0, 0))
|
||||
proxydb.executemany(
|
||||
'INSERT OR IGNORE INTO proxylist '
|
||||
'(added,proxy,ip,port,failed,tested,success_count,total_duration,mitm,consecutive_success) '
|
||||
'VALUES (?,?,?,?,?,?,?,?,?,?)',
|
||||
rows
|
||||
)
|
||||
proxydb.commit()
|
||||
_log('+%d proxy/ies from %s' % (len(proxies), url), 'added')
|
||||
|
||||
|
||||
def insert_urls(urls, search, sqlite):
|
||||
if not urls: return
|
||||
time_now = int(time.time())
|
||||
rows = [ (time_now,u,0,1,0,0,0) for u in urls ]
|
||||
sqlite.executemany('INSERT OR IGNORE INTO uris (added,url,check_time,error,stale_count,retrievals,proxies_added) values(?,?,?,?,?,?,?)', rows)
|
||||
"""Insert new URLs into database."""
|
||||
if not urls:
|
||||
return
|
||||
timestamp = int(time.time())
|
||||
rows = [(timestamp, u, 0, 1, 0, 0, 0) for u in urls]
|
||||
sqlite.executemany(
|
||||
'INSERT OR IGNORE INTO uris '
|
||||
'(added,url,check_time,error,stale_count,retrievals,proxies_added) '
|
||||
'VALUES (?,?,?,?,?,?,?)',
|
||||
rows
|
||||
)
|
||||
sqlite.commit()
|
||||
_log('+%d url(s) from %s' % (len(urls), search), 'added')
|
||||
|
||||
|
||||
56
mysqlite.py
56
mysqlite.py
@@ -1,9 +1,29 @@
|
||||
import time, random, sys
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
"""SQLite wrapper with retry logic and WAL mode."""
|
||||
|
||||
import time
|
||||
import random
|
||||
import sys
|
||||
import sqlite3
|
||||
|
||||
class mysqlite:
|
||||
|
||||
class mysqlite(object):
|
||||
"""SQLite connection wrapper with automatic retry on lock."""
|
||||
|
||||
def __init__(self, database, factory=None):
|
||||
self.handle = sqlite3.connect(database)
|
||||
if factory is not None:
|
||||
self.handle.text_factory = factory
|
||||
self.cursor = self.handle.cursor()
|
||||
self.dbname = database
|
||||
# Enable WAL mode for better concurrency
|
||||
self.cursor.execute('PRAGMA journal_mode=WAL')
|
||||
self.cursor.execute('PRAGMA synchronous=NORMAL')
|
||||
|
||||
def _try_op(self, op, query, args=None, rmin=1.5, rmax=7.0):
|
||||
while 1:
|
||||
"""Execute operation with retry on database lock."""
|
||||
while True:
|
||||
try:
|
||||
if query is None:
|
||||
return op()
|
||||
@@ -12,33 +32,31 @@ class mysqlite:
|
||||
else:
|
||||
return op(query, args)
|
||||
except sqlite3.OperationalError as e:
|
||||
if e.message == 'database is locked':
|
||||
print "zzZzzZZ: db is locked (%s)"%self.dbname
|
||||
err_msg = str(e)
|
||||
if 'database is locked' in err_msg:
|
||||
sys.stderr.write('zzZzzZZ: db is locked (%s)\n' % self.dbname)
|
||||
time.sleep(random.uniform(rmin, rmax))
|
||||
continue
|
||||
else:
|
||||
print '%s\nquery: %s\nargs: %s' % (str(sys.exc_info()), str(query), str(args))
|
||||
raise e
|
||||
sys.stderr.write('%s\nquery: %s\nargs: %s\n' % (
|
||||
str(sys.exc_info()), str(query), str(args)))
|
||||
raise
|
||||
|
||||
def execute(self, query, args = None, rmin=1.5, rmax=7.0):
|
||||
def execute(self, query, args=None, rmin=1.5, rmax=7.0):
|
||||
"""Execute a single query with retry."""
|
||||
return self._try_op(self.cursor.execute, query, args, rmin, rmax)
|
||||
|
||||
def executemany(self, query, args, rmin=1.5, rmax=7.0):
|
||||
while len(args):
|
||||
self._try_op(self.cursor.executemany, query, args[:500], rmin, rmax)
|
||||
"""Execute query for multiple argument sets, batched."""
|
||||
while args:
|
||||
batch = args[:500]
|
||||
self._try_op(self.cursor.executemany, query, batch, rmin, rmax)
|
||||
args = args[500:]
|
||||
|
||||
def commit(self, rmin=1.5, rmax=7.0):
|
||||
"""Commit transaction with retry."""
|
||||
return self._try_op(self.handle.commit, None, None, rmin, rmax)
|
||||
|
||||
def close(self):
|
||||
"""Close database connection."""
|
||||
self.handle.close()
|
||||
|
||||
def __init__(self, database, factory = None):
|
||||
self.handle = sqlite3.connect(database)
|
||||
if factory: self.handle.text_factory = factory
|
||||
self.cursor = self.handle.cursor()
|
||||
self.dbname = database
|
||||
# enable WAL mode for better concurrency
|
||||
self.cursor.execute('PRAGMA journal_mode=WAL')
|
||||
self.cursor.execute('PRAGMA synchronous=NORMAL')
|
||||
|
||||
@@ -1,9 +1,16 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import threading
|
||||
import time, random, string, re, copy
|
||||
import Queue
|
||||
import time
|
||||
import random
|
||||
import string
|
||||
import re
|
||||
import heapq
|
||||
|
||||
try:
|
||||
import Queue
|
||||
except ImportError:
|
||||
import queue as Queue
|
||||
try:
|
||||
import IP2Location
|
||||
import os
|
||||
@@ -22,7 +29,7 @@ import connection_pool
|
||||
config = Config()
|
||||
|
||||
_run_standalone = False
|
||||
cached_dns = dict()
|
||||
cached_dns = {}
|
||||
|
||||
regexes = {
|
||||
'www.facebook.com': 'X-FB-Debug',
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python2
|
||||
# HTML parsing with optional BeautifulSoup or stdlib fallback
|
||||
# -*- coding: utf-8 -*-
|
||||
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
import sys
|
||||
@@ -14,7 +15,9 @@ except ImportError:
|
||||
_bs4_available = False
|
||||
|
||||
|
||||
class Tag():
|
||||
class Tag(object):
|
||||
"""Simple tag representation for stdlib parser."""
|
||||
|
||||
def __init__(self, name, attrs):
|
||||
self.name = name
|
||||
self.attrs = dict(attrs)
|
||||
@@ -26,12 +29,15 @@ class Tag():
|
||||
return self.attrs.get(key, default)
|
||||
|
||||
|
||||
class SoupResult():
|
||||
class SoupResult(object):
|
||||
"""BeautifulSoup-like result wrapper for stdlib parser."""
|
||||
|
||||
def __init__(self, tags):
|
||||
self._tags = tags
|
||||
self.body = self
|
||||
|
||||
def find_all(self, tag_name, **kwargs):
|
||||
"""Find all tags matching criteria."""
|
||||
results = []
|
||||
for tag in self._tags:
|
||||
if tag.name != tag_name:
|
||||
@@ -46,6 +52,8 @@ class SoupResult():
|
||||
|
||||
|
||||
class LinkExtractor(HTMLParser):
|
||||
"""Extract tags from HTML using stdlib."""
|
||||
|
||||
def __init__(self):
|
||||
HTMLParser.__init__(self)
|
||||
self.tags = []
|
||||
@@ -58,15 +66,17 @@ class LinkExtractor(HTMLParser):
|
||||
|
||||
|
||||
def _parse_stdlib(html):
|
||||
"""Parse HTML using stdlib HTMLParser."""
|
||||
parser = LinkExtractor()
|
||||
try:
|
||||
parser.feed(html)
|
||||
except Exception:
|
||||
pass # malformed HTML, return partial results
|
||||
pass # Malformed HTML, return partial results
|
||||
return SoupResult(parser.tags)
|
||||
|
||||
|
||||
def _parse_bs4(html):
|
||||
"""Parse HTML using BeautifulSoup."""
|
||||
try:
|
||||
return BeautifulSoup(html, 'lxml')
|
||||
except (FeatureNotFound, Exception):
|
||||
@@ -74,6 +84,7 @@ def _parse_bs4(html):
|
||||
|
||||
|
||||
def set_nobs(enabled):
|
||||
"""Disable BeautifulSoup and use stdlib instead."""
|
||||
global _use_bs4
|
||||
_use_bs4 = not enabled
|
||||
if enabled and _bs4_available:
|
||||
@@ -83,12 +94,13 @@ def set_nobs(enabled):
|
||||
|
||||
|
||||
def soupify(html, nohtml=False):
|
||||
htm = html if nohtml else '<html><body>%s</body></html>' % (html)
|
||||
"""Parse HTML content, returning BeautifulSoup-like object."""
|
||||
htm = html if nohtml else '<html><body>%s</body></html>' % html
|
||||
if _use_bs4 and _bs4_available:
|
||||
return _parse_bs4(htm)
|
||||
else:
|
||||
return _parse_stdlib(htm)
|
||||
|
||||
|
||||
def is_available():
|
||||
"""Check if BeautifulSoup is available."""
|
||||
return _bs4_available
|
||||
|
||||
Reference in New Issue
Block a user