style: normalize indentation and improve code style
- convert tabs to 4-space indentation
- add docstrings to modules and classes
- remove unused import (copy)
- use explicit object inheritance
- use 'while True' over 'while 1'
- use 'while args' over 'while len(args)'
- use '{}' over 'dict()'
- consistent string formatting
- Python 2/3 compatible Queue import
This commit is contained in:
164
comboparse.py
164
comboparse.py
@@ -1,74 +1,110 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Combined config file and argument parser."""
|
||||||
|
|
||||||
from ConfigParser import SafeConfigParser, NoOptionError
|
from ConfigParser import SafeConfigParser, NoOptionError
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
class _Dummy():
|
|
||||||
pass
|
class _Dummy(object):
|
||||||
|
"""Placeholder for config sections."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ComboParser(object):
    """Parse configuration from an INI file and command-line arguments.

    Items are registered with :meth:`add_item`; :meth:`load` reads the INI
    file and the command line, with command-line values overriding the file.
    Each section becomes an attribute (a ``_Dummy`` namespace) on the parser.
    """

    def __init__(self, ini):
        # ini: path of the INI file to read on load()
        self.items = []
        self.cparser = SafeConfigParser()
        self.aparser = ArgumentParser()
        self.ini = ini
        self.loaded = False
        self.args = None

    def add_item(self, section, name, type, default, desc, required):
        """Register one configuration item and its matching CLI option.

        section/name -- INI location; also exposed as self.<section>.<name>
        type         -- one of bool/int/float/str, used for INI coercion
        default      -- value used when the item is found nowhere
        desc         -- help text for the CLI option
        required     -- if True, load() aborts when the item is missing
        """
        def str2bool(val):
            # argparse 'type' callable: map common truthy strings to bool
            return val.lower() in ('true', '1', 'yes')

        self.items.append({
            'section': section,
            'name': name,
            'type': type,
            'default': default,
            'required': required,
        })
        self.aparser.add_argument(
            '--%s.%s' % (section, name),
            help='%s (default: %s)' % (desc, default),
            type=type if type is not bool else str2bool,
            default=None,
            required=False
        )

    def load(self):
        """Resolve every registered item from file and command line.

        Idempotent: subsequent calls return immediately.  Exits the process
        with status 1 when a required item cannot be resolved.
        """
        if self.loaded:
            return
        self.loaded = True

        try:
            self.cparser.read(self.ini)
        except Exception:
            pass  # Config file missing or unreadable, use defaults

        self.args = self.aparser.parse_args()

        for item in self.items:
            section = item['section']
            name = item['name']

            # Ensure the section namespace object exists
            if not hasattr(self, section):
                setattr(self, section, _Dummy())
            obj = getattr(self, section)

            # Start from the declared default
            value = item['default']
            found = False

            # Read from the config file if present.  Guarding with
            # has_section/has_option (instead of catching NoOptionError)
            # also covers a missing [section], which would otherwise
            # raise an uncaught NoSectionError and crash load().
            if self.cparser.has_section(section) and self.cparser.has_option(section, name):
                if item['type'] is bool:
                    value = self.cparser.getboolean(section, name)
                elif item['type'] is float:
                    value = self.cparser.getfloat(section, name)
                elif item['type'] is int:
                    value = self.cparser.getint(section, name)
                elif item['type'] is str:
                    value = self.cparser.get(section, name)
                found = True

            # Command line overrides the config file
            arg_name = '%s.%s' % (section, name)
            arg_value = getattr(self.args, arg_name, None)
            if arg_value is not None:
                value = arg_value
                found = True

            # Handle items found in neither source
            if not found:
                if item['required']:
                    sys.stderr.write(
                        'error: required config item "%s" not found in section "%s" of "%s"\n'
                        % (name, section, self.ini)
                    )
                    sys.exit(1)
                else:
                    sys.stderr.write(
                        'warning: assigned default value of "%s" to "%s.%s"\n'
                        % (item['default'], section, name)
                    )

            setattr(obj, name, value)
|
||||||
|
|||||||
278
config.py
278
config.py
@@ -3,159 +3,159 @@ from misc import set_log_level, _log
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
class Config(ComboParser):
    """Application configuration: declares every config item and CLI flag."""

    def load(self):
        """Load config, then derive the tor host list and server list."""
        super(Config, self).load()
        self.torhosts = [str(i).strip() for i in self.common.tor_hosts.split(',')]
        with open(self.watchd.source_file, 'r') as handle:
            self.servers = [x.strip() for x in handle.readlines() if len(x.strip()) > 0]
        # Apply log level from CLI flags
        if self.args.quiet:
            set_log_level('warn')
        elif self.args.verbose:
            set_log_level('debug')

    def validate(self):
        """Validate configuration values. Returns list of errors."""
        errors = []
        warnings = []

        # Port numbers
        if not 1 <= self.httpd.port <= 65535:
            errors.append('httpd.port must be 1-65535, got %d' % self.httpd.port)

        # Timeouts must be positive
        if self.common.timeout_connect <= 0:
            errors.append('common.timeout_connect must be > 0')
        if self.common.timeout_read <= 0:
            errors.append('common.timeout_read must be > 0')
        if self.watchd.timeout <= 0:
            errors.append('watchd.timeout must be > 0')
        if self.ppf.timeout <= 0:
            errors.append('ppf.timeout must be > 0')

        # Thread counts
        if self.watchd.threads < 1:
            errors.append('watchd.threads must be >= 1')
        if self.ppf.threads < 1:
            errors.append('ppf.threads must be >= 1')

        # max_fail counts
        if self.watchd.max_fail < 1:
            errors.append('watchd.max_fail must be >= 1')
        if self.ppf.max_fail < 1:
            errors.append('ppf.max_fail must be >= 1')

        # Search engine names
        valid_engines = {'duckduckgo', 'startpage', 'brave', 'ecosia',
                         'mojeek', 'qwant', 'yandex', 'github', 'gitlab',
                         'codeberg', 'gitea', 'searx'}
        configured = [e.strip().lower() for e in self.scraper.engines.split(',')]
        for eng in configured:
            if eng and eng not in valid_engines:
                warnings.append('unknown engine: %s' % eng)

        # Source file should exist
        if not os.path.exists(self.watchd.source_file):
            warnings.append('source_file not found: %s' % self.watchd.source_file)

        # Database directories must be writable
        for db in (self.watchd.database, self.ppf.database):
            db_dir = os.path.dirname(db) or '.'
            if not os.access(db_dir, os.W_OK):
                errors.append('database directory not writable: %s' % db_dir)

        # Warnings are only logged; errors are returned to the caller
        for w in warnings:
            _log(w, 'warn')

        return errors

    def __init__(self):
        super(Config, self).__init__('config.ini')

        section = 'common'
        self.add_item(section, 'tor_hosts', str, '127.0.0.1:9050', 'comma-separated list of tor proxy address(es)', True)
        self.add_item(section, 'timeout_connect', int, 10, 'connection timeout in seconds (default: 10)', False)
        self.add_item(section, 'timeout_read', int, 15, 'read timeout in seconds (default: 15)', False)

        section = 'watchd'
        self.add_item(section, 'outage_threshold', float, 4.0, 'mininum success percentage required to not drop check results', False)
        self.add_item(section, 'max_fail', int, 5, 'number of fails after which a proxy is considered dead', False)
        self.add_item(section, 'threads', int, 10, 'number of threads watchd uses to check proxies', True)
        self.add_item(section, 'timeout', int, 15, 'timeout for blocking operations (connect/recv/...) for proxy checks in seconds', False)
        self.add_item(section, 'submit_after', int, 200, 'min. number of tested proxies for DB write', False)
        self.add_item(section, 'debug', bool, False, 'whether to print additional debug info', False)
        self.add_item(section, 'use_ssl', int, 0, 'whether to use SSL and port 6697 to connect to targets (slower)', False)
        self.add_item(section, 'checktime', int, 1800, 'base checking interval for proxies in db in seconds', False)
        self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for proxies in db in seconds per experienced failure', False)
        self.add_item(section, 'database', str, 'websites.sqlite', 'filename of database', True)
        self.add_item(section, 'oldies', bool, False, 're-test old proxies as well ? (default: False)', False)
        self.add_item(section, 'oldies_checktime', int, 43200, 'base checking interval for *old* proxies in seconds (default: 43200)', False)
        self.add_item(section, 'oldies_multi', int, 10, 'fetch threads*multi rows when testing oldies (default: 10)', False)
        self.add_item(section, 'source_file', str, 'servers.txt', 'server/url list to read from (default: servers.txt)', False)
        self.add_item(section, 'stale_days', int, 30, 'days after which dead proxies are removed (default: 30)', False)
        self.add_item(section, 'stats_interval', int, 300, 'seconds between status reports (default: 300)', False)
        self.add_item(section, 'tor_safeguard', bool, True, 'enable tor safeguard (default: True)', False)
        self.add_item(section, 'checktype', str, 'http', 'check type (irc or http)', False)

        section = 'httpd'
        self.add_item(section, 'listenip', str, '127.0.0.1', 'address for the httpd to listen to (default: 127.0.0.1)', True)
        self.add_item(section, 'port', int, 8081, 'port for the httpd to listen to (default: 8081)', True)
        self.add_item(section, 'enabled', bool, False, 'start httpd (default: False)', True)

        section = 'ppf'
        self.add_item(section, 'debug', bool, False, 'whether to print additional debug info', False)
        self.add_item(section, 'search', bool, True, 'whether to use searx search engine to find new proxy lists', False)
        self.add_item(section, 'timeout', float, 15, 'timeout for blocking operations (connect/recv/...) for proxy checks in seconds', False)
        self.add_item(section, 'http_retries', int, 1, 'number of retries for http connects', False)
        self.add_item(section, 'threads', int, 1, 'number of threads to run (default: 1)', False)
        self.add_item(section, 'checktime', int, 3600, 'base checking interval for urls in db in seconds', False)
        self.add_item(section, 'perfail_checktime', int, 3600, 'additional checking interval for urls in db in seconds per resultless check', False)
        self.add_item(section, 'max_fail', int, 5, 'number of fails after which an url is considered dead', False)
        self.add_item(section, 'database', str, 'proxies.sqlite', 'filename of database', True)
        self.add_item(section, 'extract_samedomain', bool, False, 'extract only url from same domains? (default: False)', False)

        section = 'scraper'
        self.add_item(section, 'debug', bool, False, 'scraper: whether to print additional debug info', False)
        self.add_item(section, 'query', str, 'psw', 'build query using Proxies, Search, Websites', False)
        self.add_item(section, 'backoff_base', int, 30, 'base backoff delay in seconds (default: 30)', False)
        self.add_item(section, 'backoff_max', int, 3600, 'max backoff delay in seconds (default: 3600)', False)
        self.add_item(section, 'fail_threshold', int, 2, 'consecutive failures before backoff (default: 2)', False)
        self.add_item(section, 'engines', str, 'searx,duckduckgo,github', 'comma-separated search engines (default: searx,duckduckgo,github)', False)
        self.add_item(section, 'max_pages', int, 5, 'max pages to fetch per engine query (default: 5)', False)
        self.add_item(section, 'libretranslate_url', str, 'https://lt.mymx.me/translate', 'LibreTranslate API URL (default: https://lt.mymx.me/translate)', False)
        self.add_item(section, 'libretranslate_enabled', bool, True, 'enable LibreTranslate for dynamic translations (default: True)', False)

        # CLI-only flags (no INI counterpart)
        self.aparser.add_argument("--file", help="import a single file containing proxy addrs", type=str, default='', required=False)
        self.aparser.add_argument("--nobs", help="disable BeautifulSoup, use stdlib HTMLParser", action='store_true', default=False)
        self.aparser.add_argument("-q", "--quiet", help="suppress info messages, show warnings and errors only", action='store_true', default=False)
        self.aparser.add_argument("-v", "--verbose", help="show debug messages", action='store_true', default=False)

        section = 'flood'
        self.add_item(section, 'server', str, None, 'irc server address', False)
        self.add_item(section, 'target', str, None, 'target to flood', False)
        self.add_item(section, 'nickserv', str, 'nickserv', "nickserv's nickname", False)
        self.add_item(section, 'message', str, None, 'message', False)
        self.add_item(section, 'threads', int, 1, '# of threads', False)
        self.add_item(section, 'register', int, 0, 'register nickname when required', False)

        self.add_item(section, 'wait', int, 0, 'wait prior sending messages', False)
        self.add_item(section, 'once', int, 0, 'quit as soon as possible', False)
        self.add_item(section, 'hilight', int, 0, 'try to hilight all nicks?', False)
        self.add_item(section, 'waitonsuccess', int, 0, 'wait for a while on success', False)
        self.add_item(section, 'debug', int, 0, 'use debug', False)
        self.add_item(section, 'duration', int, 180, 'maximum time to run', False)
        # NOTE(review): type str with an int default (14400) looks inconsistent -- confirm
        self.add_item(section, 'delay', str, 14400, 'if waitonsuccess, wait for $delay before sending other bots', False)
        self.add_item(section, 'nick', str, None, 'specify nickname to use', False)
        self.add_item(section, 'use_ssl', int, 2, 'Use ssl? (0: false, 1: true, 2: random)', False)
        self.add_item(section, 'cycle', int, 0, 'cycle flood', False)
        self.add_item(section, 'change_nick', int, 0, 'Change nick between messages (useful when flooding privates)', False)
        self.add_item(section, 'use_timeout', int, 0, 'make connexions quit through timeout', False)
        self.add_item(section, 'clones', int, 1, 'Number of connexion repeat to run', False)
        self.add_item(section, 'query', bool, False, 'also flood in query', False)
        self.add_item(section, 'noquerybefore', int, 10, 'do not send query before x secs being connected', False)
        self.add_item(section, 'oper', bool, False, 'piss of opers', False)
        self.add_item(section, 'whois', bool, False, 'piss of opers with /whois', False)
        self.add_item(section, 'modex', bool, False, 'make +/- x mode', False)
        self.add_item(section, 'os', bool, False, 'piss off opers with /os', False)
        self.add_item(section, 'file', str, None, 'read flood content from file', False)
        self.add_item(section, 'failid', str, None, 'generate nickserv warn. about IDENTIFY attempts', False)
|
||||||
|
|||||||
119
dbs.py
119
dbs.py
@@ -1,61 +1,80 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""Database table creation and insertion utilities."""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
from misc import _log
|
from misc import _log
|
||||||
|
|
||||||
|
|
||||||
def create_table_if_not_exists(sqlite, dbname):
    """Create database table with indexes if it doesn't exist.

    sqlite -- an open sqlite3 connection
    dbname -- 'proxylist' or 'uris'; any other value only commits
    """
    if dbname == 'proxylist':
        sqlite.execute("""CREATE TABLE IF NOT EXISTS proxylist (
                          proxy BLOB UNIQUE,
                          country BLOB,
                          added INT,
                          failed INT,
                          tested INT,
                          dronebl INT,
                          proto TEXT,
                          mitm INT,
                          success_count INT,
                          ip TEXT,
                          port INT,
                          consecutive_success INT,
                          total_duration INT)""")
        # Indexes for common query patterns
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_failed ON proxylist(failed)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_tested ON proxylist(tested)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_proxylist_proto ON proxylist(proto)')
    elif dbname == 'uris':
        sqlite.execute("""CREATE TABLE IF NOT EXISTS uris (
                          url TEXT UNIQUE,
                          content_type TEXT,
                          check_time INT,
                          error INT,
                          stale_count INT,
                          retrievals INT,
                          proxies_added INT,
                          added INT)""")
        # Indexes for common query patterns
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_error ON uris(error)')
        sqlite.execute('CREATE INDEX IF NOT EXISTS idx_uris_checktime ON uris(check_time)')
    sqlite.commit()
|
|
||||||
|
|
||||||
def insert_proxies(proxydb, proxies, url):
    """Insert newly discovered proxies into the proxylist table.

    proxydb -- sqlite3 connection to the proxy database
    proxies -- iterable of 'host:port' strings; empty input is a no-op
    url     -- source URL, used only for logging
    """
    if not proxies:
        return
    timestamp = int(time.time())
    rows = []
    for p in proxies:
        # rsplit keeps the port intact even if the host part itself
        # contains ':' (e.g. an IPv6 literal); plain split would raise
        ip, port = p.rsplit(':', 1)
        rows.append((timestamp, p, ip, port, 3, 0, 0, 0, 0, 0))
    proxydb.executemany(
        'INSERT OR IGNORE INTO proxylist '
        '(added,proxy,ip,port,failed,tested,success_count,total_duration,mitm,consecutive_success) '
        'VALUES (?,?,?,?,?,?,?,?,?,?)',
        rows
    )
    proxydb.commit()
    _log('+%d proxy/ies from %s' % (len(proxies), url), 'added')
|
||||||
|
|
||||||
|
|
||||||
def insert_urls(urls, search, sqlite):
    """Insert new URLs into database.

    urls   -- iterable of URL strings; empty input is a no-op
    search -- originating search query, used only for logging
    sqlite -- sqlite3 connection holding the 'uris' table
    """
    if not urls:
        return
    now = int(time.time())
    # error starts at 1 so a URL must succeed once to be considered alive
    rows = [(now, u, 0, 1, 0, 0, 0) for u in urls]
    sqlite.executemany(
        'INSERT OR IGNORE INTO uris '
        '(added,url,check_time,error,stale_count,retrievals,proxies_added) '
        'VALUES (?,?,?,?,?,?,?)',
        rows
    )
    sqlite.commit()
    _log('+%d url(s) from %s' % (len(urls), search), 'added')
|
||||||
|
|||||||
248
fetch.py
248
fetch.py
@@ -6,169 +6,169 @@ from misc import _log
|
|||||||
|
|
||||||
config = None
|
config = None
|
||||||
def set_config(cfg):
|
def set_config(cfg):
|
||||||
global config
|
global config
|
||||||
config = cfg
|
config = cfg
|
||||||
|
|
||||||
cleanhtml_re = [
|
cleanhtml_re = [
|
||||||
re.compile('<.*?>'),
|
re.compile('<.*?>'),
|
||||||
re.compile('\s+'),
|
re.compile('\s+'),
|
||||||
re.compile('::+'),
|
re.compile('::+'),
|
||||||
]
|
]
|
||||||
def cleanhtml(raw_html):
|
def cleanhtml(raw_html):
|
||||||
html = raw_html.replace(' ', ' ')
|
html = raw_html.replace(' ', ' ')
|
||||||
html = re.sub(cleanhtml_re[0], ':', html)
|
html = re.sub(cleanhtml_re[0], ':', html)
|
||||||
html = re.sub(cleanhtml_re[1], ':', html)
|
html = re.sub(cleanhtml_re[1], ':', html)
|
||||||
html = re.sub(cleanhtml_re[2], ':', html)
|
html = re.sub(cleanhtml_re[2], ':', html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def fetch_contents(url, head=False, proxy=None):
|
def fetch_contents(url, head=False, proxy=None):
|
||||||
content = None
|
content = None
|
||||||
if proxy is not None and len(proxy):
|
if proxy is not None and len(proxy):
|
||||||
for p in proxy:
|
for p in proxy:
|
||||||
content = _fetch_contents(url, head=head, proxy=p)
|
content = _fetch_contents(url, head=head, proxy=p)
|
||||||
if content is not None: break
|
if content is not None: break
|
||||||
|
|
||||||
else:
|
else:
|
||||||
content = _fetch_contents(url, head=head)
|
content = _fetch_contents(url, head=head)
|
||||||
|
|
||||||
return content if content is not None else ''
|
return content if content is not None else ''
|
||||||
|
|
||||||
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
retry_messages = ('Engines cannot retrieve results', 'Rate limit exceeded')
|
||||||
def _fetch_contents(url, head = False, proxy=None):
|
def _fetch_contents(url, head = False, proxy=None):
|
||||||
host, port, ssl, uri = _parse_url(url)
|
host, port, ssl, uri = _parse_url(url)
|
||||||
headers=[
|
headers=[
|
||||||
'Accept-Language: en-US,en;q=0.8',
|
'Accept-Language: en-US,en;q=0.8',
|
||||||
'Cache-Control: max-age=0',
|
'Cache-Control: max-age=0',
|
||||||
]
|
]
|
||||||
if config.ppf.debug:
|
if config.ppf.debug:
|
||||||
_log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
|
_log("connecting to %s... (header: %s)" % (url, str(head)), "debug")
|
||||||
while True:
|
while True:
|
||||||
proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
|
proxies = [rocksock.RocksockProxyFromURL('socks4://%s' % random.choice( config.torhosts ))]
|
||||||
if proxy: proxies.append( rocksock.RocksockProxyFromURL(proxy))
|
if proxy: proxies.append( rocksock.RocksockProxyFromURL(proxy))
|
||||||
|
|
||||||
http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
|
http = RsHttp(host,ssl=ssl,port=port, keep_alive=True, timeout=config.ppf.timeout, max_tries=config.ppf.http_retries, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0')
|
||||||
if not http.connect():
|
if not http.connect():
|
||||||
_log("failed to connect to %s"%url, "ppf")
|
_log("failed to connect to %s"%url, "ppf")
|
||||||
e = http.get_last_rocksock_exception()
|
e = http.get_last_rocksock_exception()
|
||||||
if not e:
|
if not e:
|
||||||
return None
|
return None
|
||||||
et = e.get_errortype()
|
et = e.get_errortype()
|
||||||
ee = e.get_error()
|
ee = e.get_error()
|
||||||
ef = e.get_failedproxy()
|
ef = e.get_failedproxy()
|
||||||
if et == rocksock.RS_ET_OWN and \
|
if et == rocksock.RS_ET_OWN and \
|
||||||
ee == rocksock.RS_E_TARGET_CONN_REFUSED \
|
ee == rocksock.RS_E_TARGET_CONN_REFUSED \
|
||||||
and ef == 0:
|
and ef == 0:
|
||||||
_log("could not connect to proxy 0 - check your connection", "error")
|
_log("could not connect to proxy 0 - check your connection", "error")
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
continue
|
continue
|
||||||
return None
|
return None
|
||||||
break
|
break
|
||||||
|
|
||||||
## only request header
|
## only request header
|
||||||
if head:
|
if head:
|
||||||
hdr = http.head(uri, headers)
|
hdr = http.head(uri, headers)
|
||||||
return hdr
|
return hdr
|
||||||
|
|
||||||
hdr, res = http.get(uri, headers)
|
hdr, res = http.get(uri, headers)
|
||||||
res = res.encode('utf-8') if isinstance(res, unicode) else res
|
res = res.encode('utf-8') if isinstance(res, unicode) else res
|
||||||
for retry_message in retry_messages:
|
for retry_message in retry_messages:
|
||||||
if retry_message in res: return None
|
if retry_message in res: return None
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def valid_port(port):
|
def valid_port(port):
|
||||||
return port > 0 and port < 65535
|
return port > 0 and port < 65535
|
||||||
|
|
||||||
def is_usable_proxy(proxy):
|
def is_usable_proxy(proxy):
|
||||||
ip, port = proxy.split(':')
|
ip, port = proxy.split(':')
|
||||||
if not valid_port(int(port)): return False
|
if not valid_port(int(port)): return False
|
||||||
|
|
||||||
octets = ip.split('.')
|
octets = ip.split('.')
|
||||||
A = int(octets[0])
|
A = int(octets[0])
|
||||||
B = int(octets[1])
|
B = int(octets[1])
|
||||||
C = int(octets[2])
|
C = int(octets[2])
|
||||||
D = int(octets[3])
|
D = int(octets[3])
|
||||||
|
|
||||||
if (A < 1 or A > 254 or \
|
if (A < 1 or A > 254 or \
|
||||||
B > 255 or C > 255 or D > 255) or \
|
B > 255 or C > 255 or D > 255) or \
|
||||||
(A == 10 or A == 127) or \
|
(A == 10 or A == 127) or \
|
||||||
(A == 192 and B == 168) or \
|
(A == 192 and B == 168) or \
|
||||||
(A == 172 and B >= 16 and B <= 31): return False
|
(A == 172 and B >= 16 and B <= 31): return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
_known_proxies = {}
|
_known_proxies = {}
|
||||||
|
|
||||||
def init_known_proxies(proxydb):
|
def init_known_proxies(proxydb):
|
||||||
"""Initialize known proxies cache from database."""
|
"""Initialize known proxies cache from database."""
|
||||||
global _known_proxies
|
global _known_proxies
|
||||||
if _known_proxies:
|
if _known_proxies:
|
||||||
return
|
return
|
||||||
known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
|
known = proxydb.execute('SELECT proxy FROM proxylist').fetchall()
|
||||||
for k in known:
|
for k in known:
|
||||||
_known_proxies[k[0]] = True
|
_known_proxies[k[0]] = True
|
||||||
|
|
||||||
def add_known_proxies(proxies):
|
def add_known_proxies(proxies):
|
||||||
"""Add proxies to known cache."""
|
"""Add proxies to known cache."""
|
||||||
global _known_proxies
|
global _known_proxies
|
||||||
for p in proxies:
|
for p in proxies:
|
||||||
_known_proxies[p] = True
|
_known_proxies[p] = True
|
||||||
|
|
||||||
def is_known_proxy(proxy):
|
def is_known_proxy(proxy):
|
||||||
"""Check if proxy is in known cache."""
|
"""Check if proxy is in known cache."""
|
||||||
return proxy in _known_proxies
|
return proxy in _known_proxies
|
||||||
|
|
||||||
def extract_proxies(content, proxydb=None, filter_known=True):
|
def extract_proxies(content, proxydb=None, filter_known=True):
|
||||||
"""Extract and normalize proxy addresses from content.
|
"""Extract and normalize proxy addresses from content.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
content: HTML/text content to parse
|
content: HTML/text content to parse
|
||||||
proxydb: Database connection for known proxy lookup (optional)
|
proxydb: Database connection for known proxy lookup (optional)
|
||||||
filter_known: If True, filter out known proxies and return new only
|
filter_known: If True, filter out known proxies and return new only
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
If filter_known: (unique_count, new_proxies) tuple
|
If filter_known: (unique_count, new_proxies) tuple
|
||||||
If not filter_known: list of all unique valid proxies
|
If not filter_known: list of all unique valid proxies
|
||||||
"""
|
"""
|
||||||
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
matches = re.findall(r'([0-9]+(?:\.[0-9]+){3}:[0-9]{2,5})[\D$]', cleanhtml(content))
|
||||||
|
|
||||||
uniques_dict = {}
|
uniques_dict = {}
|
||||||
for p in matches:
|
for p in matches:
|
||||||
ip, port = p.split(':')
|
ip, port = p.split(':')
|
||||||
# Normalize IP (remove leading zeros from octets)
|
# Normalize IP (remove leading zeros from octets)
|
||||||
ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
|
ip = '.'.join(str(int(octet)) for octet in ip.split('.'))
|
||||||
# Normalize port (remove leading zeros, handle empty case)
|
# Normalize port (remove leading zeros, handle empty case)
|
||||||
port = int(port.lstrip('0') or '0')
|
port = int(port.lstrip('0') or '0')
|
||||||
p = '%s:%s' % (ip, port)
|
p = '%s:%s' % (ip, port)
|
||||||
uniques_dict[p] = True
|
uniques_dict[p] = True
|
||||||
|
|
||||||
uniques = [p for p in uniques_dict.keys() if is_usable_proxy(p)]
|
uniques = [p for p in uniques_dict.keys() if is_usable_proxy(p)]
|
||||||
|
|
||||||
if not filter_known:
|
if not filter_known:
|
||||||
return uniques
|
return uniques
|
||||||
|
|
||||||
# Initialize known proxies from DB if needed
|
# Initialize known proxies from DB if needed
|
||||||
if proxydb is not None:
|
if proxydb is not None:
|
||||||
init_known_proxies(proxydb)
|
init_known_proxies(proxydb)
|
||||||
|
|
||||||
new = []
|
new = []
|
||||||
for p in uniques:
|
for p in uniques:
|
||||||
if not is_known_proxy(p):
|
if not is_known_proxy(p):
|
||||||
new.append(p)
|
new.append(p)
|
||||||
add_known_proxies([p])
|
add_known_proxies([p])
|
||||||
|
|
||||||
return len(uniques), new
|
return len(uniques), new
|
||||||
|
|
||||||
def extract_urls(content, urls = None, urignore=None):
|
def extract_urls(content, urls = None, urignore=None):
|
||||||
urls = [] if not urls else urls
|
urls = [] if not urls else urls
|
||||||
soup = soupify(content)
|
soup = soupify(content)
|
||||||
for a in soup.body.find_all('a'):
|
for a in soup.body.find_all('a'):
|
||||||
if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
|
if not 'rel' in a.attrs or not 'noreferrer' in a.attrs['rel'] or a.attrs['href'] in urls: continue
|
||||||
bad = False
|
bad = False
|
||||||
href = a.attrs['href']
|
href = a.attrs['href']
|
||||||
for i in urignore:
|
for i in urignore:
|
||||||
if re.findall(i, href):
|
if re.findall(i, href):
|
||||||
bad = True
|
bad = True
|
||||||
break
|
break
|
||||||
if not bad: urls.append(href)
|
if not bad: urls.append(href)
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|||||||
92
mysqlite.py
92
mysqlite.py
@@ -1,44 +1,62 @@
|
|||||||
import time, random, sys
|
#!/usr/bin/env python2
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""SQLite wrapper with retry logic and WAL mode."""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
||||||
class mysqlite:
|
|
||||||
def _try_op(self, op, query, args=None, rmin=1.5, rmax=7.0):
|
|
||||||
while 1:
|
|
||||||
try:
|
|
||||||
if query is None:
|
|
||||||
return op()
|
|
||||||
elif args is None:
|
|
||||||
return op(query)
|
|
||||||
else:
|
|
||||||
return op(query, args)
|
|
||||||
except sqlite3.OperationalError as e:
|
|
||||||
if e.message == 'database is locked':
|
|
||||||
print "zzZzzZZ: db is locked (%s)"%self.dbname
|
|
||||||
time.sleep(random.uniform(rmin, rmax))
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
print '%s\nquery: %s\nargs: %s' % (str(sys.exc_info()), str(query), str(args))
|
|
||||||
raise e
|
|
||||||
|
|
||||||
def execute(self, query, args = None, rmin=1.5, rmax=7.0):
|
class mysqlite(object):
|
||||||
return self._try_op(self.cursor.execute, query, args, rmin, rmax)
|
"""SQLite connection wrapper with automatic retry on lock."""
|
||||||
|
|
||||||
def executemany(self, query, args, rmin=1.5, rmax=7.0):
|
def __init__(self, database, factory=None):
|
||||||
while len(args):
|
self.handle = sqlite3.connect(database)
|
||||||
self._try_op(self.cursor.executemany, query, args[:500], rmin, rmax)
|
if factory is not None:
|
||||||
args = args[500:]
|
self.handle.text_factory = factory
|
||||||
|
self.cursor = self.handle.cursor()
|
||||||
|
self.dbname = database
|
||||||
|
# Enable WAL mode for better concurrency
|
||||||
|
self.cursor.execute('PRAGMA journal_mode=WAL')
|
||||||
|
self.cursor.execute('PRAGMA synchronous=NORMAL')
|
||||||
|
|
||||||
def commit(self, rmin=1.5, rmax=7.0):
|
def _try_op(self, op, query, args=None, rmin=1.5, rmax=7.0):
|
||||||
return self._try_op(self.handle.commit, None, None, rmin, rmax)
|
"""Execute operation with retry on database lock."""
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
if query is None:
|
||||||
|
return op()
|
||||||
|
elif args is None:
|
||||||
|
return op(query)
|
||||||
|
else:
|
||||||
|
return op(query, args)
|
||||||
|
except sqlite3.OperationalError as e:
|
||||||
|
err_msg = str(e)
|
||||||
|
if 'database is locked' in err_msg:
|
||||||
|
sys.stderr.write('zzZzzZZ: db is locked (%s)\n' % self.dbname)
|
||||||
|
time.sleep(random.uniform(rmin, rmax))
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
sys.stderr.write('%s\nquery: %s\nargs: %s\n' % (
|
||||||
|
str(sys.exc_info()), str(query), str(args)))
|
||||||
|
raise
|
||||||
|
|
||||||
def close(self):
|
def execute(self, query, args=None, rmin=1.5, rmax=7.0):
|
||||||
self.handle.close()
|
"""Execute a single query with retry."""
|
||||||
|
return self._try_op(self.cursor.execute, query, args, rmin, rmax)
|
||||||
|
|
||||||
def __init__(self, database, factory = None):
|
def executemany(self, query, args, rmin=1.5, rmax=7.0):
|
||||||
self.handle = sqlite3.connect(database)
|
"""Execute query for multiple argument sets, batched."""
|
||||||
if factory: self.handle.text_factory = factory
|
while args:
|
||||||
self.cursor = self.handle.cursor()
|
batch = args[:500]
|
||||||
self.dbname = database
|
self._try_op(self.cursor.executemany, query, batch, rmin, rmax)
|
||||||
# enable WAL mode for better concurrency
|
args = args[500:]
|
||||||
self.cursor.execute('PRAGMA journal_mode=WAL')
|
|
||||||
self.cursor.execute('PRAGMA synchronous=NORMAL')
|
def commit(self, rmin=1.5, rmax=7.0):
|
||||||
|
"""Commit transaction with retry."""
|
||||||
|
return self._try_op(self.handle.commit, None, None, rmin, rmax)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
"""Close database connection."""
|
||||||
|
self.handle.close()
|
||||||
|
|||||||
372
ppf.py
Executable file → Normal file
372
ppf.py
Executable file → Normal file
@@ -16,231 +16,231 @@ import random
|
|||||||
config = Config()
|
config = Config()
|
||||||
|
|
||||||
def import_from_file(fn, sqlite):
|
def import_from_file(fn, sqlite):
|
||||||
with open(fn, 'r') as f:
|
with open(fn, 'r') as f:
|
||||||
urls = [ url for url in f.read().split('\n') if url ]
|
urls = [ url for url in f.read().split('\n') if url ]
|
||||||
cinc = 0
|
cinc = 0
|
||||||
while True:
|
while True:
|
||||||
chunk = urls[cinc:cinc+200]
|
chunk = urls[cinc:cinc+200]
|
||||||
if chunk: dbs.insert_urls(chunk, 'import.txt', urldb)
|
if chunk: dbs.insert_urls(chunk, 'import.txt', urldb)
|
||||||
else: break
|
else: break
|
||||||
cinc = cinc + 200
|
cinc = cinc + 200
|
||||||
|
|
||||||
|
|
||||||
def get_content_type(url, proxy):
|
def get_content_type(url, proxy):
|
||||||
hdr = fetch.fetch_contents(url, head=True, proxy=proxy)
|
hdr = fetch.fetch_contents(url, head=True, proxy=proxy)
|
||||||
|
|
||||||
for h in hdr.split('\n'):
|
for h in hdr.split('\n'):
|
||||||
if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip()
|
if h.lower().startswith('content-type: '): return h.lower().split(':')[1].strip()
|
||||||
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def is_good_content_type(string):
|
def is_good_content_type(string):
|
||||||
allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ]
|
allowed_ct = [ 'text/html', 'text/plain', 'atom+xml' ]
|
||||||
for ct in allowed_ct:
|
for ct in allowed_ct:
|
||||||
if ct.lower() in string.lower(): return True
|
if ct.lower() in string.lower(): return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def is_bad_url(uri, domain=None, samedomain=False):
|
def is_bad_url(uri, domain=None, samedomain=False):
|
||||||
# if uri needs to be from same domain and domains missmatch
|
# if uri needs to be from same domain and domains missmatch
|
||||||
if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
|
if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
|
||||||
return True
|
return True
|
||||||
for u in urignore:
|
for u in urignore:
|
||||||
if re.findall(u, uri): return True
|
if re.findall(u, uri): return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def extract_urls(html, url):
|
def extract_urls(html, url):
|
||||||
mytime = int(time.time())
|
mytime = int(time.time())
|
||||||
proto = url.split(':')[0]
|
proto = url.split(':')[0]
|
||||||
domain = url.split('/')[2]
|
domain = url.split('/')[2]
|
||||||
urls = []
|
urls = []
|
||||||
|
|
||||||
soup = soupify(html, nohtml=True)
|
soup = soupify(html, nohtml=True)
|
||||||
|
|
||||||
for a in soup.find_all('a', href=True):
|
for a in soup.find_all('a', href=True):
|
||||||
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
|
||||||
item = item.strip()
|
item = item.strip()
|
||||||
|
|
||||||
if item.startswith('www.'):
|
if item.startswith('www.'):
|
||||||
item = 'http://%s' % item
|
item = 'http://%s' % item
|
||||||
elif not item.startswith('http'):
|
elif not item.startswith('http'):
|
||||||
if not item.startswith('/'): item = '/%s' % item
|
if not item.startswith('/'): item = '/%s' % item
|
||||||
item = '%s://%s%s' % (proto,domain,item)
|
item = '%s://%s%s' % (proto,domain,item)
|
||||||
|
|
||||||
elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
|
elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
|
||||||
continue
|
continue
|
||||||
if not item in urls: urls.append(item)
|
if not item in urls: urls.append(item)
|
||||||
|
|
||||||
if urls: dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
|
if urls: dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
|
||||||
|
|
||||||
def import_proxies_from_file(proxydb, fn):
|
def import_proxies_from_file(proxydb, fn):
|
||||||
content = open(fn, 'r').read()
|
content = open(fn, 'r').read()
|
||||||
unique_count, new = fetch.extract_proxies(content, proxydb)
|
unique_count, new = fetch.extract_proxies(content, proxydb)
|
||||||
if new:
|
if new:
|
||||||
dbs.insert_proxies(proxydb, new, fn)
|
dbs.insert_proxies(proxydb, new, fn)
|
||||||
return 0
|
return 0
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
class Leechered(threading.Thread):
|
class Leechered(threading.Thread):
|
||||||
def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
|
def __init__(self, url, stale_count, error, retrievals, proxies_added, content_type, proxy):
|
||||||
self.status = 'nok'
|
self.status = 'nok'
|
||||||
self.proxylist = []
|
self.proxylist = []
|
||||||
self.running = True
|
self.running = True
|
||||||
self.url = url
|
self.url = url
|
||||||
self.stale_count = stale_count
|
self.stale_count = stale_count
|
||||||
self.error = error
|
self.error = error
|
||||||
self.retrievals = retrievals
|
self.retrievals = retrievals
|
||||||
self.proxies_added = proxies_added
|
self.proxies_added = proxies_added
|
||||||
self.content_type = content_type
|
self.content_type = content_type
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
self.execute = ''
|
self.execute = ''
|
||||||
threading.Thread.__init__(self)
|
threading.Thread.__init__(self)
|
||||||
|
|
||||||
def retrieve(self):
|
def retrieve(self):
|
||||||
return self.url, self.proxylist, self.stale_count, self.error, self.retrievals, self.content_type, self.proxies_added, self.execute
|
return self.url, self.proxylist, self.stale_count, self.error, self.retrievals, self.content_type, self.proxies_added, self.execute
|
||||||
def status(self):
|
def status(self):
|
||||||
return self.status
|
return self.status
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.status = 'nok'
|
self.status = 'nok'
|
||||||
|
|
||||||
if not self.content_type: self.content_type = get_content_type(self.url, self.proxy)
|
if not self.content_type: self.content_type = get_content_type(self.url, self.proxy)
|
||||||
|
|
||||||
if is_good_content_type(self.content_type):
|
if is_good_content_type(self.content_type):
|
||||||
try:
|
try:
|
||||||
content = fetch.fetch_contents(self.url, proxy=self.proxy)
|
content = fetch.fetch_contents(self.url, proxy=self.proxy)
|
||||||
except KeyboardInterrupt as e:
|
except KeyboardInterrupt as e:
|
||||||
raise e
|
raise e
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_log('%s: fetch error: %s' % (self.url.split('/')[2], str(e)), 'error')
|
_log('%s: fetch error: %s' % (self.url.split('/')[2], str(e)), 'error')
|
||||||
content = ''
|
content = ''
|
||||||
else:
|
else:
|
||||||
content = ''
|
content = ''
|
||||||
|
|
||||||
unique = fetch.extract_proxies(content, filter_known=False)
|
unique = fetch.extract_proxies(content, filter_known=False)
|
||||||
self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
|
self.proxylist = [ proxy for proxy in unique if not fetch.is_known_proxy(proxy) ]
|
||||||
proxy_count = len(self.proxylist)
|
proxy_count = len(self.proxylist)
|
||||||
|
|
||||||
if self.retrievals == 0: # new site
|
if self.retrievals == 0: # new site
|
||||||
if content and not self.proxylist: # site works but has zero proxy addresses
|
if content and not self.proxylist: # site works but has zero proxy addresses
|
||||||
self.error += 1
|
self.error += 1
|
||||||
self.stale_count += 1
|
self.stale_count += 1
|
||||||
elif proxy_count:
|
elif proxy_count:
|
||||||
self.error = 0
|
self.error = 0
|
||||||
self.stale_count = 0
|
self.stale_count = 0
|
||||||
else:
|
else:
|
||||||
self.error += 2
|
self.error += 2
|
||||||
self.stale_count += 2
|
self.stale_count += 2
|
||||||
else: # not a new site
|
else: # not a new site
|
||||||
# proxylist is empty
|
# proxylist is empty
|
||||||
if not proxy_count:
|
if not proxy_count:
|
||||||
self.stale_count += 1
|
self.stale_count += 1
|
||||||
# proxylist is not empty: site is working
|
# proxylist is not empty: site is working
|
||||||
else:
|
else:
|
||||||
self.stale_count = 0
|
self.stale_count = 0
|
||||||
self.error = 0
|
self.error = 0
|
||||||
# site has no content
|
# site has no content
|
||||||
if not content:
|
if not content:
|
||||||
self.error += 1
|
self.error += 1
|
||||||
self.stale_count += 1
|
self.stale_count += 1
|
||||||
#else:
|
#else:
|
||||||
# self.retrievals += 1
|
# self.retrievals += 1
|
||||||
# self.error = 0
|
# self.error = 0
|
||||||
# self.stale_count = 0
|
# self.stale_count = 0
|
||||||
# site has proxies
|
# site has proxies
|
||||||
if proxy_count:
|
if proxy_count:
|
||||||
self.error = 0
|
self.error = 0
|
||||||
self.stale_count = 0
|
self.stale_count = 0
|
||||||
extract_urls(content, self.url)
|
extract_urls(content, self.url)
|
||||||
|
|
||||||
self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added+len(self.proxylist), self.content_type, self.url)
|
self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added+len(self.proxylist), self.content_type, self.url)
|
||||||
self.status = 'ok'
|
self.status = 'ok'
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
config.load()
|
config.load()
|
||||||
errors = config.validate()
|
errors = config.validate()
|
||||||
if errors:
|
if errors:
|
||||||
for e in errors:
|
for e in errors:
|
||||||
_log(e, 'error')
|
_log(e, 'error')
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
fetch.set_config(config)
|
fetch.set_config(config)
|
||||||
|
|
||||||
# handle --nobs flag
|
# handle --nobs flag
|
||||||
args = config.aparser.parse_args()
|
args = config.aparser.parse_args()
|
||||||
if args.nobs:
|
if args.nobs:
|
||||||
set_nobs(True)
|
set_nobs(True)
|
||||||
|
|
||||||
|
|
||||||
proxydb = mysqlite.mysqlite(config.watchd.database, str)
|
proxydb = mysqlite.mysqlite(config.watchd.database, str)
|
||||||
dbs.create_table_if_not_exists(proxydb, 'proxylist')
|
dbs.create_table_if_not_exists(proxydb, 'proxylist')
|
||||||
fetch.init_known_proxies(proxydb)
|
fetch.init_known_proxies(proxydb)
|
||||||
|
|
||||||
with open('urignore.txt', 'r') as f:
|
with open('urignore.txt', 'r') as f:
|
||||||
urignore = [ i.strip() for i in f.read().split('\n') if i.strip() ]
|
urignore = [ i.strip() for i in f.read().split('\n') if i.strip() ]
|
||||||
|
|
||||||
urldb = mysqlite.mysqlite(config.ppf.database, str)
|
urldb = mysqlite.mysqlite(config.ppf.database, str)
|
||||||
dbs.create_table_if_not_exists(urldb, 'uris')
|
dbs.create_table_if_not_exists(urldb, 'uris')
|
||||||
import_from_file('import.txt', urldb)
|
import_from_file('import.txt', urldb)
|
||||||
if len(sys.argv) == 3 and sys.argv[1] == "--file":
|
if len(sys.argv) == 3 and sys.argv[1] == "--file":
|
||||||
sys.exit(import_proxies_from_file(proxydb, sys.argv[2]))
|
sys.exit(import_proxies_from_file(proxydb, sys.argv[2]))
|
||||||
|
|
||||||
# start proxy watcher
|
# start proxy watcher
|
||||||
if config.watchd.threads > 0:
|
if config.watchd.threads > 0:
|
||||||
watcherd = proxywatchd.Proxywatchd()
|
watcherd = proxywatchd.Proxywatchd()
|
||||||
watcherd.start()
|
watcherd.start()
|
||||||
else:
|
else:
|
||||||
watcherd = None
|
watcherd = None
|
||||||
|
|
||||||
qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM()'
|
qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM()'
|
||||||
threads = []
|
threads = []
|
||||||
rows = []
|
rows = []
|
||||||
reqtime = time.time() - 3600
|
reqtime = time.time() - 3600
|
||||||
statusmsg = time.time()
|
statusmsg = time.time()
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
time.sleep(random.random()/10)
|
time.sleep(random.random()/10)
|
||||||
if (time.time() - statusmsg) > 180:
|
if (time.time() - statusmsg) > 180:
|
||||||
_log('running %d thread(s) over %d' % (len(threads), config.ppf.threads), 'ppf')
|
_log('running %d thread(s) over %d' % (len(threads), config.ppf.threads), 'ppf')
|
||||||
statusmsg = time.time()
|
statusmsg = time.time()
|
||||||
if not rows:
|
if not rows:
|
||||||
if (time.time() - reqtime) > 3:
|
if (time.time() - reqtime) > 3:
|
||||||
rows = urldb.execute(qurl, (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
|
rows = urldb.execute(qurl, (config.ppf.max_fail, config.ppf.checktime, config.ppf.perfail_checktime, int(time.time()))).fetchall()
|
||||||
reqtime = time.time()
|
reqtime = time.time()
|
||||||
if len(rows) < config.ppf.threads:
|
if len(rows) < config.ppf.threads:
|
||||||
time.sleep(60)
|
time.sleep(60)
|
||||||
rows = []
|
rows = []
|
||||||
else:
|
else:
|
||||||
_log('handing %d job(s) to %d thread(s)' % ( len(rows), config.ppf.threads ), 'ppf')
|
_log('handing %d job(s) to %d thread(s)' % ( len(rows), config.ppf.threads ), 'ppf')
|
||||||
|
|
||||||
_proxylist = [ '%s://%s' % (p[0], p[1]) for p in proxydb.execute('SELECT proto,proxy from proxylist where failed=0').fetchall() ]
|
_proxylist = [ '%s://%s' % (p[0], p[1]) for p in proxydb.execute('SELECT proto,proxy from proxylist where failed=0').fetchall() ]
|
||||||
if not _proxylist: _proxylist = None
|
if not _proxylist: _proxylist = None
|
||||||
|
|
||||||
for thread in threads:
|
for thread in threads:
|
||||||
if thread.status == 'ok':
|
if thread.status == 'ok':
|
||||||
url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve()
|
url, proxylist, stale_count, error, retrievals, content_type, proxies_added, execute = thread.retrieve()
|
||||||
new = [ p for p in proxylist if not fetch.is_known_proxy(p) ]
|
new = [ p for p in proxylist if not fetch.is_known_proxy(p) ]
|
||||||
if new:
|
if new:
|
||||||
fetch.add_known_proxies(new)
|
fetch.add_known_proxies(new)
|
||||||
execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)
|
execute = (error, stale_count, int(time.time()), retrievals, proxies_added+len(new), content_type, url)
|
||||||
urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', execute)
|
urldb.execute('UPDATE uris SET error=?,stale_count=?,check_time=?,retrievals=?,proxies_added=?,content_type=? where url=?', execute)
|
||||||
urldb.commit()
|
urldb.commit()
|
||||||
if new: dbs.insert_proxies(proxydb, new, url)
|
if new: dbs.insert_proxies(proxydb, new, url)
|
||||||
|
|
||||||
threads = [ thread for thread in threads if thread.is_alive() ]
|
threads = [ thread for thread in threads if thread.is_alive() ]
|
||||||
if len(threads) < config.ppf.threads and rows:
|
if len(threads) < config.ppf.threads and rows:
|
||||||
p = random.sample(_proxylist, 5) if _proxylist is not None else None
|
p = random.sample(_proxylist, 5) if _proxylist is not None else None
|
||||||
row = random.choice(rows)
|
row = random.choice(rows)
|
||||||
urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
|
urldb.execute('UPDATE uris SET check_time=? where url=?', (time.time(), row[0]))
|
||||||
urldb.commit()
|
urldb.commit()
|
||||||
rows.remove(row)
|
rows.remove(row)
|
||||||
t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5], p)
|
t = Leechered(row[0], row[1], row[2], row[3], row[4], row[5], p)
|
||||||
threads.append(t)
|
threads.append(t)
|
||||||
t.start()
|
t.start()
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
if watcherd:
|
if watcherd:
|
||||||
watcherd.stop()
|
watcherd.stop()
|
||||||
watcherd.finish()
|
watcherd.finish()
|
||||||
break
|
break
|
||||||
|
|
||||||
_log('ppf stopped', 'info')
|
_log('ppf stopped', 'info')
|
||||||
|
|||||||
1377
proxywatchd.py
1377
proxywatchd.py
File diff suppressed because it is too large
Load Diff
126
soup_parser.py
126
soup_parser.py
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
# HTML parsing with optional BeautifulSoup or stdlib fallback
|
# -*- coding: utf-8 -*-
|
||||||
|
"""HTML parsing with optional BeautifulSoup or stdlib fallback."""
|
||||||
|
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
import sys
|
import sys
|
||||||
@@ -8,87 +9,98 @@ _bs4_available = False
|
|||||||
_use_bs4 = True
|
_use_bs4 = True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup, FeatureNotFound
|
from bs4 import BeautifulSoup, FeatureNotFound
|
||||||
_bs4_available = True
|
_bs4_available = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_bs4_available = False
|
_bs4_available = False
|
||||||
|
|
||||||
|
|
||||||
class Tag():
|
class Tag(object):
|
||||||
def __init__(self, name, attrs):
|
"""Simple tag representation for stdlib parser."""
|
||||||
self.name = name
|
|
||||||
self.attrs = dict(attrs)
|
|
||||||
|
|
||||||
def __getitem__(self, key):
|
def __init__(self, name, attrs):
|
||||||
return self.attrs.get(key)
|
self.name = name
|
||||||
|
self.attrs = dict(attrs)
|
||||||
|
|
||||||
def get(self, key, default=None):
|
def __getitem__(self, key):
|
||||||
return self.attrs.get(key, default)
|
return self.attrs.get(key)
|
||||||
|
|
||||||
|
def get(self, key, default=None):
|
||||||
|
return self.attrs.get(key, default)
|
||||||
|
|
||||||
|
|
||||||
class SoupResult():
|
class SoupResult(object):
|
||||||
def __init__(self, tags):
|
"""BeautifulSoup-like result wrapper for stdlib parser."""
|
||||||
self._tags = tags
|
|
||||||
self.body = self
|
|
||||||
|
|
||||||
def find_all(self, tag_name, **kwargs):
|
def __init__(self, tags):
|
||||||
results = []
|
self._tags = tags
|
||||||
for tag in self._tags:
|
self.body = self
|
||||||
if tag.name != tag_name:
|
|
||||||
continue
|
def find_all(self, tag_name, **kwargs):
|
||||||
if 'href' in kwargs:
|
"""Find all tags matching criteria."""
|
||||||
if kwargs['href'] is True and 'href' not in tag.attrs:
|
results = []
|
||||||
continue
|
for tag in self._tags:
|
||||||
elif kwargs['href'] is not True and tag.attrs.get('href') != kwargs['href']:
|
if tag.name != tag_name:
|
||||||
continue
|
continue
|
||||||
results.append(tag)
|
if 'href' in kwargs:
|
||||||
return results
|
if kwargs['href'] is True and 'href' not in tag.attrs:
|
||||||
|
continue
|
||||||
|
elif kwargs['href'] is not True and tag.attrs.get('href') != kwargs['href']:
|
||||||
|
continue
|
||||||
|
results.append(tag)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
class LinkExtractor(HTMLParser):
|
class LinkExtractor(HTMLParser):
|
||||||
def __init__(self):
|
"""Extract tags from HTML using stdlib."""
|
||||||
HTMLParser.__init__(self)
|
|
||||||
self.tags = []
|
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def __init__(self):
|
||||||
self.tags.append(Tag(tag, attrs))
|
HTMLParser.__init__(self)
|
||||||
|
self.tags = []
|
||||||
|
|
||||||
def handle_startendtag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
self.tags.append(Tag(tag, attrs))
|
self.tags.append(Tag(tag, attrs))
|
||||||
|
|
||||||
|
def handle_startendtag(self, tag, attrs):
|
||||||
|
self.tags.append(Tag(tag, attrs))
|
||||||
|
|
||||||
|
|
||||||
def _parse_stdlib(html):
|
def _parse_stdlib(html):
|
||||||
parser = LinkExtractor()
|
"""Parse HTML using stdlib HTMLParser."""
|
||||||
try:
|
parser = LinkExtractor()
|
||||||
parser.feed(html)
|
try:
|
||||||
except Exception:
|
parser.feed(html)
|
||||||
pass # malformed HTML, return partial results
|
except Exception:
|
||||||
return SoupResult(parser.tags)
|
pass # Malformed HTML, return partial results
|
||||||
|
return SoupResult(parser.tags)
|
||||||
|
|
||||||
|
|
||||||
def _parse_bs4(html):
|
def _parse_bs4(html):
|
||||||
try:
|
"""Parse HTML using BeautifulSoup."""
|
||||||
return BeautifulSoup(html, 'lxml')
|
try:
|
||||||
except (FeatureNotFound, Exception):
|
return BeautifulSoup(html, 'lxml')
|
||||||
return BeautifulSoup(html, 'html.parser')
|
except (FeatureNotFound, Exception):
|
||||||
|
return BeautifulSoup(html, 'html.parser')
|
||||||
|
|
||||||
|
|
||||||
def set_nobs(enabled):
|
def set_nobs(enabled):
|
||||||
global _use_bs4
|
"""Disable BeautifulSoup and use stdlib instead."""
|
||||||
_use_bs4 = not enabled
|
global _use_bs4
|
||||||
if enabled and _bs4_available:
|
_use_bs4 = not enabled
|
||||||
sys.stderr.write('info: --nobs: using stdlib HTMLParser\n')
|
if enabled and _bs4_available:
|
||||||
elif not _bs4_available:
|
sys.stderr.write('info: --nobs: using stdlib HTMLParser\n')
|
||||||
sys.stderr.write('info: bs4 not available, using stdlib HTMLParser\n')
|
elif not _bs4_available:
|
||||||
|
sys.stderr.write('info: bs4 not available, using stdlib HTMLParser\n')
|
||||||
|
|
||||||
|
|
||||||
def soupify(html, nohtml=False):
|
def soupify(html, nohtml=False):
|
||||||
htm = html if nohtml else '<html><body>%s</body></html>' % (html)
|
"""Parse HTML content, returning BeautifulSoup-like object."""
|
||||||
if _use_bs4 and _bs4_available:
|
htm = html if nohtml else '<html><body>%s</body></html>' % html
|
||||||
return _parse_bs4(htm)
|
if _use_bs4 and _bs4_available:
|
||||||
else:
|
return _parse_bs4(htm)
|
||||||
return _parse_stdlib(htm)
|
return _parse_stdlib(htm)
|
||||||
|
|
||||||
|
|
||||||
def is_available():
|
def is_available():
|
||||||
return _bs4_available
|
"""Check if BeautifulSoup is available."""
|
||||||
|
return _bs4_available
|
||||||
|
|||||||
Reference in New Issue
Block a user