scraper: integrate multi-lingual search terms
- Use translations module for 70% non-English search terms
- Initialize translations config on startup
- Add engines module for multi-engine support
This commit is contained in:
367
scraper.py
367
scraper.py
@@ -1,94 +1,337 @@
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Multi-engine proxy list scraper."""
|
||||
|
||||
import dbs
|
||||
import random, time
|
||||
import random
|
||||
import time
|
||||
import urllib
|
||||
import mysqlite
|
||||
import proxywatchd
|
||||
from misc import _log
|
||||
from config import Config
|
||||
import fetch
|
||||
import sys
|
||||
import engines
|
||||
import translations
|
||||
import os
|
||||
|
||||
config = Config()

# Load Searx instances if file exists; each usable line must be a full URL.
# (Guarded with os.path.exists so a missing file leaves the list empty
# instead of raising IOError at import time.)
searx_instances = []
if os.path.exists('searx.instances'):
    with open('searx.instances') as h:
        searx_instances = [line.strip() for line in h.readlines()
                           if line.lower().startswith('http')]
|
||||
|
||||
class InstanceTracker(object):
    """Track instance health with exponential backoff.

    NOTE(review): appears unused in this file -- superseded by
    EngineTracker below; candidate for removal once confirmed no
    external callers remain.
    """

    def __init__(self, instances, base_delay=30, max_delay=3600):
        # instances: iterable of instance URLs/identifiers to track.
        self.instances = list(instances)
        self.base_delay = base_delay    # initial backoff delay (seconds)
        self.max_delay = max_delay      # backoff cap (seconds)
        self.failures = {}              # instance -> consecutive failure count
        self.backoff_until = {}         # instance -> epoch time backoff expires
        self.success_count = {}         # instance -> lifetime success count

    def get_available(self):
        """Return instances not currently in backoff."""
        now = time.time()
        available = []
        for inst in self.instances:
            if inst not in self.backoff_until or now >= self.backoff_until[inst]:
                available.append(inst)
        return available

    def mark_success(self, instance):
        """Reset failure count on success."""
        self.failures[instance] = 0
        self.success_count[instance] = self.success_count.get(instance, 0) + 1
        if instance in self.backoff_until:
            del self.backoff_until[instance]

    def mark_failure(self, instance):
        """Increment failure count and set exponential backoff.

        Returns the delay (seconds) applied to this instance.
        """
        count = self.failures.get(instance, 0) + 1
        self.failures[instance] = count
        # Exponential backoff: base * 2^(n-1), capped at max_delay.
        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
        self.backoff_until[instance] = time.time() + delay
        name = instance.split('/')[2] if '/' in instance else instance
        _log('%s: backoff %ds (failures: %d)' % (name, delay, count), 'rate')
        return delay

    def get_status(self):
        """Return status summary as (available, in_backoff, total)."""
        available = len(self.get_available())
        in_backoff = len(self.instances) - available
        return available, in_backoff, len(self.instances)
|
||||
|
||||
class EngineTracker(object):
    """Track multiple search engine instances with rate limiting."""

    def __init__(self, engine_names, searx_urls, base_delay=30, max_delay=3600):
        # Backoff tuning and per-identifier bookkeeping.
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.failures = {}
        self.backoff_until = {}
        self.success_count = {}

        # Resolve each configured name into (engine_instance, identifier)
        # pairs; 'searx' fans out to one instance per configured URL.
        self.engines = []
        for raw_name in engine_names:
            engine_name = raw_name.strip().lower()
            if engine_name == 'searx':
                for url in searx_urls:
                    self.engines.append((engines.Searx(url), url))
            elif engine_name in engines.ENGINES:
                self.engines.append((engines.get_engine(engine_name), engine_name))
            else:
                _log('unknown engine: %s' % engine_name, 'warn')

    def get_available(self):
        """Return engines not currently in backoff."""
        now = time.time()
        return [(eng, ident) for eng, ident in self.engines
                if self.backoff_until.get(ident, 0) <= now]

    def mark_success(self, ident):
        """Reset failure count on success."""
        self.failures[ident] = 0
        self.success_count[ident] = self.success_count.get(ident, 0) + 1
        self.backoff_until.pop(ident, None)

    def mark_failure(self, ident):
        """Increment failure count and set exponential backoff.

        Returns the delay (seconds) applied to this identifier.
        """
        count = self.failures.get(ident, 0) + 1
        self.failures[ident] = count
        # Doubles per consecutive failure, clamped at max_delay.
        delay = min(self.base_delay * (2 ** (count - 1)), self.max_delay)
        self.backoff_until[ident] = time.time() + delay
        name = ident if '/' not in ident else ident.split('/')[2]
        _log('%s: backoff %ds (failures: %d)' % (name, delay, count), 'rate')
        return delay

    def get_status(self):
        """Summarize engine availability as (available, in_backoff, total)."""
        n_avail = len(self.get_available())
        return n_avail, len(self.engines) - n_avail, len(self.engines)
|
||||
|
||||
|
||||
# Module-level tracker; assigned a real EngineTracker in __main__ before
# proxyfind() runs (proxyfind reads it via `global engine_tracker`).
engine_tracker = None
|
||||
|
||||
|
||||
def build_search_query(sqlite=None):
    """Build a search query using configured sources.

    Combines up to three sources, each gated by a flag in
    config.scraper.query and/or a coin flip: known working proxies ('p'),
    hosts from previously-found URLs ('w'), and search terms ('s').
    Returns '' when no source contributed anything.

    NOTE: each random.random() draw below is part of the observable
    behavior -- do not reorder conditions or short-circuit branches.
    """
    search = ''

    # Search by working proxy
    if 'p' in config.scraper.query:
        proxydb = mysqlite.mysqlite(config.watchd.database, str)
        proxies = [i[0] for i in proxydb.execute(
            'SELECT proxy FROM proxylist WHERE failed=0 ORDER BY RANDOM() LIMIT 10'
        ).fetchall()]
        # Only use this source half the time, searching for 1-2 proxies.
        if proxies and random.random() < 0.5:
            search = ' '.join(random.sample(proxies, random.randint(1, 2)))

    # Search by known website
    # NOTE(review): the `or random.random() < 0.5` lets this branch fire
    # even when 'w' is not configured -- matches the pre-existing logic;
    # confirm this is intended.
    if ('w' in config.scraper.query and not search) or random.random() < 0.5:
        if sqlite is None:
            sqlite = mysqlite.mysqlite(config.ppf.database, str)
        uris = [i[0] for i in sqlite.execute(
            'SELECT url FROM uris WHERE error=0 AND url NOT LIKE "%github%" ORDER BY RANDOM() LIMIT 10'
        ).fetchall()]
        if uris and random.random() < 0.5:
            if search:
                search = '%s OR ' % search
            # site:<host> restricts the search to a known proxy-list host.
            search = search + 'site:%s' % random.choice(uris).split('/')[2]

    # Search by term (multi-lingual)
    if ('s' in config.scraper.query and not search) or random.random() < 0.5:
        if search:
            search = '%s OR ' % search
        # 70% chance of non-English term
        if random.random() < 0.7:
            term = translations.get_random_search_term()
        else:
            term = random.choice(search_terms)
        search = search + term

    return search
|
||||
|
||||
|
||||
def scrape_engine(engine, ident, query, urignore, sqlite):
    """Scrape a single engine for proxy list URLs.

    Pages through results until max_pages, the engine rate-limits, results
    run out, or repeated fetch failures hit the fail threshold.

    Returns the total number of URLs inserted across all pages.
    """
    max_pages = config.scraper.max_pages
    consecutive_empty = 0
    total_urls = 0

    for page in range(max_pages):
        try:
            url = engine.build_url(query, page)

            if config.scraper.debug:
                _log('%s page %d: %s' % (engine.name, page, url), 'debug')

            content = fetch.fetch_contents(url)

            # Check for rate limiting
            if engine.is_rate_limited(content):
                _log('%s: rate limited' % engine.name, 'rate')
                engine_tracker.mark_failure(ident)
                return total_urls

            if not content:
                consecutive_empty += 1
                if consecutive_empty >= config.scraper.fail_threshold:
                    engine_tracker.mark_failure(ident)
                    return total_urls
                continue

            # Extract URLs
            urls = engine.extract_urls(content, urignore)

            if not urls:
                # Empty results on first page likely means rate limited
                if page == 0:
                    engine_tracker.mark_failure(ident)
                    return total_urls
                # FIX: on later pages, no results means we ran off the end of
                # the result set -- stop paginating instead of marking success
                # and inserting empty batches until max_pages.
                break

            # Success
            engine_tracker.mark_success(ident)
            consecutive_empty = 0

            # Deduplicate and insert
            urls = list(set(urls))
            source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
            dbs.insert_urls(urls, source, sqlite)
            total_urls += len(urls)

            # Small delay between pages
            time.sleep(random.uniform(1.0, 3.0))

        except Exception as e:
            name = ident.split('/')[2] if '/' in ident else ident
            _log('%s: error: %s' % (name, str(e)), 'error')
            engine_tracker.mark_failure(ident)
            return total_urls

    return total_urls
|
||||
|
||||
|
||||
def proxyfind(sqlite=None, urignore=None):
    """Find proxy list URLs using available search engines."""
    global engine_tracker

    # Every engine is backing off: wait a minute before the next round.
    ready = engine_tracker.get_available()
    if not ready:
        avail, backoff, total = engine_tracker.get_status()
        _log('all %d engines in backoff, sleeping 60s' % total, 'rate')
        time.sleep(60)
        return

    # Nothing to search for this round.
    query = build_search_query(sqlite)
    if not query:
        return

    if config.scraper.debug:
        _log('query: %s' % query, 'debug')

    # Randomize engine order and pick 1-3 of them for this round.
    random.shuffle(ready)
    num_engines = min(len(ready), random.randint(1, 3))

    for engine, ident in ready[:num_engines]:
        found = scrape_engine(engine, ident, query, urignore, sqlite)
        if found > 0:
            name = ident.split('/')[2] if '/' in ident else ident
            _log('%s: found %d URLs' % (name, found), 'scraper')

        # Pause between engines.
        time.sleep(random.uniform(2.0, 5.0))
|
||||
|
||||
|
||||
def load_urignore():
    """Load URL ignore patterns.

    Combines patterns from urignore.txt (if present), the configured Searx
    instance hosts (to avoid scraping ourselves in a loop), and the major
    search engine domains.

    Returns a list of substrings/hosts to exclude from scraped URLs.
    """
    urignore = []

    # Load from file
    if os.path.exists('urignore.txt'):
        with open('urignore.txt', 'r') as f:
            urignore = [i.strip() for i in f.read().split('\n') if i.strip()]

    # Add Searx instances to ignore (avoid loops)
    for i in searx_instances:
        urignore.append(i.split('/')[2])

    # Add search engine domains to ignore
    ignore_domains = [
        'duckduckgo.com', 'startpage.com', 'mojeek.com', 'qwant.com',
        'yandex.com', 'yandex.ru', 'ecosia.org', 'brave.com',
        'google.com', 'bing.com', 'yahoo.com',
    ]
    for domain in ignore_domains:
        urignore.append(domain)

    return urignore
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Load and validate configuration before touching any database.
    config.load()
    errors = config.validate()
    if errors:
        for e in errors:
            _log(e, 'error')
        sys.exit(1)
    fetch.set_config(config)
    translations.set_config(config)

    # Ensure both databases/tables exist.
    proxydb = mysqlite.mysqlite(config.watchd.database, str)
    dbs.create_table_if_not_exists(proxydb, 'proxylist')

    urldb = mysqlite.mysqlite(config.ppf.database, str)
    dbs.create_table_if_not_exists(urldb, 'uris')

    # Load search terms; built-in defaults cover a missing file.
    search_terms = ['free proxy list', 'socks5 proxy', 'http proxy']
    if os.path.exists('search_terms.txt'):
        with open('search_terms.txt', 'r') as f:
            search_terms = [i.strip() for i in f.read().split('\n') if i.strip()]

    urignore = load_urignore()

    # Parse enabled engines from config
    enabled_engines = [e.strip() for e in config.scraper.engines.split(',')]

    # Initialize engine tracker
    engine_tracker = EngineTracker(
        enabled_engines,
        searx_instances,
        base_delay=config.scraper.backoff_base,
        max_delay=config.scraper.backoff_max
    )

    avail, backoff, total = engine_tracker.get_status()
    _log('loaded %d engine instances (%s)' % (total, ', '.join(enabled_engines)), 'info')

    try:
        while True:
            proxyfind(urldb, urignore)
            # Small delay between rounds
            time.sleep(random.uniform(5.0, 15.0))
    except KeyboardInterrupt:
        avail, backoff, total = engine_tracker.get_status()
        _log('scraper stopped (engines: %d/%d available)' % (avail, total), 'info')
|
||||
|
||||
Reference in New Issue
Block a user