scraper: fix return values and add stats logging
- Return counts from proxyfind for tracking
- Add 15-minute stats interval logging
- Track new_urls and queries count
This commit is contained in:
41
scraper.py
41
scraper.py
@@ -300,8 +300,8 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
|
|||||||
# Deduplicate and insert
|
# Deduplicate and insert
|
||||||
urls = list(set(urls))
|
urls = list(set(urls))
|
||||||
source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
|
source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
|
||||||
dbs.insert_urls(urls, source, sqlite)
|
new_count = dbs.insert_urls(urls, source, sqlite)
|
||||||
total_urls += len(urls)
|
total_urls += new_count
|
||||||
|
|
||||||
# Small delay between pages
|
# Small delay between pages
|
||||||
time.sleep(random.uniform(1.0, 3.0))
|
time.sleep(random.uniform(1.0, 3.0))
|
||||||
@@ -323,12 +323,12 @@ def proxyfind(sqlite=None, urignore=None):
|
|||||||
avail, backoff, total = engine_tracker.get_status()
|
avail, backoff, total = engine_tracker.get_status()
|
||||||
_log('all %d engines in backoff, sleeping 60s' % total, 'rate')
|
_log('all %d engines in backoff, sleeping 60s' % total, 'rate')
|
||||||
time.sleep(60)
|
time.sleep(60)
|
||||||
return
|
return 0
|
||||||
|
|
||||||
# Build search query
|
# Build search query
|
||||||
query = build_search_query(sqlite)
|
query = build_search_query(sqlite)
|
||||||
if not query:
|
if not query:
|
||||||
return
|
return 0
|
||||||
|
|
||||||
if config.scraper.debug:
|
if config.scraper.debug:
|
||||||
_log('query: %s' % query, 'debug')
|
_log('query: %s' % query, 'debug')
|
||||||
@@ -339,15 +339,15 @@ def proxyfind(sqlite=None, urignore=None):
|
|||||||
# Use 1-3 engines per round
|
# Use 1-3 engines per round
|
||||||
num_engines = min(len(available), random.randint(1, 3))
|
num_engines = min(len(available), random.randint(1, 3))
|
||||||
|
|
||||||
|
total_new = 0
|
||||||
for engine, ident in available[:num_engines]:
|
for engine, ident in available[:num_engines]:
|
||||||
total = scrape_engine(engine, ident, query, urignore, sqlite)
|
new_urls = scrape_engine(engine, ident, query, urignore, sqlite)
|
||||||
if total > 0:
|
total_new += new_urls
|
||||||
name = ident.split('/')[2] if '/' in ident else ident
|
|
||||||
_log('%s: found %d URLs' % (name, total), 'scraper')
|
|
||||||
|
|
||||||
# Delay between engines
|
# Delay between engines
|
||||||
time.sleep(random.uniform(2.0, 5.0))
|
time.sleep(random.uniform(2.0, 5.0))
|
||||||
|
|
||||||
|
return total_new
|
||||||
|
|
||||||
|
|
||||||
def load_urignore():
|
def load_urignore():
|
||||||
"""Load URL ignore patterns."""
|
"""Load URL ignore patterns."""
|
||||||
@@ -381,9 +381,25 @@ class Scraper(threading.Thread):
|
|||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
self.running = False
|
self.running = False
|
||||||
self.urignore = load_urignore()
|
self.urignore = load_urignore()
|
||||||
|
# Stats tracking
|
||||||
|
self.stats_interval = 900 # 15 minutes
|
||||||
|
self.last_stats = 0
|
||||||
|
self.new_urls = 0
|
||||||
|
self.queries = 0
|
||||||
threading.Thread.__init__(self)
|
threading.Thread.__init__(self)
|
||||||
self.daemon = True
|
self.daemon = True
|
||||||
|
|
||||||
|
def log_stats(self):
    """Log the accumulated counters once the stats interval has elapsed.

    Compares the current time against ``self.last_stats``; if fewer than
    ``self.stats_interval`` seconds have passed, nothing happens.
    Otherwise one summary line (new URLs, queries, available/total
    engines) is written, both counters are reset to zero and
    ``self.last_stats`` is moved forward to the current time.
    """
    current = time.time()
    if current - self.last_stats < self.stats_interval:
        # Interval not reached yet: keep accumulating.
        return

    # Only the available and total engine counts are reported; the
    # backoff count returned by the tracker is not part of the message.
    available, _backoff, engine_total = engine_tracker.get_status()
    summary = 'new=%d queries=%d engines=%d/%d' % (
        self.new_urls, self.queries, available, engine_total)
    _log(summary, 'scraper')

    # Start a fresh accumulation window.
    self.new_urls = 0
    self.queries = 0
    self.last_stats = current
|
||||||
|
|
||||||
def init_tracker(self):
|
def init_tracker(self):
|
||||||
"""Initialize engine tracker with configured engines."""
|
"""Initialize engine tracker with configured engines."""
|
||||||
global engine_tracker
|
global engine_tracker
|
||||||
@@ -414,9 +430,14 @@ class Scraper(threading.Thread):
|
|||||||
# Create thread-local database connection
|
# Create thread-local database connection
|
||||||
urldb = mysqlite.mysqlite(self.cfg.ppf.database, str)
|
urldb = mysqlite.mysqlite(self.cfg.ppf.database, str)
|
||||||
|
|
||||||
|
self.last_stats = time.time()
|
||||||
|
|
||||||
while self.running:
|
while self.running:
|
||||||
try:
|
try:
|
||||||
proxyfind(urldb, self.urignore)
|
new_count = proxyfind(urldb, self.urignore)
|
||||||
|
self.new_urls += new_count
|
||||||
|
self.queries += 1
|
||||||
|
self.log_stats()
|
||||||
time.sleep(random.uniform(5.0, 15.0))
|
time.sleep(random.uniform(5.0, 15.0))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user