scraper: fix return values and add stats logging
- Return counts from proxyfind for tracking
- Add 15-minute stats interval logging
- Track new_urls and queries count
scraper.py (41 lines changed)
@@ -300,8 +300,8 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
         # Deduplicate and insert
         urls = list(set(urls))
         source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
-        dbs.insert_urls(urls, source, sqlite)
-        total_urls += len(urls)
+        new_count = dbs.insert_urls(urls, source, sqlite)
+        total_urls += new_count
 
         # Small delay between pages
         time.sleep(random.uniform(1.0, 3.0))
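Note on this hunk: it assumes dbs.insert_urls returns the number of rows that were actually new, which is what makes total_urls meaningful once the database deduplicates. The dbs module is not part of this diff; a minimal sketch of an insert helper with that contract, assuming a SQLite urls table with a UNIQUE url column:

import sqlite3

def insert_urls(urls, source, conn):
    # Hypothetical sketch; the real dbs.insert_urls is not shown in this
    # commit. Assumes a table: urls(url TEXT UNIQUE, source TEXT).
    before = conn.total_changes
    conn.executemany(
        'INSERT OR IGNORE INTO urls (url, source) VALUES (?, ?)',
        [(u, source) for u in urls])
    conn.commit()
    # total_changes counts only rows actually written, so duplicates
    # suppressed by OR IGNORE do not inflate the returned count.
    return conn.total_changes - before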
@@ -323,12 +323,12 @@ def proxyfind(sqlite=None, urignore=None):
         avail, backoff, total = engine_tracker.get_status()
         _log('all %d engines in backoff, sleeping 60s' % total, 'rate')
         time.sleep(60)
-        return
+        return 0
 
     # Build search query
     query = build_search_query(sqlite)
     if not query:
-        return
+        return 0
 
     if config.scraper.debug:
         _log('query: %s' % query, 'debug')
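Why the bare returns had to become return 0: the caller added below does arithmetic on the result (self.new_urls += new_count), and an implicit None return would raise a TypeError on the first early exit. Illustration:

total = 0
# total += None  -> TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
total += 0  # fine; hence proxyfind's early exits now return 0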
@@ -339,15 +339,15 @@ def proxyfind(sqlite=None, urignore=None):
     # Use 1-3 engines per round
     num_engines = min(len(available), random.randint(1, 3))
 
+    total_new = 0
     for engine, ident in available[:num_engines]:
-        total = scrape_engine(engine, ident, query, urignore, sqlite)
-        if total > 0:
-            name = ident.split('/')[2] if '/' in ident else ident
-            _log('%s: found %d URLs' % (name, total), 'scraper')
-
+        new_urls = scrape_engine(engine, ident, query, urignore, sqlite)
+        total_new += new_urls
         # Delay between engines
         time.sleep(random.uniform(2.0, 5.0))
 
+    return total_new
+
 
 def load_urignore():
     """Load URL ignore patterns."""
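This hunk also drops the per-engine log line in favor of the aggregated 15-minute stats introduced below; the ident parsing that fed it assumed URL-shaped engine idents. For reference, what the removed split did (the ident value here is invented for illustration):

ident = 'https://searx.example.org/search'  # illustrative ident shape
name = ident.split('/')[2] if '/' in ident else ident
print(name)  # -> searx.example.org (the host segment of the URL)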
@@ -381,9 +381,25 @@ class Scraper(threading.Thread):
         self.cfg = cfg
         self.running = False
         self.urignore = load_urignore()
+        # Stats tracking
+        self.stats_interval = 900  # 15 minutes
+        self.last_stats = 0
+        self.new_urls = 0
+        self.queries = 0
         threading.Thread.__init__(self)
         self.daemon = True
 
+    def log_stats(self):
+        """Log accumulated stats every 15 minutes."""
+        now = time.time()
+        if now - self.last_stats >= self.stats_interval:
+            avail, backoff, total = engine_tracker.get_status()
+            _log('new=%d queries=%d engines=%d/%d' % (
+                self.new_urls, self.queries, avail, total), 'scraper')
+            self.new_urls = 0
+            self.queries = 0
+            self.last_stats = now
+
     def init_tracker(self):
         """Initialize engine tracker with configured engines."""
         global engine_tracker
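log_stats is the check-and-reset pattern: compare elapsed time on every call, and only emit (then zero the counters) once per interval, so each stat line covers exactly one 15-minute window. A standalone sketch of the same pattern; the class and field names here are invented for illustration, not taken from the diff:

import time

class IntervalStats:
    def __init__(self, interval=900):
        self.interval = interval        # seconds between stat lines
        self.last_flush = time.time()
        self.counters = {}

    def bump(self, key, n=1):
        self.counters[key] = self.counters.get(key, 0) + n

    def maybe_flush(self, emit=print):
        # Cheap to call on every loop pass; only does work once the
        # interval has actually elapsed.
        now = time.time()
        if now - self.last_flush >= self.interval:
            emit(' '.join('%s=%d' % kv
                          for kv in sorted(self.counters.items())))
            self.counters.clear()       # each line covers one window
            self.last_flush = now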
@@ -414,9 +430,14 @@ class Scraper(threading.Thread):
         # Create thread-local database connection
         urldb = mysqlite.mysqlite(self.cfg.ppf.database, str)
 
+        self.last_stats = time.time()
+
         while self.running:
             try:
-                proxyfind(urldb, self.urignore)
+                new_count = proxyfind(urldb, self.urignore)
+                self.new_urls += new_count
+                self.queries += 1
+                self.log_stats()
                 time.sleep(random.uniform(5.0, 15.0))
             except Exception as e:
                 try:
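The page truncates inside the except handler, so the error path is not visible in this diff. For orientation only, a standalone sketch of the loop shape run() ends up with after this commit, reusing the IntervalStats sketch above; the handler body is an assumption, not part of the change:

import random
import time

def worker_loop(proxyfind, urldb, urignore, stats, log, running=lambda: True):
    # Standalone sketch of the run() loop shape after this commit; the
    # except body is an assumption, since the diff is truncated there.
    while running():
        try:
            stats.bump('new_urls', proxyfind(urldb, urignore))
            stats.bump('queries')
            stats.maybe_flush()
            time.sleep(random.uniform(5.0, 15.0))
        except Exception as e:
            try:
                log('scraper error: %s' % e)  # assumed: log and continue
            except Exception:
                pass                          # never let logging kill the loop
            time.sleep(5)                     # assumed back-off before retrying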