scraper: fix return values and add stats logging

- Return counts from proxyfind for tracking
- Add 15-minute stats interval logging
- Track new_urls and queries count
This commit is contained in:
Username
2025-12-24 00:19:53 +01:00
parent 689ea8153b
commit 33f9a211ce

View File

@@ -300,8 +300,8 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
# Deduplicate and insert
urls = list(set(urls))
source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
dbs.insert_urls(urls, source, sqlite)
total_urls += len(urls)
new_count = dbs.insert_urls(urls, source, sqlite)
total_urls += new_count
# Small delay between pages
time.sleep(random.uniform(1.0, 3.0))
@@ -323,12 +323,12 @@ def proxyfind(sqlite=None, urignore=None):
avail, backoff, total = engine_tracker.get_status()
_log('all %d engines in backoff, sleeping 60s' % total, 'rate')
time.sleep(60)
return
return 0
# Build search query
query = build_search_query(sqlite)
if not query:
return
return 0
if config.scraper.debug:
_log('query: %s' % query, 'debug')
@@ -339,15 +339,15 @@ def proxyfind(sqlite=None, urignore=None):
# Use 1-3 engines per round
num_engines = min(len(available), random.randint(1, 3))
total_new = 0
for engine, ident in available[:num_engines]:
total = scrape_engine(engine, ident, query, urignore, sqlite)
if total > 0:
name = ident.split('/')[2] if '/' in ident else ident
_log('%s: found %d URLs' % (name, total), 'scraper')
new_urls = scrape_engine(engine, ident, query, urignore, sqlite)
total_new += new_urls
# Delay between engines
time.sleep(random.uniform(2.0, 5.0))
return total_new
def load_urignore():
"""Load URL ignore patterns."""
@@ -381,9 +381,25 @@ class Scraper(threading.Thread):
self.cfg = cfg
self.running = False
self.urignore = load_urignore()
# Stats tracking
self.stats_interval = 900 # 15 minutes
self.last_stats = 0
self.new_urls = 0
self.queries = 0
threading.Thread.__init__(self)
self.daemon = True
def log_stats(self):
    """Emit a periodic summary of scraper activity.

    Once every ``stats_interval`` seconds, log the URLs and queries
    accumulated since the previous report along with engine
    availability, then reset the counters and the timestamp.
    """
    now = time.time()
    if now - self.last_stats < self.stats_interval:
        return
    avail, backoff, total = engine_tracker.get_status()
    _log('new=%d queries=%d engines=%d/%d' % (
        self.new_urls, self.queries, avail, total), 'scraper')
    self.new_urls = 0
    self.queries = 0
    self.last_stats = now
def init_tracker(self):
"""Initialize engine tracker with configured engines."""
global engine_tracker
@@ -414,9 +430,14 @@ class Scraper(threading.Thread):
# Create thread-local database connection
urldb = mysqlite.mysqlite(self.cfg.ppf.database, str)
self.last_stats = time.time()
while self.running:
try:
proxyfind(urldb, self.urignore)
new_count = proxyfind(urldb, self.urignore)
self.new_urls += new_count
self.queries += 1
self.log_stats()
time.sleep(random.uniform(5.0, 15.0))
except Exception as e:
try: