diff --git a/scraper.py b/scraper.py
index bca9ced..0bf49f4 100755
--- a/scraper.py
+++ b/scraper.py
@@ -300,8 +300,8 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
         # Deduplicate and insert
         urls = list(set(urls))
         source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
-        dbs.insert_urls(urls, source, sqlite)
-        total_urls += len(urls)
+        new_count = dbs.insert_urls(urls, source, sqlite)
+        total_urls += new_count
 
         # Small delay between pages
         time.sleep(random.uniform(1.0, 3.0))
@@ -323,12 +323,12 @@ def proxyfind(sqlite=None, urignore=None):
         avail, backoff, total = engine_tracker.get_status()
         _log('all %d engines in backoff, sleeping 60s' % total, 'rate')
         time.sleep(60)
-        return
+        return 0
 
     # Build search query
     query = build_search_query(sqlite)
     if not query:
-        return
+        return 0
 
     if config.scraper.debug:
         _log('query: %s' % query, 'debug')
@@ -339,15 +339,15 @@
     # Use 1-3 engines per round
     num_engines = min(len(available), random.randint(1, 3))
 
+    total_new = 0
     for engine, ident in available[:num_engines]:
-        total = scrape_engine(engine, ident, query, urignore, sqlite)
-        if total > 0:
-            name = ident.split('/')[2] if '/' in ident else ident
-            _log('%s: found %d URLs' % (name, total), 'scraper')
-
+        new_urls = scrape_engine(engine, ident, query, urignore, sqlite)
+        total_new += new_urls
         # Delay between engines
         time.sleep(random.uniform(2.0, 5.0))
 
+    return total_new
+
 
 def load_urignore():
     """Load URL ignore patterns."""
@@ -381,9 +381,25 @@ class Scraper(threading.Thread):
         self.cfg = cfg
         self.running = False
         self.urignore = load_urignore()
+        # Stats tracking
+        self.stats_interval = 900  # 15 minutes
+        self.last_stats = 0
+        self.new_urls = 0
+        self.queries = 0
         threading.Thread.__init__(self)
         self.daemon = True
 
+    def log_stats(self):
+        """Log accumulated stats every 15 minutes."""
+        now = time.time()
+        if now - self.last_stats >= self.stats_interval:
+            avail, backoff, total = engine_tracker.get_status()
+            _log('new=%d queries=%d engines=%d/%d' % (
+                self.new_urls, self.queries, avail, total), 'scraper')
+            self.new_urls = 0
+            self.queries = 0
+            self.last_stats = now
+
     def init_tracker(self):
         """Initialize engine tracker with configured engines."""
         global engine_tracker
@@ -414,9 +430,14 @@
         # Create thread-local database connection
         urldb = mysqlite.mysqlite(self.cfg.ppf.database, str)
 
+        self.last_stats = time.time()
+
         while self.running:
             try:
-                proxyfind(urldb, self.urignore)
+                new_count = proxyfind(urldb, self.urignore)
+                self.new_urls += new_count
+                self.queries += 1
+                self.log_stats()
                 time.sleep(random.uniform(5.0, 15.0))
             except Exception as e:
                 try:
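
Note on the stats logging added above: the counters reset after each summary line, so `new=` and `queries=` report activity since the previous summary, not lifetime totals. A minimal standalone sketch of the same interval-gate pattern, with hypothetical names and `print` standing in for the module's `_log` helper:

    import time

    class IntervalStats:
        """Accumulate counters and emit one summary line per interval."""

        def __init__(self, interval=900):
            self.interval = interval      # seconds between summary lines
            self.last_stats = time.time()
            self.new_urls = 0
            self.queries = 0

        def record(self, new_count):
            # Fold one scrape round's results into the running totals.
            self.new_urls += new_count
            self.queries += 1

        def maybe_log(self):
            # Cheap to call on every loop pass; only a time comparison
            # happens until the interval has elapsed.
            now = time.time()
            if now - self.last_stats >= self.interval:
                print('new=%d queries=%d' % (self.new_urls, self.queries))
                self.new_urls = 0
                self.queries = 0
                self.last_stats = now

Calling `maybe_log()` unconditionally each iteration keeps the main loop free of timer bookkeeping; the tradeoff is that a summary can slip past the 15-minute mark by up to one loop iteration.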