scraper: reuse connections, cycle circuit on block
All checks were successful
CI / syntax-check (push) Successful in 6s
CI / memory-leak-check (push) Successful in 15s

Username
2025-12-25 19:26:23 +01:00
parent 68e8b88afa
commit 272eba0f05
2 changed files with 151 additions and 37 deletions

@@ -277,52 +277,64 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
     consecutive_empty = 0
     total_urls = 0
-    for page in range(max_pages):
-        try:
-            url = engine.build_url(query, page)
-            if config.scraper.debug:
-                _log('%s page %d: %s' % (engine.name, page, url), 'debug')
-            content = fetch.fetch_contents(url)
-            # Check for rate limiting
-            if engine.is_rate_limited(content):
-                engine_tracker.mark_failure(ident)
-                return total_urls
-            if not content:
-                consecutive_empty += 1
-                if consecutive_empty >= config.scraper.fail_threshold:
-                    engine_tracker.mark_failure(ident)
-                    return total_urls
-                continue
-            # Extract URLs
-            urls = engine.extract_urls(content, urignore)
-            if not urls:
-                # Empty results on first page likely means rate limited
-                if page == 0:
-                    engine_tracker.mark_failure(ident)
-                return total_urls
-            # Success
-            engine_tracker.mark_success(ident)
-            consecutive_empty = 0
-            # Deduplicate and insert
-            urls = list(set(urls))
-            source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
-            new_count = dbs.insert_urls(urls, source, sqlite)
-            total_urls += new_count
-            # Small delay between pages
-            time.sleep(random.uniform(1.0, 3.0))
-        except Exception as e:
-            engine_tracker.mark_failure(ident)
-            return total_urls
+    # Use session for connection reuse within engine
+    session = fetch.FetchSession()
+    try:
+        for page in range(max_pages):
+            try:
+                url = engine.build_url(query, page)
+                if config.scraper.debug:
+                    _log('%s page %d: %s' % (engine.name, page, url), 'debug')
+                content = session.fetch(url)
+                # Check for rate limiting
+                if engine.is_rate_limited(content):
+                    engine_tracker.mark_failure(ident)
+                    # Cycle to new circuit for next attempt
+                    session.cycle()
+                    return total_urls
+                if not content:
+                    consecutive_empty += 1
+                    if consecutive_empty >= config.scraper.fail_threshold:
+                        engine_tracker.mark_failure(ident)
+                        # Cycle to new circuit for next attempt
+                        session.cycle()
+                        return total_urls
+                    continue
+                # Extract URLs
+                urls = engine.extract_urls(content, urignore)
+                if not urls:
+                    # Empty results on first page likely means rate limited
+                    if page == 0:
+                        engine_tracker.mark_failure(ident)
+                        session.cycle()
+                    return total_urls
+                # Success
+                engine_tracker.mark_success(ident)
+                consecutive_empty = 0
+                # Deduplicate and insert
+                urls = list(set(urls))
+                source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
+                new_count = dbs.insert_urls(urls, source, sqlite)
+                total_urls += new_count
+                # Small delay between pages
+                time.sleep(random.uniform(1.0, 3.0))
+            except Exception as e:
+                engine_tracker.mark_failure(ident)
+                session.cycle()
+                return total_urls
+    finally:
+        session.close()
     return total_urls
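
The fetch module itself is not part of this diff, so the actual FetchSession API is not shown here. As a rough sketch of the pattern the commit relies on — one persistent HTTP session per engine for connection reuse, routed through a Tor SOCKS proxy, with cycle() requesting a fresh circuit over the control port — something like the following would fit. The class body, port numbers, and the use of requests and stem are assumptions for illustration, not the project's real code.

# Illustrative sketch only -- the real fetch.FetchSession is not in this commit.
# Assumes Tor's SOCKS proxy on 9050 and control port on 9051 (requests[socks] + stem).
import requests
from stem import Signal
from stem.control import Controller

class FetchSession:
    def __init__(self, socks_port=9050, control_port=9051, timeout=30):
        self.control_port = control_port
        self.timeout = timeout
        # requests.Session keeps TCP/TLS connections alive between page fetches
        self.session = requests.Session()
        proxy = 'socks5h://127.0.0.1:%d' % socks_port
        self.session.proxies = {'http': proxy, 'https': proxy}

    def fetch(self, url):
        # Return page text, or '' on any transport error (caller treats empty as a miss)
        try:
            resp = self.session.get(url, timeout=self.timeout)
            return resp.text if resp.ok else ''
        except requests.RequestException:
            return ''

    def cycle(self):
        # Ask Tor for a new circuit so the next attempt exits from a different address
        with Controller.from_port(port=self.control_port) as controller:
            controller.authenticate()
            controller.signal(Signal.NEWNYM)

    def close(self):
        self.session.close()

Pooling requests in one session keeps per-engine TLS handshakes to a minimum, while cycling only when a block or hard failure is detected avoids burning circuits on ordinary empty pages.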