scraper: reuse connections, cycle circuit on block
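Route all page fetches for an engine through a single fetch.FetchSession() so HTTP connections are reused across result pages. When the engine rate-limits us, empty responses reach the failure threshold, the first page yields no URLs, or a fetch raises, cycle the session to a fresh circuit before giving up, and always close the session when the engine is done.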
scraper.py
@@ -277,52 +277,64 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
     consecutive_empty = 0
     total_urls = 0
 
-    for page in range(max_pages):
-        try:
-            url = engine.build_url(query, page)
-
-            content = fetch.fetch_contents(url)
-            if config.scraper.debug:
-                _log('%s page %d: %s' % (engine.name, page, url), 'debug')
-
-            # Check for rate limiting
-            if engine.is_rate_limited(content):
-                engine_tracker.mark_failure(ident)
-                return total_urls
-
-            if not content:
-                consecutive_empty += 1
-                if consecutive_empty >= config.scraper.fail_threshold:
-                    engine_tracker.mark_failure(ident)
-                    return total_urls
-                continue
-
-            # Extract URLs
-            urls = engine.extract_urls(content, urignore)
-
-            if not urls:
-                # Empty results on first page likely means rate limited
-                if page == 0:
-                    engine_tracker.mark_failure(ident)
-                    return total_urls
-
-            # Success
-            engine_tracker.mark_success(ident)
-            consecutive_empty = 0
-
-            # Deduplicate and insert
-            urls = list(set(urls))
-            source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
-            new_count = dbs.insert_urls(urls, source, sqlite)
-            total_urls += new_count
-
-            # Small delay between pages
-            time.sleep(random.uniform(1.0, 3.0))
-
-        except Exception as e:
-            engine_tracker.mark_failure(ident)
-            return total_urls
+    # Use session for connection reuse within engine
+    session = fetch.FetchSession()
+
+    try:
+        for page in range(max_pages):
+            try:
+                url = engine.build_url(query, page)
+
+                if config.scraper.debug:
+                    _log('%s page %d: %s' % (engine.name, page, url), 'debug')
+
+                content = session.fetch(url)
+
+                # Check for rate limiting
+                if engine.is_rate_limited(content):
+                    engine_tracker.mark_failure(ident)
+                    # Cycle to new circuit for next attempt
+                    session.cycle()
+                    return total_urls
+
+                if not content:
+                    consecutive_empty += 1
+                    if consecutive_empty >= config.scraper.fail_threshold:
+                        engine_tracker.mark_failure(ident)
+                        # Cycle to new circuit for next attempt
+                        session.cycle()
+                        return total_urls
+                    continue
+
+                # Extract URLs
+                urls = engine.extract_urls(content, urignore)
+
+                if not urls:
+                    # Empty results on first page likely means rate limited
+                    if page == 0:
+                        engine_tracker.mark_failure(ident)
+                        session.cycle()
+                        return total_urls
+
+                # Success
+                engine_tracker.mark_success(ident)
+                consecutive_empty = 0
+
+                # Deduplicate and insert
+                urls = list(set(urls))
+                source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
+                new_count = dbs.insert_urls(urls, source, sqlite)
+                total_urls += new_count
+
+                # Small delay between pages
+                time.sleep(random.uniform(1.0, 3.0))
+
+            except Exception as e:
+                engine_tracker.mark_failure(ident)
+                session.cycle()
+                return total_urls
+    finally:
+        session.close()
 
     return total_urls
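For context, a minimal sketch of what a session object with this interface might look like. The diff only shows the call sites (fetch(), cycle(), close()); the sketch below assumes Tor, with requests pooling connections over a local SOCKS proxy and stem requesting a new circuit on cycle(). Every name and port here is an assumption, not the project's actual fetch.FetchSession:

# Hypothetical sketch of fetch.FetchSession -- the real implementation is not
# part of this diff. Assumes Tor listening on 9050 (SOCKS) and 9051 (control),
# and requests installed with SOCKS support (pip install requests[socks] stem).
import requests
from stem import Signal
from stem.control import Controller


class FetchSession:
    def __init__(self, socks_port=9050, control_port=9051, timeout=30):
        self.control_port = control_port
        self.timeout = timeout
        # socks5h so DNS resolution also happens over Tor
        proxy = 'socks5h://127.0.0.1:%d' % socks_port
        self.proxies = {'http': proxy, 'https': proxy}
        self.session = self._new_session()

    def _new_session(self):
        session = requests.Session()
        session.proxies = self.proxies
        return session

    def fetch(self, url):
        # requests.Session pools connections, giving the reuse across pages
        try:
            resp = self.session.get(url, timeout=self.timeout)
            return resp.text if resp.ok else None
        except requests.RequestException:
            return None

    def cycle(self):
        # Ask Tor for a fresh circuit, then drop pooled connections so the
        # next request actually travels over it
        with Controller.from_port(port=self.control_port) as controller:
            controller.authenticate()
            controller.signal(Signal.NEWNYM)
        self.session.close()
        self.session = self._new_session()

    def close(self):
        self.session.close()

Under these assumptions, dropping the pooled connections after NEWNYM is the important step: Tor does not move an already-open stream to the new circuit, so a session that kept its keep-alive sockets would keep using the old exit node that just got blocked.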