scraper: add Bing and Yahoo engines
This commit is contained in:
98
scraper.py
98
scraper.py
@@ -23,6 +23,97 @@ config = Config()
|
|||||||
# Default search terms (can be overridden by search_terms.txt)
search_terms = ['free proxy list', 'socks5 proxy', 'http proxy']
|
||||||
|
|
||||||
|
# File extensions that are NOT proxy lists (binary/media content).
# Any URL whose path ends with one of these is skipped outright.
SKIP_EXTENSIONS = frozenset({
    # Images
    '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico', '.bmp', '.tiff',
    # Video
    '.mp4', '.webm', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.m4v',
    # Audio
    '.mp3', '.wav', '.ogg', '.flac', '.aac', '.wma', '.m4a',
    # Documents (non-text)
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt',
    # Archives
    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.xz', '.tgz',
    # Executables/binary
    '.exe', '.bin', '.dll', '.so', '.dmg', '.apk', '.deb', '.rpm',
    # Other non-text
    '.iso', '.img', '.torrent', '.swf', '.woff', '.woff2', '.ttf', '.eot',
})

# str.endswith accepts a tuple of suffixes, testing them in one C-level call.
# Build the tuple once at import time instead of looping per URL.
_SKIP_EXT_SUFFIXES = tuple(SKIP_EXTENSIONS)

# Domains that are unlikely to contain proxy lists.
# Matched exactly and as a parent of any subdomain (see is_valid_proxy_list_url).
SKIP_DOMAINS = frozenset({
    # Social media
    'facebook.com', 'twitter.com', 'x.com', 'instagram.com', 'tiktok.com',
    'linkedin.com', 'pinterest.com', 'snapchat.com', 'reddit.com',
    # Video platforms
    'youtube.com', 'youtu.be', 'vimeo.com', 'dailymotion.com', 'twitch.tv',
    # Image hosting
    'imgur.com', 'flickr.com', 'photobucket.com',
    # App stores
    'play.google.com', 'apps.apple.com', 'microsoft.com',
    # Cloud storage (not direct links)
    'dropbox.com', 'drive.google.com', 'onedrive.live.com',
    # News/media (unlikely proxy sources)
    'cnn.com', 'bbc.com', 'nytimes.com', 'theguardian.com',
    # Shopping
    'amazon.com', 'ebay.com', 'aliexpress.com', 'alibaba.com',
    # Wikipedia (info, not lists)
    'wikipedia.org', 'wikimedia.org',
})


def is_valid_proxy_list_url(url):
    """Check if URL could plausibly be a proxy list page.

    A URL is rejected when its path ends in a known binary/media extension
    (SKIP_EXTENSIONS) or when its host is, or is a subdomain of, a domain in
    SKIP_DOMAINS. Parsing is best-effort: a URL we cannot parse is kept.

    Returns True if the URL should be kept, False if it should be skipped.
    """
    if not url:
        return False

    url_lower = url.lower()

    # Check file extension.
    # Strip the fragment as well as the query string: stripping only '?'
    # would let 'photo.jpg#gallery' slip past the extension filter.
    path = url_lower.split('#', 1)[0].split('?', 1)[0]
    if path.endswith(_SKIP_EXT_SUFFIXES):
        return False

    # Check domain
    try:
        # Handle both scheme-full (http://, https://) and scheme-less URLs.
        if '://' in url_lower:
            netloc = url_lower.split('://', 1)[1].split('/', 1)[0]
        else:
            netloc = url_lower.split('/', 1)[0]
        # Drop userinfo ('user:pass@host') BEFORE splitting off the port;
        # splitting on ':' first would yield 'user' as the domain and let
        # 'http://x@facebook.com/' bypass the skip list.
        netloc = netloc.rsplit('@', 1)[-1]
        # Remove port if present
        domain_part = netloc.split(':', 1)[0]
        # Check against skip domains (including subdomains)
        for skip_domain in SKIP_DOMAINS:
            if domain_part == skip_domain or domain_part.endswith('.' + skip_domain):
                return False
    except Exception:
        pass  # Best effort: if parsing fails, keep the URL

    return True
||||||
|
|
||||||
|
|
||||||
|
def filter_urls(urls):
    """Filter out URLs that are unlikely to be proxy lists.

    Each URL is vetted with is_valid_proxy_list_url(); rejected ones are
    only counted, not kept.

    Returns (valid_urls, skipped_count).
    """
    kept = []
    dropped = 0
    for candidate in urls:
        if not is_valid_proxy_list_url(candidate):
            dropped += 1
            continue
        kept.append(candidate)
    return kept, dropped
|
||||||
|
|
||||||
# Load Searx instances if file exists
|
# Load Searx instances if file exists
|
||||||
searx_instances = []
|
searx_instances = []
|
||||||
if os.path.exists('searx.instances'):
|
if os.path.exists('searx.instances'):
|
||||||
@@ -318,8 +409,13 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
|
|||||||
engine_tracker.mark_success(ident)
|
engine_tracker.mark_success(ident)
|
||||||
consecutive_empty = 0
|
consecutive_empty = 0
|
||||||
|
|
||||||
# Deduplicate and insert
|
# Deduplicate and filter invalid URLs
|
||||||
urls = list(set(urls))
|
urls = list(set(urls))
|
||||||
|
urls, skipped = filter_urls(urls)
|
||||||
|
if skipped and config.scraper.debug:
|
||||||
|
_log('%s: skipped %d invalid URLs' % (engine.name, skipped), 'debug')
|
||||||
|
|
||||||
|
if urls:
|
||||||
source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
|
source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
|
||||||
new_count = dbs.insert_urls(urls, source, sqlite)
|
new_count = dbs.insert_urls(urls, source, sqlite)
|
||||||
total_urls += new_count
|
total_urls += new_count
|
||||||
|
|||||||
Reference in New Issue
Block a user