From e89db20f5b5f0f5c858a88f3374e1c640af6f2c1 Mon Sep 17 00:00:00 2001
From: Username
Date: Sun, 28 Dec 2025 15:19:39 +0100
Subject: [PATCH] scraper: filter out URLs unlikely to be proxy lists

---
 scraper.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 100 insertions(+), 4 deletions(-)

diff --git a/scraper.py b/scraper.py
index 3ad9eba..b208c53 100755
--- a/scraper.py
+++ b/scraper.py
@@ -23,6 +23,97 @@ config = Config()
 # Default search terms (can be overridden by search_terms.txt)
 search_terms = ['free proxy list', 'socks5 proxy', 'http proxy']
 
+# File extensions that are NOT proxy lists (binary/media content)
+SKIP_EXTENSIONS = frozenset({
+    # Images
+    '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico', '.bmp', '.tiff',
+    # Video
+    '.mp4', '.webm', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.m4v',
+    # Audio
+    '.mp3', '.wav', '.ogg', '.flac', '.aac', '.wma', '.m4a',
+    # Documents (non-text)
+    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt',
+    # Archives
+    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.xz', '.tgz',
+    # Executables/binary
+    '.exe', '.bin', '.dll', '.so', '.dmg', '.apk', '.deb', '.rpm',
+    # Other non-text
+    '.iso', '.img', '.torrent', '.swf', '.woff', '.woff2', '.ttf', '.eot',
+})
+
+# Domains that are unlikely to contain proxy lists
+SKIP_DOMAINS = frozenset({
+    # Social media
+    'facebook.com', 'twitter.com', 'x.com', 'instagram.com', 'tiktok.com',
+    'linkedin.com', 'pinterest.com', 'snapchat.com', 'reddit.com',
+    # Video platforms
+    'youtube.com', 'youtu.be', 'vimeo.com', 'dailymotion.com', 'twitch.tv',
+    # Image hosting
+    'imgur.com', 'flickr.com', 'photobucket.com',
+    # App stores
+    'play.google.com', 'apps.apple.com', 'microsoft.com',
+    # Cloud storage (not direct links)
+    'dropbox.com', 'drive.google.com', 'onedrive.live.com',
+    # News/media (unlikely proxy sources)
+    'cnn.com', 'bbc.com', 'nytimes.com', 'theguardian.com',
+    # Shopping
+    'amazon.com', 'ebay.com', 'aliexpress.com', 'alibaba.com',
+    # Wikipedia (info, not lists)
+    'wikipedia.org', 'wikimedia.org',
+})
+
+
+def is_valid_proxy_list_url(url):
+    """Check if URL could plausibly be a proxy list page.
+
+    Returns True if the URL should be kept, False if it should be skipped.
+    """
+    if not url:
+        return False
+
+    url_lower = url.lower()
+
+    # Check file extension
+    # Extract path without query string
+    path = url_lower.split('?')[0]
+    for ext in SKIP_EXTENSIONS:
+        if path.endswith(ext):
+            return False
+
+    # Check domain
+    # Extract domain from URL
+    try:
+        # Handle both http:// and https://
+        if '://' in url_lower:
+            domain_part = url_lower.split('://')[1].split('/')[0]
+        else:
+            domain_part = url_lower.split('/')[0]
+        # Remove port if present
+        domain_part = domain_part.split(':')[0]
+        # Check against skip domains (including subdomains)
+        for skip_domain in SKIP_DOMAINS:
+            if domain_part == skip_domain or domain_part.endswith('.' + skip_domain):
+                return False
+    except Exception:
+        pass  # If parsing fails, keep the URL
+
+    return True
+
+
+def filter_urls(urls):
+    """Filter out URLs that are unlikely to be proxy lists.
+
+    Returns (valid_urls, skipped_count).
+    """
+    valid = []
+    skipped = 0
+    for url in urls:
+        if is_valid_proxy_list_url(url):
+            valid.append(url)
+        else:
+            skipped += 1
+    return valid, skipped
+
 # Load Searx instances if file exists
 searx_instances = []
 if os.path.exists('searx.instances'):
@@ -318,11 +409,16 @@ def scrape_engine(engine, ident, query, urignore, sqlite):
             engine_tracker.mark_success(ident)
             consecutive_empty = 0
 
-            # Deduplicate and insert
+            # Deduplicate and filter invalid URLs
             urls = list(set(urls))
-            source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
-            new_count = dbs.insert_urls(urls, source, sqlite)
-            total_urls += new_count
+            urls, skipped = filter_urls(urls)
+            if skipped and config.scraper.debug:
+                _log('%s: skipped %d invalid URLs' % (engine.name, skipped), 'debug')
+
+            if urls:
+                source = '%s (page %d, query: %s)' % (engine.name, page, query[:50])
+                new_count = dbs.insert_urls(urls, source, sqlite)
+                total_urls += new_count
 
             # Small delay between pages
             time.sleep(random.uniform(1.0, 3.0))