#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Multi-lingual search term generation with LibreTranslate support."""

import random
import os
import io
import json
import time
import datetime

from misc import _log

# Current year for freshness terms
CURRENT_YEAR = str(datetime.datetime.now().year)

# Module-level configuration (set by set_config())
_libretranslate_url = 'https://lt.mymx.me/translate'
_libretranslate_enabled = True
_libretranslate_timeout = 10

# Base English terms - categorized for better coverage
BASE_TERMS_GENERIC = [
    'free proxy list',
    'proxy server list',
    'public proxy list',
    'open proxy list',
]

BASE_TERMS_PROTOCOL = [
    'socks5 proxy list',
    'socks4 proxy list',
    'http proxy list',
    'https proxy list',
]

BASE_TERMS_ANONYMITY = [
    'anonymous proxy',
    'elite proxy',
    'high anonymity proxy',
    'transparent proxy list',
]

BASE_TERMS_FRESHNESS = [
    'fresh proxy list',
    'working proxy list',
    'verified proxy list',
    'checked proxy list',
    'live proxy list',
    'proxy list today',
    'proxy list updated',
    'proxy list ' + CURRENT_YEAR,
    'new proxy list ' + CURRENT_YEAR,
]

BASE_TERMS_FORMAT = [
    'proxy list txt',
    'proxy list ip port',
    'proxy list download',
    'proxy txt file',
]

BASE_TERMS_SOURCES = [
    'proxy pastebin',
    'proxy github',
    'proxy list telegram',
    'free proxy api',
]

BASE_TERMS_GEOGRAPHIC = [
    'US proxy list',
    'USA proxy',
    'Europe proxy list',
    'Asia proxy list',
    'Russia proxy list',
    'China proxy list',
]

BASE_TERMS_USECASE = [
    'proxy for scraping',
    'fast proxy list',
    'residential proxy list',
    'datacenter proxy list',
]

BASE_TERMS_SEARCH_OPS = [
    'filetype:txt proxy list',
    'inurl:proxy.txt',
    'inurl:proxies.txt',
    'intitle:proxy list',
]

# Combined list for random selection
BASE_TERMS = (
    BASE_TERMS_GENERIC +
    BASE_TERMS_PROTOCOL +
    BASE_TERMS_ANONYMITY +
    BASE_TERMS_FRESHNESS +
    BASE_TERMS_FORMAT +
    BASE_TERMS_SOURCES +
    BASE_TERMS_GEOGRAPHIC +
    BASE_TERMS_USECASE +
    BASE_TERMS_SEARCH_OPS
)

# Terms that should be translated (exclude search operators and technical terms)
TRANSLATABLE_TERMS = (
    BASE_TERMS_GENERIC +
    BASE_TERMS_ANONYMITY +
    BASE_TERMS_FRESHNESS
)

# Static translations - no API needed
# Format: {lang_code: {english_term: translated_term}}
STATIC_TRANSLATIONS = {
    'ru': {
        'free proxy list': u'бесплатный список прокси',
        'socks5 proxy': u'socks5 прокси',
        'socks4 proxy': u'socks4 прокси',
        'http proxy': u'http прокси',
        'proxy server list': u'список прокси серверов',
        'anonymous proxy': u'анонимный прокси',
        'elite proxy': u'элитный прокси',
        'fresh proxy': u'свежие прокси',
        'working proxy': u'рабочие прокси',
        'proxy list updated': u'обновленный список прокси',
    },
    'zh': {
        'free proxy list': u'免费代理列表',
        'socks5 proxy': u'socks5代理',
        'socks4 proxy': u'socks4代理',
        'http proxy': u'http代理',
        'proxy server list': u'代理服务器列表',
        'anonymous proxy': u'匿名代理',
        'elite proxy': u'高匿代理',
        'fresh proxy': u'最新代理',
        'working proxy': u'可用代理',
        'proxy list updated': u'代理列表更新',
    },
    'es': {
        'free proxy list': u'lista de proxies gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anónimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies frescos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxies actualizada',
    },
    'pt': {
        'free proxy list': u'lista de proxy grátis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anônimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies novos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxy atualizada',
    },
    'de': {
        'free proxy list': u'kostenlose Proxy-Liste',
        'socks5 proxy': u'socks5 Proxy',
        'socks4 proxy': u'socks4 Proxy',
        'http proxy': u'http Proxy',
        'proxy server list': u'Proxy-Server-Liste',
        'anonymous proxy': u'anonymer Proxy',
        'elite proxy': u'Elite-Proxy',
        'fresh proxy': u'frische Proxys',
        'working proxy': u'funktionierende Proxys',
        'proxy list updated': u'aktualisierte Proxy-Liste',
    },
    'fr': {
        'free proxy list': u'liste de proxy gratuit',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'liste de serveurs proxy',
        'anonymous proxy': u'proxy anonyme',
        'elite proxy': u'proxy élite',
        'fresh proxy': u'proxies frais',
        'working proxy': u'proxies fonctionnels',
        'proxy list updated': u'liste de proxy mise à jour',
    },
    'ja': {
        'free proxy list': u'無料プロキシリスト',
        'socks5 proxy': u'socks5プロキシ',
        'socks4 proxy': u'socks4プロキシ',
        'http proxy': u'httpプロキシ',
        'proxy server list': u'プロキシサーバーリスト',
        'anonymous proxy': u'匿名プロキシ',
        'elite proxy': u'エリートプロキシ',
        'fresh proxy': u'最新プロキシ',
        'working proxy': u'動作するプロキシ',
        'proxy list updated': u'プロキシリスト更新',
    },
    'ko': {
        'free proxy list': u'무료 프록시 목록',
        'socks5 proxy': u'socks5 프록시',
        'socks4 proxy': u'socks4 프록시',
        'http proxy': u'http 프록시',
        'proxy server list': u'프록시 서버 목록',
        'anonymous proxy': u'익명 프록시',
        'elite proxy': u'엘리트 프록시',
        'fresh proxy': u'최신 프록시',
        'working proxy': u'작동하는 프록시',
        'proxy list updated': u'프록시 목록 업데이트',
    },
    'ar': {
        'free proxy list': u'قائمة بروكسي مجانية',
        'socks5 proxy': u'بروكسي socks5',
        'socks4 proxy': u'بروكسي socks4',
        'http proxy': u'بروكسي http',
        'proxy server list': u'قائمة خوادم البروكسي',
        'anonymous proxy': u'بروكسي مجهول',
        'elite proxy': u'بروكسي نخبة',
        'fresh proxy': u'بروكسي جديد',
        'working proxy': u'بروكسي يعمل',
        'proxy list updated': u'قائمة بروكسي محدثة',
    },
    'id': {
        'free proxy list': u'daftar proxy gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'daftar server proxy',
        'anonymous proxy': u'proxy anonim',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxy baru',
        'working proxy': u'proxy aktif',
        'proxy list updated': u'daftar proxy diperbarui',
    },
    'tr': {
        'free proxy list': u'ücretsiz proxy listesi',
        'socks5 proxy': u'socks5 proxy',
        'socks4 proxy': u'socks4 proxy',
        'http proxy': u'http proxy',
        'proxy server list': u'proxy sunucu listesi',
        'anonymous proxy': u'anonim proxy',
        'elite proxy': u'elit proxy',
        'fresh proxy': u'güncel proxy',
        'working proxy': u'çalışan proxy',
        'proxy list updated': u'güncellenmiş proxy listesi',
    },
    'vi': {
        'free proxy list': u'danh sách proxy miễn phí',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'danh sách máy chủ proxy',
        'anonymous proxy': u'proxy ẩn danh',
        'elite proxy': u'proxy cao cấp',
        'fresh proxy': u'proxy mới',
        'working proxy': u'proxy hoạt động',
        'proxy list updated': u'danh sách proxy cập nhật',
    },
    'th': {
        'free proxy list': u'รายการพร็อกซี่ฟรี',
        'socks5 proxy': u'พร็อกซี่ socks5',
        'socks4 proxy': u'พร็อกซี่ socks4',
        'http proxy': u'พร็อกซี่ http',
        'proxy server list': u'รายการเซิร์ฟเวอร์พร็อกซี่',
        'anonymous proxy': u'พร็อกซี่นิรนาม',
        'elite proxy': u'พร็อกซี่ระดับสูง',
        'fresh proxy': u'พร็อกซี่ใหม่',
        'working proxy': u'พร็อกซี่ใช้งานได้',
        'proxy list updated': u'รายการพร็อกซี่อัพเดท',
    },
    'pl': {
        'free proxy list': u'darmowa lista proxy',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista serwerów proxy',
        'anonymous proxy': u'anonimowe proxy',
        'elite proxy': u'elitarne proxy',
        'fresh proxy': u'świeże proxy',
        'working proxy': u'działające proxy',
        'proxy list updated': u'zaktualizowana lista proxy',
    },
    'uk': {
        'free proxy list': u'безкоштовний список проксі',
        'socks5 proxy': u'socks5 проксі',
        'socks4 proxy': u'socks4 проксі',
        'http proxy': u'http проксі',
        'proxy server list': u'список проксі серверів',
        'anonymous proxy': u'анонімний проксі',
        'elite proxy': u'елітний проксі',
        'fresh proxy': u'свіжі проксі',
        'working proxy': u'робочі проксі',
        'proxy list updated': u'оновлений список проксі',
    },
}

# All available languages
LANGUAGES = list(STATIC_TRANSLATIONS.keys()) + ['en']

# LibreTranslate available languages (populated dynamically)
_libretranslate_langs = set()
_libretranslate_langs_checked = False

# Cache for online translations
_translation_cache = {}
_cache_file = 'translation_cache.json'
_cache_max_age = 86400 * 30  # 30 days
_failed_translations = {}  # Track failed translations to avoid repeated API calls
_failed_cache_ttl = 3600  # 1 hour before retrying failed translations


def set_config(config):
    """Configure translation settings from config object.

    When the configured URL differs from the current one, the known
    language list is discarded so it is fetched again from the new server
    instead of keeping the answer of the previous endpoint.

    Args:
        config: Config object with scraper.libretranslate_url and
            scraper.libretranslate_enabled attributes
    """
    global _libretranslate_url, _libretranslate_enabled
    global _libretranslate_langs, _libretranslate_langs_checked

    # hasattr(None, ...) is False, so a missing scraper section is a no-op.
    scraper = getattr(config, 'scraper', None)
    if hasattr(scraper, 'libretranslate_url'):
        new_url = scraper.libretranslate_url
        if new_url != _libretranslate_url:
            _libretranslate_url = new_url
            # Language list belongs to the old endpoint: force a refetch.
            _libretranslate_langs = set()
            _libretranslate_langs_checked = False
    if hasattr(scraper, 'libretranslate_enabled'):
        _libretranslate_enabled = scraper.libretranslate_enabled

    if _libretranslate_enabled:
        _fetch_available_languages()
        _log('LibreTranslate: enabled (%s) - %d languages' % (
            _libretranslate_url, len(_libretranslate_langs)
        ), 'info')
    else:
        _log('LibreTranslate: disabled', 'debug')


def _request_json(url, payload=None):
    """Perform an HTTP request and return the decoded JSON body.

    Args:
        url: Endpoint URL
        payload: JSON-encoded request body; the request is a POST when
            given and a GET when None

    Returns:
        Decoded JSON document

    Raises:
        Whatever urllib2 or json raise; callers handle failures.
    """
    import urllib2
    req = urllib2.Request(url, payload)
    if payload is None:
        req.add_header('Accept', 'application/json')
    else:
        req.add_header('Content-Type', 'application/json')
    req.add_header('User-Agent', 'PPF/1.0')
    resp = urllib2.urlopen(req, timeout=_libretranslate_timeout)
    try:
        return json.loads(resp.read())
    finally:
        # Always release the connection, even when decoding fails.
        resp.close()


def _normalize_lang_codes(codes):
    """Return the set of usable target codes from raw API language codes.

    Every code starting with 'zh' is folded into plain 'zh', empty codes
    are dropped and 'en' (the source language) is removed.
    """
    normalized = set()
    for code in codes:
        if not code:
            continue
        normalized.add('zh' if code.startswith('zh') else code)
    normalized.discard('en')
    return normalized


def _fetch_available_languages():
    """Fetch available languages from LibreTranslate API.

    Queries the /languages endpoint and extracts languages that can be
    translated from English (source='en'). When the English entry lists no
    targets, all advertised language codes are used instead.

    The "checked" flag is set before the request, so a failed fetch is not
    retried until set_config() switches to a different URL.
    """
    global _libretranslate_langs, _libretranslate_langs_checked

    if _libretranslate_langs_checked:
        return
    _libretranslate_langs_checked = True

    # Derive base URL from translate endpoint
    base_url = _libretranslate_url.rsplit('/', 1)[0]
    languages_url = base_url + '/languages'

    try:
        langs = _request_json(languages_url)

        # Find English entry to get available target languages
        en_targets = set()
        for lang in langs:
            if lang.get('code') == 'en':
                en_targets = set(lang.get('targets', []))
                break

        if en_targets:
            _libretranslate_langs = _normalize_lang_codes(en_targets)
        else:
            # Fallback: every advertised language code
            _libretranslate_langs = _normalize_lang_codes(
                lang.get('code', '') for lang in langs
            )

        _log('LibreTranslate languages: %s' % ', '.join(sorted(_libretranslate_langs)), 'debug')
    except Exception as e:
        _log('failed to fetch LibreTranslate languages: %s' % str(e), 'warn')
        _libretranslate_langs = set()


def _load_cache():
    """Load translation cache from disk.

    Accepts both the old format (a bare mapping of translations) and the
    new format (mapping under 'translations' next to '_meta'). Anything
    that is not a JSON object is ignored so the in-memory cache is always
    a dict.
    """
    global _translation_cache

    if not os.path.exists(_cache_file):
        return
    try:
        # The cache is written as UTF-8; a decode error is a ValueError.
        with io.open(_cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (IOError, ValueError) as e:
        _log('cache load failed: %s' % str(e), 'debug')
        _translation_cache = {}
        return

    # Handle both old format (just translations) and new format (with metadata)
    if isinstance(data, dict) and '_meta' in data:
        data = data.get('translations', {})
    _translation_cache = data if isinstance(data, dict) else {}


def _save_cache():
    """Save translation cache to disk (best effort).

    The document is serialized first and then written through a UTF-8 text
    stream. Writing ensure_ascii=False output straight to a byte-mode file
    fails on Python 2 as soon as a translation contains non-ASCII text.
    """
    data = {
        '_meta': {
            'version': 1,
            'updated': int(time.time()),
            'count': len(_translation_cache)
        },
        'translations': _translation_cache
    }
    try:
        payload = json.dumps(data, ensure_ascii=False, indent=2)
        # json.dumps may return a byte string when everything is ASCII.
        if isinstance(payload, bytes):
            payload = payload.decode('utf-8')
        with io.open(_cache_file, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (IOError, OSError, ValueError, TypeError) as e:
        # Persistence is optional; never let it break a translation.
        _log('cache save failed: %s' % str(e), 'debug')


def translate_libretranslate(text, target_lang, source_lang='en', api_url=None):
    """Translate text using LibreTranslate API.

    Results are cached (in memory and on disk) for _cache_max_age seconds;
    failures are remembered for _failed_cache_ttl seconds so the API is
    not called again for the same request in the meantime.

    Args:
        text: Text to translate
        target_lang: Target language code (e.g., 'ru', 'zh')
        source_lang: Source language code (default: 'en')
        api_url: LibreTranslate API URL (uses configured default if None)

    Returns:
        Translated text or None on failure
    """
    if not _libretranslate_enabled:
        return None

    if api_url is None:
        api_url = _libretranslate_url

    # Map language codes (e.g., zh -> zh-Hans for the API)
    # NOTE(review): _fetch_available_languages folds every zh* code into
    # 'zh', so 'zh-Hans' is sent whichever variant the server advertised -
    # confirm against the deployed server.
    api_target = 'zh-Hans' if target_lang == 'zh' else target_lang

    # Check if target language is available (fetch if not checked yet)
    if not _libretranslate_langs_checked:
        _fetch_available_languages()
    if target_lang not in _libretranslate_langs:
        return None

    cache_key = '%s:%s:%s' % (source_lang, target_lang, text)

    # Check cache first
    if cache_key in _translation_cache:
        cached = _translation_cache[cache_key]
        # Handle both old format (string) and new format (dict with timestamp)
        if not isinstance(cached, dict):
            return cached
        if time.time() - cached.get('time', 0) < _cache_max_age:
            return cached.get('text')
        # Expired entry: fall through and translate again.

    # Check if we recently failed this translation
    failed_at = _failed_translations.get(cache_key)
    if failed_at is not None and time.time() - failed_at < _failed_cache_ttl:
        return None

    try:
        payload = json.dumps({
            'q': text,
            'source': source_lang,
            'target': api_target,
            'format': 'text',
        })
        result = _request_json(api_url, payload)

        if not isinstance(result, dict) or 'translatedText' not in result:
            # Remember unusable answers too, otherwise every call retries.
            _failed_translations[cache_key] = time.time()
            _log('translation failed [%s->%s]: %s' % (
                source_lang, target_lang, 'no translatedText in response'
            ), 'debug')
            return None

        translated = result['translatedText']
        # Store with timestamp for cache expiry
        _translation_cache[cache_key] = {
            'text': translated,
            'time': int(time.time())
        }
        _save_cache()
        _log('translated [%s]: %s -> %s' % (target_lang, text, translated), 'debug')
        return translated
    except Exception as e:
        _failed_translations[cache_key] = time.time()
        _log('translation failed [%s->%s]: %s' % (source_lang, target_lang, str(e)), 'debug')

    return None


def get_cache_stats():
    """Return cache statistics.

    Returns:
        dict with cache stats
    """
    return {
        'entries': len(_translation_cache),
        'failed_pending': len(_failed_translations),
        'file': _cache_file,
        'enabled': _libretranslate_enabled,
        'url': _libretranslate_url,
        'api_languages': len(_libretranslate_langs),
    }


def get_translated_term(term=None, lang=None, use_api=True):
    """Get a search term, optionally translated.

    Lookup order for non-English languages: static translations, then the
    LibreTranslate API. If neither yields a translation, a random English
    term from BASE_TERMS is returned with lang 'en'.

    Args:
        term: Specific term to translate (or random if None)
        lang: Target language (or random if None)
        use_api: Whether to use LibreTranslate API for missing translations

    Returns:
        (term, lang) tuple
    """
    # Expand language list to include LibreTranslate-only languages
    all_langs = list(set(LANGUAGES) | _libretranslate_langs)

    if lang is None:
        lang = random.choice(all_langs) if all_langs else 'en'

    if lang == 'en':
        # For English, use full term list
        if term is None:
            term = random.choice(BASE_TERMS)
        return term, lang

    # For other languages, pick from translatable terms
    if term is None:
        term = random.choice(TRANSLATABLE_TERMS)

    # Try static translations first
    static_terms = STATIC_TRANSLATIONS.get(lang, {})
    if term in static_terms:
        return static_terms[term], lang

    # Try LibreTranslate API for missing translations
    if use_api and _libretranslate_enabled and lang in _libretranslate_langs:
        translated = translate_libretranslate(term, lang)
        if translated:
            return translated, lang

    # Fall back to English with full term list
    return random.choice(BASE_TERMS), 'en'


def get_random_search_term():
    """Get a random search term in a random language.

    Returns:
        Translated search term string
    """
    return get_translated_term()[0]


def get_all_terms_for_language(lang):
    """Get all search terms for a specific language.

    Args:
        lang: Language code

    Returns:
        List of translated terms; the English BASE_TERMS for 'en' and for
        languages without static translations
    """
    if lang != 'en' and lang in STATIC_TRANSLATIONS:
        return list(STATIC_TRANSLATIONS[lang].values())
    return BASE_TERMS[:]


def get_mixed_terms(count=5, english_weight=0.3):
    """Get a mix of terms from different languages.

    Args:
        count: Number of terms to return
        english_weight: Probability of including English terms

    Returns:
        List of search terms in various languages
    """
    return [
        random.choice(BASE_TERMS) if random.random() < english_weight
        else get_random_search_term()
        for _ in range(count)
    ]


def _printable(term):
    """Return term as a UTF-8 byte string when it is a unicode object."""
    if isinstance(term, unicode):
        return term.encode('utf-8')
    return term


# Load cache on module import
_load_cache()


if __name__ == '__main__':
    import sys

    # Fetch available languages from API
    _fetch_available_languages()

    # Test output
    print('LibreTranslate: %s' % ('enabled' if _libretranslate_enabled else 'disabled'))
    print('API URL: %s' % _libretranslate_url)
    print('Static languages: %s' % ', '.join(sorted(STATIC_TRANSLATIONS.keys())))
    api_only = _libretranslate_langs - set(STATIC_TRANSLATIONS.keys())
    print('API-only languages: %s' % (', '.join(sorted(api_only)) if api_only else 'none'))
    print('')

    # Cache stats
    stats = get_cache_stats()
    print('Cache: %d entries in %s' % (stats['entries'], stats['file']))
    print('')

    print('Sample static translations:')
    for lang in sorted(STATIC_TRANSLATIONS.keys())[:5]:
        term, _ = get_translated_term('free proxy list', lang, use_api=False)
        print('  [%s] %s' % (lang, _printable(term)))
    print('')

    # Test LibreTranslate if --test-api flag
    if '--test-api' in sys.argv:
        print('Testing LibreTranslate API...')
        # Use languages that are API-available but not in static translations
        test_langs = list(api_only)[:5] if api_only else ['fr', 'ar']
        for lang in test_langs:
            term, result_lang = get_translated_term('free proxy list', lang, use_api=True)
            print('  [%s] %s' % (result_lang, _printable(term)))
        print('')
        stats = get_cache_stats()
        print('Cache after API test: %d entries' % stats['entries'])
    else:
        print('Run with --test-api to test LibreTranslate API')
        print('')

    print('Random mixed terms:')
    for term in get_mixed_terms(10, english_weight=0.2):
        # Format explicitly: print(a, b) prints a tuple on Python 2.
        print('  %s' % _printable(term))