#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Multi-lingual search term generation with LibreTranslate support."""

import datetime
import io
import json
import os
import random
import sys
import time
from contextlib import closing

try:
    from misc import _log
except ImportError:
    # Allows the module (and its __main__ self-test) to run outside the
    # project tree, where the shared logging helper is not importable.
    def _log(msg, level='info'):
        """Minimal stand-in for misc._log: write the message to stderr."""
        if not isinstance(msg, str):
            # Python 2 unicode -> UTF-8 bytes so an ASCII stderr cannot raise.
            msg = msg.encode('utf-8', 'replace')
        sys.stderr.write('[%s] %s\n' % (level, msg))

# Current year for freshness terms
CURRENT_YEAR = str(datetime.datetime.now().year)

# Module-level configuration (set by set_config())
_libretranslate_url = 'https://lt.mymx.me/translate'
_libretranslate_enabled = True
_libretranslate_timeout = 10

# Base English terms - categorized for better coverage
BASE_TERMS_GENERIC = [
    'free proxy list',
    'proxy server list',
    'public proxy list',
    'open proxy list',
]

BASE_TERMS_PROTOCOL = [
    'socks5 proxy list',
    'socks4 proxy list',
    'http proxy list',
    'https proxy list',
]

BASE_TERMS_ANONYMITY = [
    'anonymous proxy',
    'elite proxy',
    'high anonymity proxy',
    'transparent proxy list',
]

BASE_TERMS_FRESHNESS = [
    'fresh proxy list',
    'working proxy list',
    'verified proxy list',
    'checked proxy list',
    'live proxy list',
    'proxy list today',
    'proxy list updated',
    'proxy list ' + CURRENT_YEAR,
    'new proxy list ' + CURRENT_YEAR,
]

BASE_TERMS_FORMAT = [
    'proxy list txt',
    'proxy list ip port',
    'proxy list download',
    'proxy txt file',
]

BASE_TERMS_SOURCES = [
    'proxy pastebin',
    'proxy github',
    'proxy list telegram',
    'free proxy api',
]

BASE_TERMS_GEOGRAPHIC = [
    'US proxy list',
    'USA proxy',
    'Europe proxy list',
    'Asia proxy list',
    'Russia proxy list',
    'China proxy list',
]

BASE_TERMS_USECASE = [
    'proxy for scraping',
    'fast proxy list',
    'residential proxy list',
    'datacenter proxy list',
]

BASE_TERMS_SEARCH_OPS = [
    'filetype:txt proxy list',
    'inurl:proxy.txt',
    'inurl:proxies.txt',
    'intitle:proxy list',
]

# Combined list for random selection
BASE_TERMS = (
    BASE_TERMS_GENERIC +
    BASE_TERMS_PROTOCOL +
    BASE_TERMS_ANONYMITY +
    BASE_TERMS_FRESHNESS +
    BASE_TERMS_FORMAT +
    BASE_TERMS_SOURCES +
    BASE_TERMS_GEOGRAPHIC +
    BASE_TERMS_USECASE +
    BASE_TERMS_SEARCH_OPS
)

# Terms that should be translated (exclude search operators and technical terms)
TRANSLATABLE_TERMS = (
    BASE_TERMS_GENERIC +
    BASE_TERMS_ANONYMITY +
    BASE_TERMS_FRESHNESS
)

# Static translations - no API needed
# Format: {lang_code: {english_term: translated_term}}
STATIC_TRANSLATIONS = {
    'ru': {
        'free proxy list': u'бесплатный список прокси',
        'socks5 proxy': u'socks5 прокси',
        'socks4 proxy': u'socks4 прокси',
        'http proxy': u'http прокси',
        'proxy server list': u'список прокси серверов',
        'anonymous proxy': u'анонимный прокси',
        'elite proxy': u'элитный прокси',
        'fresh proxy': u'свежие прокси',
        'working proxy': u'рабочие прокси',
        'proxy list updated': u'обновленный список прокси',
    },
    'zh': {
        'free proxy list': u'免费代理列表',
        'socks5 proxy': u'socks5代理',
        'socks4 proxy': u'socks4代理',
        'http proxy': u'http代理',
        'proxy server list': u'代理服务器列表',
        'anonymous proxy': u'匿名代理',
        'elite proxy': u'高匿代理',
        'fresh proxy': u'最新代理',
        'working proxy': u'可用代理',
        'proxy list updated': u'代理列表更新',
    },
    'es': {
        'free proxy list': u'lista de proxies gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anónimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies frescos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxies actualizada',
    },
    'pt': {
        'free proxy list': u'lista de proxy grátis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anônimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies novos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxy atualizada',
    },
    'de': {
        'free proxy list': u'kostenlose Proxy-Liste',
        'socks5 proxy': u'socks5 Proxy',
        'socks4 proxy': u'socks4 Proxy',
        'http proxy': u'http Proxy',
        'proxy server list': u'Proxy-Server-Liste',
        'anonymous proxy': u'anonymer Proxy',
        'elite proxy': u'Elite-Proxy',
        'fresh proxy': u'frische Proxys',
        'working proxy': u'funktionierende Proxys',
        'proxy list updated': u'aktualisierte Proxy-Liste',
    },
    'fr': {
        'free proxy list': u'liste de proxy gratuit',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'liste de serveurs proxy',
        'anonymous proxy': u'proxy anonyme',
        'elite proxy': u'proxy élite',
        'fresh proxy': u'proxies frais',
        'working proxy': u'proxies fonctionnels',
        'proxy list updated': u'liste de proxy mise à jour',
    },
    'ja': {
        'free proxy list': u'無料プロキシリスト',
        'socks5 proxy': u'socks5プロキシ',
        'socks4 proxy': u'socks4プロキシ',
        'http proxy': u'httpプロキシ',
        'proxy server list': u'プロキシサーバーリスト',
        'anonymous proxy': u'匿名プロキシ',
        'elite proxy': u'エリートプロキシ',
        'fresh proxy': u'最新プロキシ',
        'working proxy': u'動作するプロキシ',
        'proxy list updated': u'プロキシリスト更新',
    },
    'ko': {
        'free proxy list': u'무료 프록시 목록',
        'socks5 proxy': u'socks5 프록시',
        'socks4 proxy': u'socks4 프록시',
        'http proxy': u'http 프록시',
        'proxy server list': u'프록시 서버 목록',
        'anonymous proxy': u'익명 프록시',
        'elite proxy': u'엘리트 프록시',
        'fresh proxy': u'최신 프록시',
        'working proxy': u'작동하는 프록시',
        'proxy list updated': u'프록시 목록 업데이트',
    },
    'ar': {
        'free proxy list': u'قائمة بروكسي مجانية',
        'socks5 proxy': u'بروكسي socks5',
        'socks4 proxy': u'بروكسي socks4',
        'http proxy': u'بروكسي http',
        'proxy server list': u'قائمة خوادم البروكسي',
        'anonymous proxy': u'بروكسي مجهول',
        'elite proxy': u'بروكسي نخبة',
        'fresh proxy': u'بروكسي جديد',
        'working proxy': u'بروكسي يعمل',
        'proxy list updated': u'قائمة بروكسي محدثة',
    },
    'id': {
        'free proxy list': u'daftar proxy gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'daftar server proxy',
        'anonymous proxy': u'proxy anonim',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxy baru',
        'working proxy': u'proxy aktif',
        'proxy list updated': u'daftar proxy diperbarui',
    },
    'tr': {
        'free proxy list': u'ücretsiz proxy listesi',
        'socks5 proxy': u'socks5 proxy',
        'socks4 proxy': u'socks4 proxy',
        'http proxy': u'http proxy',
        'proxy server list': u'proxy sunucu listesi',
        'anonymous proxy': u'anonim proxy',
        'elite proxy': u'elit proxy',
        'fresh proxy': u'güncel proxy',
        'working proxy': u'çalışan proxy',
        'proxy list updated': u'güncellenmiş proxy listesi',
    },
    'vi': {
        'free proxy list': u'danh sách proxy miễn phí',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'danh sách máy chủ proxy',
        'anonymous proxy': u'proxy ẩn danh',
        'elite proxy': u'proxy cao cấp',
        'fresh proxy': u'proxy mới',
        'working proxy': u'proxy hoạt động',
        'proxy list updated': u'danh sách proxy cập nhật',
    },
    'th': {
        'free proxy list': u'รายการพร็อกซี่ฟรี',
        'socks5 proxy': u'พร็อกซี่ socks5',
        'socks4 proxy': u'พร็อกซี่ socks4',
        'http proxy': u'พร็อกซี่ http',
        'proxy server list': u'รายการเซิร์ฟเวอร์พร็อกซี่',
        'anonymous proxy': u'พร็อกซี่นิรนาม',
        'elite proxy': u'พร็อกซี่ระดับสูง',
        'fresh proxy': u'พร็อกซี่ใหม่',
        'working proxy': u'พร็อกซี่ใช้งานได้',
        'proxy list updated': u'รายการพร็อกซี่อัพเดท',
    },
    'pl': {
        'free proxy list': u'darmowa lista proxy',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista serwerów proxy',
        'anonymous proxy': u'anonimowe proxy',
        'elite proxy': u'elitarne proxy',
        'fresh proxy': u'świeże proxy',
        'working proxy': u'działające proxy',
        'proxy list updated': u'zaktualizowana lista proxy',
    },
    'uk': {
        'free proxy list': u'безкоштовний список проксі',
        'socks5 proxy': u'socks5 проксі',
        'socks4 proxy': u'socks4 проксі',
        'http proxy': u'http проксі',
        'proxy server list': u'список проксі серверів',
        'anonymous proxy': u'анонімний проксі',
        'elite proxy': u'елітний проксі',
        'fresh proxy': u'свіжі проксі',
        'working proxy': u'робочі проксі',
        'proxy list updated': u'оновлений список проксі',
    },
}

# All available languages
LANGUAGES = list(STATIC_TRANSLATIONS.keys()) + ['en']

# LibreTranslate available languages (populated dynamically)
_libretranslate_langs = set()
_libretranslate_langs_checked = False

# Code sent to the API when the caller asks for 'zh'. Defaults to the value
# the module always used; replaced by whatever Chinese code the server's
# /languages listing actually advertises.
_libretranslate_zh_code = 'zh-Hans'

# Cache for online translations
_translation_cache = {}
_cache_file = 'translation_cache.json'
_cache_max_age = 86400 * 30  # 30 days
_failed_translations = {}  # Track failed translations to avoid repeated API calls
_failed_cache_ttl = 3600  # 1 hour before retrying failed translations


def set_config(config):
    """Configure translation settings from config object.

    Reads the settings that are present, then (when enabled) fetches the
    language list from the configured server and logs the outcome.

    Args:
        config: Config object with scraper.libretranslate_url and
            scraper.libretranslate_enabled attributes. Missing attributes
            leave the current module-level value unchanged.
    """
    global _libretranslate_url, _libretranslate_enabled
    global _libretranslate_langs, _libretranslate_langs_checked
    global _libretranslate_zh_code

    previous_url = _libretranslate_url
    scraper = getattr(config, 'scraper', None)
    if scraper is not None:
        if hasattr(scraper, 'libretranslate_url'):
            _libretranslate_url = scraper.libretranslate_url
        if hasattr(scraper, 'libretranslate_enabled'):
            _libretranslate_enabled = scraper.libretranslate_enabled

    if _libretranslate_url != previous_url:
        # The language set belongs to the previous server; force a re-fetch
        # instead of silently reusing it for the new one.
        _libretranslate_langs = set()
        _libretranslate_langs_checked = False
        _libretranslate_zh_code = 'zh-Hans'

    if _libretranslate_enabled:
        _fetch_available_languages()
        _log('LibreTranslate: enabled (%s) - %d languages' % (
            _libretranslate_url, len(_libretranslate_langs)
        ), 'info')
    else:
        _log('LibreTranslate: disabled', 'debug')


def _normalize_lang_code(code):
    """Collapse every Chinese variant (zh, zh-Hans, ...) to plain 'zh'."""
    return 'zh' if code.startswith('zh') else code


def _http_json(url, body=None, headers=None):
    """Send one HTTP request and return the decoded JSON response.

    Args:
        url: Request URL.
        body: Request body (bytes) for a POST, or None for a GET.
        headers: Optional dict of extra request headers.

    Returns:
        The parsed JSON document.

    Raises:
        Whatever urlopen / json raise; callers decide how to handle failure.
    """
    try:
        import urllib2 as urlrequest  # Python 2
    except ImportError:
        import urllib.request as urlrequest  # Python 3

    req = urlrequest.Request(url, body)
    req.add_header('User-Agent', 'PPF/1.0')
    for name, value in (headers or {}).items():
        req.add_header(name, value)
    # closing() releases the connection even if read() fails.
    with closing(urlrequest.urlopen(req, timeout=_libretranslate_timeout)) as resp:
        raw = resp.read()
    return json.loads(raw.decode('utf-8'))


def _fetch_available_languages():
    """Fetch available languages from LibreTranslate API.

    Queries the /languages endpoint next to the configured translate URL and
    stores the codes English can be translated into (the 'targets' of the
    'en' entry). If that entry has no targets, every listed code is used.
    Chinese variants are stored as 'zh'; the variant to send back to the API
    is remembered separately. Runs at most once until set_config() switches
    to another URL. On any error the language set is left empty.
    """
    global _libretranslate_langs, _libretranslate_langs_checked
    global _libretranslate_zh_code

    if _libretranslate_langs_checked:
        return

    _libretranslate_langs_checked = True

    # Derive base URL from translate endpoint; tolerate a trailing slash so
    # '.../translate/' does not become '.../translate/languages'.
    base_url = _libretranslate_url.rstrip('/').rsplit('/', 1)[0]
    languages_url = base_url + '/languages'

    try:
        langs = _http_json(languages_url, headers={'Accept': 'application/json'})

        # Prefer what English can be translated into ...
        en_entry = next((e for e in langs if e.get('code') == 'en'), None)
        en_targets = [c for c in (en_entry.get('targets') or []) if c] if en_entry else []
        # ... and fall back to every listed code otherwise.
        all_codes = [e.get('code', '') for e in langs if e.get('code', '')]
        raw_codes = en_targets or all_codes

        codes = set(_normalize_lang_code(c) for c in raw_codes)
        codes.discard('en')
        _libretranslate_langs = codes

        # Remember which Chinese code this server listed, so translate
        # requests do not assume 'zh-Hans' when it advertised another one.
        zh_variants = sorted(c for c in raw_codes if c.startswith('zh'))
        if zh_variants:
            _libretranslate_zh_code = next(
                (p for p in ('zh-Hans', 'zh') if p in zh_variants),
                zh_variants[0]
            )

        _log('LibreTranslate languages: %s' % ', '.join(sorted(_libretranslate_langs)), 'debug')

    except Exception as e:
        # Best effort: without the listing only static translations are used.
        _log('failed to fetch LibreTranslate languages: %s' % str(e), 'warn')
        _libretranslate_langs = set()


def _load_cache():
    """Load translation cache from disk.

    Accepts both the old format (a bare {key: translation} mapping) and the
    new format ({'_meta': ..., 'translations': {...}}). A missing file leaves
    the in-memory cache untouched; an unreadable or malformed file resets it
    to empty.
    """
    global _translation_cache
    if not os.path.exists(_cache_file):
        return

    try:
        # Explicit UTF-8 matches what _save_cache() writes.
        with io.open(_cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (IOError, OSError, ValueError) as e:
        _log('cache load failed: %s' % str(e), 'debug')
        _translation_cache = {}
        return

    # Handle both old format (just translations) and new format (with metadata)
    if isinstance(data, dict) and '_meta' in data:
        data = data.get('translations', {})

    if not isinstance(data, dict):
        # Valid JSON but not a mapping (e.g. a list): later item assignment
        # into the cache would fail, so start fresh instead.
        _log('cache load failed: unexpected cache format', 'debug')
        data = {}

    _translation_cache = data


def _save_cache():
    """Save translation cache to disk.

    Writes the new format (metadata plus translations) as UTF-8 JSON.
    Failures to write are logged and otherwise ignored.
    """
    data = {
        '_meta': {
            'version': 1,
            'updated': int(time.time()),
            'count': len(_translation_cache)
        },
        'translations': _translation_cache
    }
    try:
        payload = json.dumps(data, ensure_ascii=False, indent=2)
        if isinstance(payload, bytes):
            # Python 2 returns a byte string when no unicode value is present.
            payload = payload.decode('utf-8')
        # A byte-mode file would raise UnicodeEncodeError on Python 2 as soon
        # as a non-ASCII translation is written, truncating the cache file.
        with io.open(_cache_file, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (IOError, OSError) as e:
        _log('cache save failed: %s' % str(e), 'debug')


def translate_libretranslate(text, target_lang, source_lang='en', api_url=None):
    """Translate text using LibreTranslate API.

    Fresh cached results are returned without a request, and a translation
    that failed within the last _failed_cache_ttl seconds is not retried.

    Args:
        text: Text to translate
        target_lang: Target language code (e.g., 'ru', 'zh')
        source_lang: Source language code (default: 'en')
        api_url: LibreTranslate API URL (uses configured default if None)

    Returns:
        Translated text or None on failure
    """
    if not _libretranslate_enabled:
        return None

    if api_url is None:
        api_url = _libretranslate_url

    # Check if target language is available (fetch if not checked yet)
    if not _libretranslate_langs_checked:
        _fetch_available_languages()

    if target_lang not in _libretranslate_langs:
        return None

    # Map language codes: callers say 'zh', the API wants its own variant.
    api_target = _libretranslate_zh_code if target_lang == 'zh' else target_lang

    cache_key = '%s:%s:%s' % (source_lang, target_lang, text)

    # Check cache first
    if cache_key in _translation_cache:
        cached = _translation_cache[cache_key]
        # Handle both old format (string) and new format (dict with timestamp)
        if not isinstance(cached, dict):
            return cached
        if time.time() - cached.get('time', 0) < _cache_max_age:
            return cached.get('text')
        # Expired entry: fall through and refresh it from the API.

    # Check if we recently failed this translation
    failed_at = _failed_translations.get(cache_key)
    if failed_at is not None and time.time() - failed_at < _failed_cache_ttl:
        return None

    body = json.dumps({
        'q': text,
        'source': source_lang,
        'target': api_target,
        'format': 'text',
    }).encode('utf-8')

    try:
        result = _http_json(api_url, body, {'Content-Type': 'application/json'})
    except Exception as e:
        _failed_translations[cache_key] = time.time()
        _log('translation failed [%s->%s]: %s' % (source_lang, target_lang, str(e)), 'debug')
        return None

    translated = result.get('translatedText') if isinstance(result, dict) else None
    if not translated:
        # An error payload without a translation is a failure too; record it
        # so the API is not hit again on every call.
        _failed_translations[cache_key] = time.time()
        _log('translation failed [%s->%s]: %s' % (
            source_lang, target_lang, 'no translatedText in response'), 'debug')
        return None

    # Store with timestamp for cache expiry. Done outside the request's
    # try-block so a cache or log problem is not reported as a failed
    # translation.
    _failed_translations.pop(cache_key, None)
    _translation_cache[cache_key] = {
        'text': translated,
        'time': int(time.time())
    }
    _save_cache()
    _log('translated [%s]: %s -> %s' % (target_lang, text, translated), 'debug')
    return translated


def get_cache_stats():
    """Return cache statistics.

    Returns:
        dict with cache stats: number of cached entries, number of recorded
        failures, cache file name, enabled flag, API URL and number of
        API languages.
    """
    return {
        'entries': len(_translation_cache),
        'failed_pending': len(_failed_translations),
        'file': _cache_file,
        'enabled': _libretranslate_enabled,
        'url': _libretranslate_url,
        'api_languages': len(_libretranslate_langs),
    }


def get_translated_term(term=None, lang=None, use_api=True):
    """Get a search term, optionally translated.

    Args:
        term: Specific term to translate (or random if None)
        lang: Target language (or random if None)
        use_api: Whether to use LibreTranslate API for missing translations

    Returns:
        (term, lang) tuple. When no translation is available, an explicitly
        requested term is returned unchanged with lang 'en'. A random request
        falls back to a static translation in the chosen language if one
        exists, otherwise to a random English term with lang 'en'.
    """
    # Expand language list to include LibreTranslate-only languages
    all_langs = list(set(LANGUAGES) | _libretranslate_langs)

    if lang is None:
        lang = random.choice(all_langs) if all_langs else 'en'

    if lang == 'en':
        # For English, use full term list
        if term is None:
            term = random.choice(BASE_TERMS)
        return term, lang

    # For other languages, pick from translatable terms
    requested = term
    if term is None:
        term = random.choice(TRANSLATABLE_TERMS)

    # Try static translations first
    static = STATIC_TRANSLATIONS.get(lang, {})
    if term in static:
        return static[term], lang

    # Try LibreTranslate API for missing translations
    if use_api and _libretranslate_enabled and lang in _libretranslate_langs:
        translated = translate_libretranslate(term, lang)
        if translated:
            return translated, lang

    if requested is not None:
        # The caller asked for this specific term: hand it back in English
        # rather than substituting an unrelated random one.
        return requested, 'en'

    if static:
        # Several static keys ('socks5 proxy', 'fresh proxy', ...) are not in
        # TRANSLATABLE_TERMS, so a random pick often misses the table; stay
        # in the requested language by using any static entry.
        return random.choice(list(static.values())), lang

    # Fall back to English with full term list
    return random.choice(BASE_TERMS), 'en'


def get_random_search_term():
    """Get a random search term in a random language.

    Returns:
        Translated search term string
    """
    term, _lang = get_translated_term()
    return term


def get_all_terms_for_language(lang):
    """Get all search terms for a specific language.

    Args:
        lang: Language code

    Returns:
        List of translated terms: the static translations for the language
        if it has any, otherwise a copy of the English base terms.
    """
    if lang != 'en' and lang in STATIC_TRANSLATIONS:
        return list(STATIC_TRANSLATIONS[lang].values())

    return BASE_TERMS[:]


def get_mixed_terms(count=5, english_weight=0.3):
    """Get a mix of terms from different languages.

    Args:
        count: Number of terms to return
        english_weight: Probability of including English terms

    Returns:
        List of search terms in various languages
    """
    terms = []
    for _ in range(count):
        if random.random() < english_weight:
            terms.append(random.choice(BASE_TERMS))
        else:
            terms.append(get_random_search_term())
    return terms


# Load cache on module import
_load_cache()


if __name__ == '__main__':

    def _printable(value):
        """Return value as a native str (UTF-8 bytes for Python 2 unicode)."""
        if isinstance(value, str):
            return value
        return value.encode('utf-8')

    # Fetch available languages from API
    _fetch_available_languages()

    # Test output
    print('LibreTranslate: %s' % ('enabled' if _libretranslate_enabled else 'disabled'))
    print('API URL: %s' % _libretranslate_url)
    print('Static languages: %s' % ', '.join(sorted(STATIC_TRANSLATIONS.keys())))
    api_only = _libretranslate_langs - set(STATIC_TRANSLATIONS.keys())
    print('API-only languages: %s' % (', '.join(sorted(api_only)) if api_only else 'none'))
    print('')

    # Cache stats
    stats = get_cache_stats()
    print('Cache: %d entries in %s' % (stats['entries'], stats['file']))
    print('')

    print('Sample static translations:')
    for lang in sorted(STATIC_TRANSLATIONS.keys())[:5]:
        term, _ = get_translated_term('free proxy list', lang, use_api=False)
        print(' [%s] %s' % (lang, _printable(term)))

    print('')

    # Test LibreTranslate if --test-api flag
    if '--test-api' in sys.argv:
        print('Testing LibreTranslate API...')
        # Use languages that are API-available but not in static translations
        test_langs = list(api_only)[:5] if api_only else ['fr', 'ar']
        for lang in test_langs:
            term, result_lang = get_translated_term('free proxy list', lang, use_api=True)
            print(' [%s] %s' % (result_lang, _printable(term)))
        print('')
        stats = get_cache_stats()
        print('Cache after API test: %d entries' % stats['entries'])
    else:
        print('Run with --test-api to test LibreTranslate API')

    print('')
    print('Random mixed terms:')
    for term in get_mixed_terms(10, english_weight=0.2):
        # A single formatted string: print(' ', x) would print a tuple repr
        # on Python 2, where print is a statement.
        print('  %s' % _printable(term))