translations: add multi-lingual search term generation

- Static translations for 15 languages (ru, zh, es, pt, de, fr, ja, ko, ar, id, tr, vi, th, pl, uk)
- LibreTranslate API integration with configurable endpoint
- Dynamic language detection from API /languages endpoint
- Persistent JSON cache with 30-day TTL
- Categorized search terms: generic, protocol, anonymity, freshness, format, sources, geographic, use-case, search operators
- Dynamic year substitution for freshness terms
This commit is contained in:
Username
2025-12-20 22:27:37 +01:00
parent 4547ec3188
commit 8132023c97

674
translations.py Normal file
View File

@@ -0,0 +1,674 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Multi-lingual search term generation with LibreTranslate support."""
import random
import os
import json
import time
import datetime
from misc import _log
# Year string appended to some freshness search terms; evaluated once, when
# the module is imported.
CURRENT_YEAR = str(datetime.date.today().year)

# LibreTranslate settings. set_config() may override the URL and the enabled
# flag; the timeout (seconds) is passed to every urlopen() call in this module.
_libretranslate_timeout = 10
_libretranslate_enabled = True
_libretranslate_url = 'https://lt.mymx.me/translate'
# Base English search terms, grouped by category so that each kind of query
# (generic, protocol, anonymity, ...) is represented in the combined list.

# Generic "proxy list" phrasings.
BASE_TERMS_GENERIC = [
    'free proxy list',
    'proxy server list',
    'public proxy list',
    'open proxy list',
]
# Protocol-specific queries.
BASE_TERMS_PROTOCOL = [
    'socks5 proxy list',
    'socks4 proxy list',
    'http proxy list',
    'https proxy list',
]
# Anonymity-level queries.
BASE_TERMS_ANONYMITY = [
    'anonymous proxy',
    'elite proxy',
    'high anonymity proxy',
    'transparent proxy list',
]
# Freshness queries; the last two embed CURRENT_YEAR, which is fixed at
# import time.
BASE_TERMS_FRESHNESS = [
    'fresh proxy list',
    'working proxy list',
    'verified proxy list',
    'checked proxy list',
    'live proxy list',
    'proxy list today',
    'proxy list updated',
    'proxy list ' + CURRENT_YEAR,
    'new proxy list ' + CURRENT_YEAR,
]
# Queries targeting a file or listing format.
BASE_TERMS_FORMAT = [
    'proxy list txt',
    'proxy list ip port',
    'proxy list download',
    'proxy txt file',
]
# Queries targeting specific publishing platforms.
BASE_TERMS_SOURCES = [
    'proxy pastebin',
    'proxy github',
    'proxy list telegram',
    'free proxy api',
]
# Region-specific queries.
BASE_TERMS_GEOGRAPHIC = [
    'US proxy list',
    'USA proxy',
    'Europe proxy list',
    'Asia proxy list',
    'Russia proxy list',
    'China proxy list',
]
# Use-case and proxy-type queries.
BASE_TERMS_USECASE = [
    'proxy for scraping',
    'fast proxy list',
    'residential proxy list',
    'datacenter proxy list',
]
# Queries using search-engine operators (filetype:, inurl:, intitle:).
BASE_TERMS_SEARCH_OPS = [
    'filetype:txt proxy list',
    'inurl:proxy.txt',
    'inurl:proxies.txt',
    'intitle:proxy list',
]
# Combined list of every category above; used for random English selection.
BASE_TERMS = (
    BASE_TERMS_GENERIC +
    BASE_TERMS_PROTOCOL +
    BASE_TERMS_ANONYMITY +
    BASE_TERMS_FRESHNESS +
    BASE_TERMS_FORMAT +
    BASE_TERMS_SOURCES +
    BASE_TERMS_GEOGRAPHIC +
    BASE_TERMS_USECASE +
    BASE_TERMS_SEARCH_OPS
)
# Terms eligible for translation: only the generic, anonymity and freshness
# categories. Search operators and the more technical categories are left out.
TRANSLATABLE_TERMS = (
    BASE_TERMS_GENERIC +
    BASE_TERMS_ANONYMITY +
    BASE_TERMS_FRESHNESS
)
# Static translations - usable without any API call.
# Format: {lang_code: {english_term: translated_term}}
# Every language carries the same ten English keys.
# NOTE(review): several keys ('socks5 proxy', 'socks4 proxy', 'http proxy',
# 'fresh proxy', 'working proxy') do not appear verbatim in the BASE_TERMS_*
# lists (those read 'socks5 proxy list', 'fresh proxy list', ...).
# get_translated_term() looks terms up by exact key, so these entries are not
# hit when the term is drawn from TRANSLATABLE_TERMS - confirm whether the
# keys or the base terms should be aligned.
STATIC_TRANSLATIONS = {
    'ru': {
        'free proxy list': u'бесплатный список прокси',
        'socks5 proxy': u'socks5 прокси',
        'socks4 proxy': u'socks4 прокси',
        'http proxy': u'http прокси',
        'proxy server list': u'список прокси серверов',
        'anonymous proxy': u'анонимный прокси',
        'elite proxy': u'элитный прокси',
        'fresh proxy': u'свежие прокси',
        'working proxy': u'рабочие прокси',
        'proxy list updated': u'обновленный список прокси',
    },
    'zh': {
        'free proxy list': u'免费代理列表',
        'socks5 proxy': u'socks5代理',
        'socks4 proxy': u'socks4代理',
        'http proxy': u'http代理',
        'proxy server list': u'代理服务器列表',
        'anonymous proxy': u'匿名代理',
        'elite proxy': u'高匿代理',
        'fresh proxy': u'最新代理',
        'working proxy': u'可用代理',
        'proxy list updated': u'代理列表更新',
    },
    'es': {
        'free proxy list': u'lista de proxies gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anónimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies frescos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxies actualizada',
    },
    'pt': {
        'free proxy list': u'lista de proxy grátis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anônimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies novos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxy atualizada',
    },
    'de': {
        'free proxy list': u'kostenlose Proxy-Liste',
        'socks5 proxy': u'socks5 Proxy',
        'socks4 proxy': u'socks4 Proxy',
        'http proxy': u'http Proxy',
        'proxy server list': u'Proxy-Server-Liste',
        'anonymous proxy': u'anonymer Proxy',
        'elite proxy': u'Elite-Proxy',
        'fresh proxy': u'frische Proxys',
        'working proxy': u'funktionierende Proxys',
        'proxy list updated': u'aktualisierte Proxy-Liste',
    },
    'fr': {
        'free proxy list': u'liste de proxy gratuit',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'liste de serveurs proxy',
        'anonymous proxy': u'proxy anonyme',
        'elite proxy': u'proxy élite',
        'fresh proxy': u'proxies frais',
        'working proxy': u'proxies fonctionnels',
        'proxy list updated': u'liste de proxy mise à jour',
    },
    'ja': {
        'free proxy list': u'無料プロキシリスト',
        'socks5 proxy': u'socks5プロキシ',
        'socks4 proxy': u'socks4プロキシ',
        'http proxy': u'httpプロキシ',
        'proxy server list': u'プロキシサーバーリスト',
        'anonymous proxy': u'匿名プロキシ',
        'elite proxy': u'エリートプロキシ',
        'fresh proxy': u'最新プロキシ',
        'working proxy': u'動作するプロキシ',
        'proxy list updated': u'プロキシリスト更新',
    },
    'ko': {
        'free proxy list': u'무료 프록시 목록',
        'socks5 proxy': u'socks5 프록시',
        'socks4 proxy': u'socks4 프록시',
        'http proxy': u'http 프록시',
        'proxy server list': u'프록시 서버 목록',
        'anonymous proxy': u'익명 프록시',
        'elite proxy': u'엘리트 프록시',
        'fresh proxy': u'최신 프록시',
        'working proxy': u'작동하는 프록시',
        'proxy list updated': u'프록시 목록 업데이트',
    },
    'ar': {
        'free proxy list': u'قائمة بروكسي مجانية',
        'socks5 proxy': u'بروكسي socks5',
        'socks4 proxy': u'بروكسي socks4',
        'http proxy': u'بروكسي http',
        'proxy server list': u'قائمة خوادم البروكسي',
        'anonymous proxy': u'بروكسي مجهول',
        'elite proxy': u'بروكسي نخبة',
        'fresh proxy': u'بروكسي جديد',
        'working proxy': u'بروكسي يعمل',
        'proxy list updated': u'قائمة بروكسي محدثة',
    },
    'id': {
        'free proxy list': u'daftar proxy gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'daftar server proxy',
        'anonymous proxy': u'proxy anonim',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxy baru',
        'working proxy': u'proxy aktif',
        'proxy list updated': u'daftar proxy diperbarui',
    },
    'tr': {
        'free proxy list': u'ücretsiz proxy listesi',
        'socks5 proxy': u'socks5 proxy',
        'socks4 proxy': u'socks4 proxy',
        'http proxy': u'http proxy',
        'proxy server list': u'proxy sunucu listesi',
        'anonymous proxy': u'anonim proxy',
        'elite proxy': u'elit proxy',
        'fresh proxy': u'güncel proxy',
        'working proxy': u'çalışan proxy',
        'proxy list updated': u'güncellenmiş proxy listesi',
    },
    'vi': {
        'free proxy list': u'danh sách proxy miễn phí',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'danh sách máy chủ proxy',
        'anonymous proxy': u'proxy ẩn danh',
        'elite proxy': u'proxy cao cấp',
        'fresh proxy': u'proxy mới',
        'working proxy': u'proxy hoạt động',
        'proxy list updated': u'danh sách proxy cập nhật',
    },
    'th': {
        'free proxy list': u'รายการพร็อกซี่ฟรี',
        'socks5 proxy': u'พร็อกซี่ socks5',
        'socks4 proxy': u'พร็อกซี่ socks4',
        'http proxy': u'พร็อกซี่ http',
        'proxy server list': u'รายการเซิร์ฟเวอร์พร็อกซี่',
        'anonymous proxy': u'พร็อกซี่นิรนาม',
        'elite proxy': u'พร็อกซี่ระดับสูง',
        'fresh proxy': u'พร็อกซี่ใหม่',
        'working proxy': u'พร็อกซี่ใช้งานได้',
        'proxy list updated': u'รายการพร็อกซี่อัพเดท',
    },
    'pl': {
        'free proxy list': u'darmowa lista proxy',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista serwerów proxy',
        'anonymous proxy': u'anonimowe proxy',
        'elite proxy': u'elitarne proxy',
        'fresh proxy': u'świeże proxy',
        'working proxy': u'działające proxy',
        'proxy list updated': u'zaktualizowana lista proxy',
    },
    'uk': {
        'free proxy list': u'безкоштовний список проксі',
        'socks5 proxy': u'socks5 проксі',
        'socks4 proxy': u'socks4 проксі',
        'http proxy': u'http проксі',
        'proxy server list': u'список проксі серверів',
        'anonymous proxy': u'анонімний проксі',
        'elite proxy': u'елітний проксі',
        'fresh proxy': u'свіжі проксі',
        'working proxy': u'робочі проксі',
        'proxy list updated': u'оновлений список проксі',
    },
}
# Language codes that have a static table, plus English.
LANGUAGES = list(STATIC_TRANSLATIONS) + ['en']

# Target languages reported by the LibreTranslate server. Filled in by
# _fetch_available_languages(), which does its work at most once per process.
_libretranslate_langs_checked = False
_libretranslate_langs = set()

# Persistent cache of API translations (loaded from / saved to _cache_file).
_cache_file = 'translation_cache.json'
_cache_max_age = 30 * 86400  # seconds (30 days) a cached translation is used
_translation_cache = {}

# In-memory record of failed API calls, so the same request is not repeated.
_failed_cache_ttl = 3600  # seconds (1 hour) before a failed lookup is retried
_failed_translations = {}  # cache_key -> time.time() of the last failure
def set_config(config):
    """Apply translation settings taken from the application config.

    Args:
        config: Object that may expose a ``scraper`` attribute carrying
            ``libretranslate_url`` and/or ``libretranslate_enabled``.
            Attributes that are absent leave the current value untouched.

    When LibreTranslate ends up enabled, the server's language list is
    fetched (once per process) and a summary is logged at 'info' level;
    otherwise a 'debug' line records that it is disabled.
    """
    global _libretranslate_url, _libretranslate_enabled

    if hasattr(config, 'scraper'):
        scraper = config.scraper
        if hasattr(scraper, 'libretranslate_url'):
            _libretranslate_url = scraper.libretranslate_url
        if hasattr(scraper, 'libretranslate_enabled'):
            _libretranslate_enabled = scraper.libretranslate_enabled

    if not _libretranslate_enabled:
        _log('LibreTranslate: disabled', 'debug')
        return

    _fetch_available_languages()
    summary = 'LibreTranslate: enabled (%s) - %d languages' % (
        _libretranslate_url, len(_libretranslate_langs)
    )
    _log(summary, 'info')
def _normalize_lang_codes(codes):
    """Return *codes* as a set, dropping empty entries and collapsing every
    ``zh*`` variant (e.g. ``zh-Hans``) to plain ``zh``."""
    normalized = set()
    for code in codes:
        if not code:
            continue
        normalized.add('zh' if code.startswith('zh') else code)
    return normalized


def _fetch_available_languages():
    """Populate ``_libretranslate_langs`` from the LibreTranslate server.

    Queries the ``/languages`` endpoint (derived from the configured
    translate URL by replacing its last path segment) and stores the
    languages English can be translated into. If the English entry lists no
    targets, every language code the server reports is used instead.
    ``'en'`` itself is never included.

    The request is attempted at most once per process: the "checked" flag
    is set before the request, so a failing server is not asked again. On
    any error the language set is left empty and a warning is logged.
    """
    global _libretranslate_langs, _libretranslate_langs_checked
    if _libretranslate_langs_checked:
        return
    _libretranslate_langs_checked = True

    # Derive the base URL from the translate endpoint.
    languages_url = _libretranslate_url.rsplit('/', 1)[0] + '/languages'
    try:
        import urllib2
        req = urllib2.Request(languages_url)
        req.add_header('Accept', 'application/json')
        req.add_header('User-Agent', 'PPF/1.0')
        resp = urllib2.urlopen(req, timeout=_libretranslate_timeout)
        try:
            langs = json.loads(resp.read())
        finally:
            # Release the connection even when reading or parsing fails.
            resp.close()

        # Targets advertised by the English entry, if there is one.
        en_targets = []
        for lang in langs:
            if lang.get('code') == 'en':
                en_targets = lang.get('targets') or []
                break

        if en_targets:
            codes = _normalize_lang_codes(en_targets)
        else:
            # Fallback: every language code the server knows about.
            codes = _normalize_lang_codes(lang.get('code') for lang in langs)
        codes.discard('en')
        _libretranslate_langs = codes
        _log('LibreTranslate languages: %s' % ', '.join(sorted(_libretranslate_langs)), 'debug')
    except Exception as e:
        _log('failed to fetch LibreTranslate languages: %s' % str(e), 'warn')
        _libretranslate_langs = set()
def _load_cache():
    """Load the translation cache from ``_cache_file`` into memory.

    Two on-disk layouts are accepted: the current one, an object with
    ``_meta`` and ``translations`` keys, and the legacy one, where the
    top-level object is the translation mapping itself.

    A missing file leaves the in-memory cache untouched. An unreadable file,
    invalid JSON, or a payload that is not a JSON object resets the cache to
    an empty dict, so later item assignment on it cannot fail.
    """
    global _translation_cache
    if not os.path.exists(_cache_file):
        return

    try:
        with open(_cache_file, 'r') as f:
            data = json.load(f)
    except (IOError, ValueError) as e:
        _log('cache load failed: %s' % str(e), 'debug')
        _translation_cache = {}
        return

    # Current format wraps the mapping; legacy format is the mapping itself.
    if isinstance(data, dict) and '_meta' in data:
        data = data.get('translations', {})

    if not isinstance(data, dict):
        # e.g. a list or null: valid JSON, but unusable as a cache.
        _log('cache load failed: unexpected %s payload' % type(data).__name__, 'debug')
        data = {}
    _translation_cache = data
def _save_cache():
    """Write the in-memory translation cache to ``_cache_file`` as JSON.

    The file holds a ``_meta`` object (format version, save time, entry
    count) and the ``translations`` mapping. Output is UTF-8 with non-ASCII
    characters kept readable.

    Saving is best-effort: I/O and serialization errors are logged at
    'debug' level and otherwise ignored.
    """
    data = {
        '_meta': {
            'version': 1,
            'updated': int(time.time()),
            'count': len(_translation_cache)
        },
        'translations': _translation_cache
    }
    try:
        # Serialize before opening the file, so a serialization error cannot
        # leave a truncated cache behind.
        payload = json.dumps(data, ensure_ascii=False, indent=2)
        # With ensure_ascii=False the result may be a unicode object under
        # Python 2; writing that to a plain file would use the ASCII codec
        # and fail on translated text. Encode explicitly instead.
        if not isinstance(payload, bytes):
            payload = payload.encode('utf-8')
        with open(_cache_file, 'wb') as f:
            f.write(payload)
    except (IOError, OSError, ValueError, TypeError) as e:
        _log('cache save failed: %s' % str(e), 'debug')
def translate_libretranslate(text, target_lang, source_lang='en', api_url=None):
    """Translate text using the LibreTranslate API, with caching.

    Args:
        text: Text to translate
        target_lang: Target language code (e.g., 'ru', 'zh')
        source_lang: Source language code (default: 'en')
        api_url: LibreTranslate API URL (uses configured default if None)

    Returns:
        Translated text, or None when LibreTranslate is disabled, the target
        language is not offered by the server, the same request failed
        within the last ``_failed_cache_ttl`` seconds, or the request fails.

    Successful results are stored in the translation cache (and saved to
    disk); failures are remembered in memory so they are not retried
    immediately.
    """
    if not _libretranslate_enabled:
        return None
    if api_url is None:
        api_url = _libretranslate_url

    # This module uses plain 'zh'; the API is asked for 'zh-Hans'.
    api_target = 'zh-Hans' if target_lang == 'zh' else target_lang

    # Make sure the server's language list is known (fetched lazily, once).
    if not _libretranslate_langs_checked:
        _fetch_available_languages()
    if target_lang not in _libretranslate_langs:
        return None

    cache_key = '%s:%s:%s' % (source_lang, target_lang, text)

    if cache_key in _translation_cache:
        cached = _translation_cache[cache_key]
        if not isinstance(cached, dict):
            # Legacy entry: bare string without a timestamp, never expires.
            return cached
        if time.time() - cached.get('time', 0) < _cache_max_age:
            return cached.get('text')
        # Expired entry: fall through and translate again.

    # Skip the request if this exact translation failed recently.
    failed_at = _failed_translations.get(cache_key)
    if failed_at is not None and time.time() - failed_at < _failed_cache_ttl:
        return None

    # Only the network round-trip is treated as "translation failed".
    try:
        import urllib2
        payload = json.dumps({
            'q': text,
            'source': source_lang,
            'target': api_target,
            'format': 'text',
        })
        req = urllib2.Request(api_url, payload)
        req.add_header('Content-Type', 'application/json')
        req.add_header('User-Agent', 'PPF/1.0')
        resp = urllib2.urlopen(req, timeout=_libretranslate_timeout)
        try:
            result = json.loads(resp.read())
        finally:
            # Release the connection even when reading or parsing fails.
            resp.close()
    except Exception as e:
        _failed_translations[cache_key] = time.time()
        _log('translation failed [%s->%s]: %s' % (source_lang, target_lang, str(e)), 'debug')
        return None

    translated = result.get('translatedText') if isinstance(result, dict) else None
    if not translated:
        # A reply without usable text counts as a failure, so it is not
        # requested again on every call.
        _failed_translations[cache_key] = time.time()
        _log('translation failed [%s->%s]: %s' % (
            source_lang, target_lang, 'no translatedText in response'), 'debug')
        return None

    _failed_translations.pop(cache_key, None)
    # Store with timestamp for cache expiry.
    _translation_cache[cache_key] = {
        'text': translated,
        'time': int(time.time())
    }
    # Persisting and logging are best-effort: a problem here must not turn a
    # successful translation into a reported failure.
    try:
        _save_cache()
        _log('translated [%s]: %s -> %s' % (target_lang, text, translated), 'debug')
    except Exception as e:
        _log('post-translation bookkeeping failed: %s' % str(e), 'debug')
    return translated
def get_cache_stats():
    """Report the current state of the translation cache and API settings.

    Returns:
        dict with keys ``entries`` (cached translations), ``failed_pending``
        (recorded failures), ``file`` (cache path), ``enabled`` and ``url``
        (LibreTranslate settings) and ``api_languages`` (number of languages
        known from the API).
    """
    stats = dict(
        entries=len(_translation_cache),
        failed_pending=len(_failed_translations),
        file=_cache_file,
        enabled=_libretranslate_enabled,
        url=_libretranslate_url,
        api_languages=len(_libretranslate_langs),
    )
    return stats
def get_translated_term(term=None, lang=None, use_api=True):
    """Get a search term, optionally translated.

    Args:
        term: Specific English term to translate (or random if None)
        lang: Target language code (or random if None)
        use_api: Whether to use LibreTranslate API for missing translations

    Returns:
        (term, lang) tuple. ``lang`` is the language of the returned term,
        which is 'en' when no translation could be produced. In that case
        an explicitly requested ``term`` is returned unchanged; only when
        the term was chosen at random is a random English term substituted.
    """
    # Candidate languages: static tables, English, and whatever the API offers.
    all_langs = list(set(LANGUAGES) | _libretranslate_langs)
    if lang is None:
        lang = random.choice(all_langs) if all_langs else 'en'

    if lang == 'en':
        # English draws from the full term list.
        if term is None:
            term = random.choice(BASE_TERMS)
        return term, lang

    # Remember whether the caller asked for a specific term.
    requested = term
    if term is None:
        term = random.choice(TRANSLATABLE_TERMS)

    # Static tables first: no network needed.
    static = STATIC_TRANSLATIONS.get(lang, {})
    if term in static:
        return static[term], lang

    # Then the LibreTranslate API, if allowed and the language is offered.
    if use_api and _libretranslate_enabled and lang in _libretranslate_langs:
        translated = translate_libretranslate(term, lang)
        if translated:
            return translated, lang

    # No translation available: fall back to English. A caller-supplied term
    # is preserved rather than replaced with an unrelated random one.
    if requested is not None:
        return requested, 'en'
    return random.choice(BASE_TERMS), 'en'
def get_random_search_term():
    """Pick a random search term in a randomly chosen language.

    Returns:
        The term string only; the language chosen by get_translated_term()
        is discarded.
    """
    return get_translated_term()[0]
def get_all_terms_for_language(lang):
    """List the search terms available for one language.

    Args:
        lang: Language code

    Returns:
        A new list: the static translations for ``lang`` when a table
        exists, otherwise (including for 'en') a copy of the English
        BASE_TERMS.
    """
    if lang != 'en' and lang in STATIC_TRANSLATIONS:
        return list(STATIC_TRANSLATIONS[lang].values())
    return list(BASE_TERMS)
def get_mixed_terms(count=5, english_weight=0.3):
    """Build a list of search terms drawn from several languages.

    Args:
        count: Number of terms to return
        english_weight: Chance, per term, of taking a random English base
            term directly. Otherwise get_random_search_term() picks the
            term and language.

    Returns:
        List of ``count`` search term strings
    """
    return [
        random.choice(BASE_TERMS)
        if random.random() < english_weight
        else get_random_search_term()
        for _ in range(count)
    ]
# Import-time side effect: read the on-disk translation cache into memory so
# translations fetched by earlier runs are available without an API call.
_load_cache()
if __name__ == '__main__':
    # Manual smoke test: prints the configuration, some static translations,
    # optionally exercises the API (--test-api), then prints a random mix.
    import sys

    def _printable(value):
        """Return *value* UTF-8 encoded when it is a unicode object, so that
        printing does not depend on the console's default encoding."""
        if isinstance(value, unicode):
            return value.encode('utf-8')
        return value

    # Fetch available languages from API
    _fetch_available_languages()

    print('LibreTranslate: %s' % ('enabled' if _libretranslate_enabled else 'disabled'))
    print('API URL: %s' % _libretranslate_url)
    print('Static languages: %s' % ', '.join(sorted(STATIC_TRANSLATIONS.keys())))
    api_only = _libretranslate_langs - set(STATIC_TRANSLATIONS.keys())
    print('API-only languages: %s' % (', '.join(sorted(api_only)) if api_only else 'none'))
    print('')

    # Cache stats
    stats = get_cache_stats()
    print('Cache: %d entries in %s' % (stats['entries'], stats['file']))
    print('')

    print('Sample static translations:')
    for lang in sorted(STATIC_TRANSLATIONS.keys())[:5]:
        term, _ = get_translated_term('free proxy list', lang, use_api=False)
        print(' [%s] %s' % (lang, _printable(term)))
    print('')

    # Test LibreTranslate if --test-api flag
    if '--test-api' in sys.argv:
        print('Testing LibreTranslate API...')
        # Prefer languages the API offers that have no static table.
        test_langs = list(api_only)[:5] if api_only else ['fr', 'ar']
        for lang in test_langs:
            term, result_lang = get_translated_term('free proxy list', lang, use_api=True)
            print(' [%s] %s' % (result_lang, _printable(term)))
        print('')
        stats = get_cache_stats()
        print('Cache after API test: %d entries' % stats['entries'])
    else:
        print('Run with --test-api to test LibreTranslate API')
    print('')

    print('Random mixed terms:')
    for term in get_mixed_terms(10, english_weight=0.2):
        # Format into a single string: without print_function imported,
        # print(a, b) is a Python 2 print statement and would show a tuple
        # repr with escaped bytes.
        print('  %s' % _printable(term))