Files
ppf/translations.py
Username 8132023c97 translations: add multi-lingual search term generation
- Static translations for 15 languages (ru, zh, es, pt, de, fr, ja, ko, ar, id, tr, vi, th, pl, uk)
- LibreTranslate API integration with configurable endpoint
- Dynamic language detection from API /languages endpoint
- Persistent JSON cache with 30-day TTL
- Categorized search terms: generic, protocol, anonymity, freshness, format, sources, geographic, use-case, search operators
- Dynamic year substitution for freshness terms
2025-12-20 22:27:37 +01:00

675 lines
22 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""Multi-lingual search term generation with LibreTranslate support."""
import random
import os
import json
import time
import datetime
from misc import _log
# Year string appended to some freshness terms (evaluated once, at import)
CURRENT_YEAR = str(datetime.date.today().year)

# LibreTranslate defaults; set_config() may override url/enabled at runtime
_libretranslate_url = 'https://lt.mymx.me/translate'
_libretranslate_enabled = True
_libretranslate_timeout = 10

# English base terms, one list per category

# Generic "proxy list" phrasing
BASE_TERMS_GENERIC = [
    'free proxy list',
    'proxy server list',
    'public proxy list',
    'open proxy list',
]

# Protocol-specific lists
BASE_TERMS_PROTOCOL = [
    'socks5 proxy list',
    'socks4 proxy list',
    'http proxy list',
    'https proxy list',
]

# Anonymity levels
BASE_TERMS_ANONYMITY = [
    'anonymous proxy',
    'elite proxy',
    'high anonymity proxy',
    'transparent proxy list',
]

# Freshness / liveness wording; the last two carry the current year
BASE_TERMS_FRESHNESS = [
    'fresh proxy list',
    'working proxy list',
    'verified proxy list',
    'checked proxy list',
    'live proxy list',
    'proxy list today',
    'proxy list updated',
    'proxy list %s' % CURRENT_YEAR,
    'new proxy list %s' % CURRENT_YEAR,
]

# File / output format hints
BASE_TERMS_FORMAT = [
    'proxy list txt',
    'proxy list ip port',
    'proxy list download',
    'proxy txt file',
]

# Places where lists get published
BASE_TERMS_SOURCES = [
    'proxy pastebin',
    'proxy github',
    'proxy list telegram',
    'free proxy api',
]

# Region-specific terms
BASE_TERMS_GEOGRAPHIC = [
    'US proxy list',
    'USA proxy',
    'Europe proxy list',
    'Asia proxy list',
    'Russia proxy list',
    'China proxy list',
]

# Use-case oriented terms
BASE_TERMS_USECASE = [
    'proxy for scraping',
    'fast proxy list',
    'residential proxy list',
    'datacenter proxy list',
]

# Search-engine operator queries
BASE_TERMS_SEARCH_OPS = [
    'filetype:txt proxy list',
    'inurl:proxy.txt',
    'inurl:proxies.txt',
    'intitle:proxy list',
]

# Flat list of every category above (in declaration order) for random picks
BASE_TERMS = sum(
    [
        BASE_TERMS_GENERIC,
        BASE_TERMS_PROTOCOL,
        BASE_TERMS_ANONYMITY,
        BASE_TERMS_FRESHNESS,
        BASE_TERMS_FORMAT,
        BASE_TERMS_SOURCES,
        BASE_TERMS_GEOGRAPHIC,
        BASE_TERMS_USECASE,
        BASE_TERMS_SEARCH_OPS,
    ],
    [],
)

# Subset offered for translation: only the generic, anonymity and freshness
# categories (search operators and technical terms stay English)
TRANSLATABLE_TERMS = sum(
    [BASE_TERMS_GENERIC, BASE_TERMS_ANONYMITY, BASE_TERMS_FRESHNESS],
    [],
)
# Static translations - no API needed
# Format: {lang_code: {english_term: translated_term}}
#
# Every language carries the same ten English keys. Lookups in
# get_translated_term() are exact-match on the English key.
#
# NOTE(review): only 'free proxy list', 'proxy server list',
# 'anonymous proxy', 'elite proxy' and 'proxy list updated' also appear in
# TRANSLATABLE_TERMS. The keys 'socks5 proxy', 'socks4 proxy', 'http proxy',
# 'fresh proxy' and 'working proxy' have no identically spelled entry there
# (e.g. the base term is 'fresh proxy list'), so a randomly chosen term never
# hits them; they are only used when a caller passes that exact key or via
# get_all_terms_for_language(). Confirm whether the keys or the base terms
# should be aligned.
STATIC_TRANSLATIONS = {
    # Russian
    'ru': {
        'free proxy list': u'бесплатный список прокси',
        'socks5 proxy': u'socks5 прокси',
        'socks4 proxy': u'socks4 прокси',
        'http proxy': u'http прокси',
        'proxy server list': u'список прокси серверов',
        'anonymous proxy': u'анонимный прокси',
        'elite proxy': u'элитный прокси',
        'fresh proxy': u'свежие прокси',
        'working proxy': u'рабочие прокси',
        'proxy list updated': u'обновленный список прокси',
    },
    # Chinese
    'zh': {
        'free proxy list': u'免费代理列表',
        'socks5 proxy': u'socks5代理',
        'socks4 proxy': u'socks4代理',
        'http proxy': u'http代理',
        'proxy server list': u'代理服务器列表',
        'anonymous proxy': u'匿名代理',
        'elite proxy': u'高匿代理',
        'fresh proxy': u'最新代理',
        'working proxy': u'可用代理',
        'proxy list updated': u'代理列表更新',
    },
    # Spanish
    'es': {
        'free proxy list': u'lista de proxies gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anónimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies frescos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxies actualizada',
    },
    # Portuguese
    'pt': {
        'free proxy list': u'lista de proxy grátis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista de servidores proxy',
        'anonymous proxy': u'proxy anônimo',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxies novos',
        'working proxy': u'proxies funcionando',
        'proxy list updated': u'lista de proxy atualizada',
    },
    # German
    'de': {
        'free proxy list': u'kostenlose Proxy-Liste',
        'socks5 proxy': u'socks5 Proxy',
        'socks4 proxy': u'socks4 Proxy',
        'http proxy': u'http Proxy',
        'proxy server list': u'Proxy-Server-Liste',
        'anonymous proxy': u'anonymer Proxy',
        'elite proxy': u'Elite-Proxy',
        'fresh proxy': u'frische Proxys',
        'working proxy': u'funktionierende Proxys',
        'proxy list updated': u'aktualisierte Proxy-Liste',
    },
    # French
    'fr': {
        'free proxy list': u'liste de proxy gratuit',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'liste de serveurs proxy',
        'anonymous proxy': u'proxy anonyme',
        'elite proxy': u'proxy élite',
        'fresh proxy': u'proxies frais',
        'working proxy': u'proxies fonctionnels',
        'proxy list updated': u'liste de proxy mise à jour',
    },
    # Japanese
    'ja': {
        'free proxy list': u'無料プロキシリスト',
        'socks5 proxy': u'socks5プロキシ',
        'socks4 proxy': u'socks4プロキシ',
        'http proxy': u'httpプロキシ',
        'proxy server list': u'プロキシサーバーリスト',
        'anonymous proxy': u'匿名プロキシ',
        'elite proxy': u'エリートプロキシ',
        'fresh proxy': u'最新プロキシ',
        'working proxy': u'動作するプロキシ',
        'proxy list updated': u'プロキシリスト更新',
    },
    # Korean
    'ko': {
        'free proxy list': u'무료 프록시 목록',
        'socks5 proxy': u'socks5 프록시',
        'socks4 proxy': u'socks4 프록시',
        'http proxy': u'http 프록시',
        'proxy server list': u'프록시 서버 목록',
        'anonymous proxy': u'익명 프록시',
        'elite proxy': u'엘리트 프록시',
        'fresh proxy': u'최신 프록시',
        'working proxy': u'작동하는 프록시',
        'proxy list updated': u'프록시 목록 업데이트',
    },
    # Arabic
    'ar': {
        'free proxy list': u'قائمة بروكسي مجانية',
        'socks5 proxy': u'بروكسي socks5',
        'socks4 proxy': u'بروكسي socks4',
        'http proxy': u'بروكسي http',
        'proxy server list': u'قائمة خوادم البروكسي',
        'anonymous proxy': u'بروكسي مجهول',
        'elite proxy': u'بروكسي نخبة',
        'fresh proxy': u'بروكسي جديد',
        'working proxy': u'بروكسي يعمل',
        'proxy list updated': u'قائمة بروكسي محدثة',
    },
    # Indonesian
    'id': {
        'free proxy list': u'daftar proxy gratis',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'daftar server proxy',
        'anonymous proxy': u'proxy anonim',
        'elite proxy': u'proxy elite',
        'fresh proxy': u'proxy baru',
        'working proxy': u'proxy aktif',
        'proxy list updated': u'daftar proxy diperbarui',
    },
    # Turkish
    'tr': {
        'free proxy list': u'ücretsiz proxy listesi',
        'socks5 proxy': u'socks5 proxy',
        'socks4 proxy': u'socks4 proxy',
        'http proxy': u'http proxy',
        'proxy server list': u'proxy sunucu listesi',
        'anonymous proxy': u'anonim proxy',
        'elite proxy': u'elit proxy',
        'fresh proxy': u'güncel proxy',
        'working proxy': u'çalışan proxy',
        'proxy list updated': u'güncellenmiş proxy listesi',
    },
    # Vietnamese
    'vi': {
        'free proxy list': u'danh sách proxy miễn phí',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'danh sách máy chủ proxy',
        'anonymous proxy': u'proxy ẩn danh',
        'elite proxy': u'proxy cao cấp',
        'fresh proxy': u'proxy mới',
        'working proxy': u'proxy hoạt động',
        'proxy list updated': u'danh sách proxy cập nhật',
    },
    # Thai
    'th': {
        'free proxy list': u'รายการพร็อกซี่ฟรี',
        'socks5 proxy': u'พร็อกซี่ socks5',
        'socks4 proxy': u'พร็อกซี่ socks4',
        'http proxy': u'พร็อกซี่ http',
        'proxy server list': u'รายการเซิร์ฟเวอร์พร็อกซี่',
        'anonymous proxy': u'พร็อกซี่นิรนาม',
        'elite proxy': u'พร็อกซี่ระดับสูง',
        'fresh proxy': u'พร็อกซี่ใหม่',
        'working proxy': u'พร็อกซี่ใช้งานได้',
        'proxy list updated': u'รายการพร็อกซี่อัพเดท',
    },
    # Polish
    'pl': {
        'free proxy list': u'darmowa lista proxy',
        'socks5 proxy': u'proxy socks5',
        'socks4 proxy': u'proxy socks4',
        'http proxy': u'proxy http',
        'proxy server list': u'lista serwerów proxy',
        'anonymous proxy': u'anonimowe proxy',
        'elite proxy': u'elitarne proxy',
        'fresh proxy': u'świeże proxy',
        'working proxy': u'działające proxy',
        'proxy list updated': u'zaktualizowana lista proxy',
    },
    # Ukrainian
    'uk': {
        'free proxy list': u'безкоштовний список проксі',
        'socks5 proxy': u'socks5 проксі',
        'socks4 proxy': u'socks4 проксі',
        'http proxy': u'http проксі',
        'proxy server list': u'список проксі серверів',
        'anonymous proxy': u'анонімний проксі',
        'elite proxy': u'елітний проксі',
        'fresh proxy': u'свіжі проксі',
        'working proxy': u'робочі проксі',
        'proxy list updated': u'оновлений список проксі',
    },
}
# Language codes with a static table, plus English
LANGUAGES = list(STATIC_TRANSLATIONS) + ['en']

# Target languages reported by the LibreTranslate API; filled in lazily by
# _fetch_available_languages(), which flips the "checked" flag on first use
_libretranslate_langs = set()
_libretranslate_langs_checked = False

# In-memory cache of online translations and its on-disk backing file
_translation_cache = dict()
_cache_file = 'translation_cache.json'
_cache_max_age = 30 * 86400  # seconds (30 days)

# cache_key -> time of last failed API attempt, to avoid hammering the API
_failed_translations = dict()
_failed_cache_ttl = 60 * 60  # seconds (1 hour) before a failure is retried
def set_config(config):
    """Configure translation settings from config object.

    Copies ``scraper.libretranslate_url`` and
    ``scraper.libretranslate_enabled`` into the module-level settings when
    those attributes exist; missing attributes leave the current values
    untouched. When translation ends up enabled, the API language list is
    fetched and the result is logged.

    Args:
        config: Config object with scraper.libretranslate_url and
            scraper.libretranslate_enabled attributes
    """
    global _libretranslate_url, _libretranslate_enabled
    global _libretranslate_langs, _libretranslate_langs_checked

    previous_url = _libretranslate_url
    if hasattr(config, 'scraper'):
        if hasattr(config.scraper, 'libretranslate_url'):
            _libretranslate_url = config.scraper.libretranslate_url
        if hasattr(config.scraper, 'libretranslate_enabled'):
            _libretranslate_enabled = config.scraper.libretranslate_enabled

    if _libretranslate_url != previous_url:
        # The language list is fetched only once per process. If it was
        # already fetched from the previous endpoint it does not describe
        # the newly configured one, so force a re-fetch.
        _libretranslate_langs = set()
        _libretranslate_langs_checked = False

    if _libretranslate_enabled:
        _fetch_available_languages()
        _log('LibreTranslate: enabled (%s) - %d languages' % (
            _libretranslate_url, len(_libretranslate_langs)
        ), 'info')
    else:
        _log('LibreTranslate: disabled', 'debug')
def _normalize_lang_codes(codes):
    """Return a set of codes with zh-* variants folded to 'zh', minus 'en'."""
    normalized = set()
    for code in codes:
        if not code:
            # Skip empty / null codes
            continue
        normalized.add('zh' if code.startswith('zh') else code)
    normalized.discard('en')
    return normalized


def _fetch_available_languages():
    """Fetch available languages from LibreTranslate API.

    Queries the /languages endpoint (derived from the configured translate
    URL) and stores in ``_libretranslate_langs`` the languages that can be
    translated from English (the 'targets' of the 'en' entry). If the
    English entry lists no targets, every reported language code is used
    instead. Chinese variants are collapsed to 'zh' and 'en' is excluded.

    The fetch is attempted at most once per process (guarded by
    ``_libretranslate_langs_checked``); on any error a warning is logged and
    the language set is left empty.
    """
    global _libretranslate_langs, _libretranslate_langs_checked
    if _libretranslate_langs_checked:
        return
    # Mark as checked up front so a failing endpoint is not retried on
    # every call.
    _libretranslate_langs_checked = True
    try:
        import urllib2
        # Derive base URL from translate endpoint; strip a trailing slash
        # first so '.../translate/' does not yield '.../translate/languages'.
        base_url = _libretranslate_url.rstrip('/').rsplit('/', 1)[0]
        languages_url = base_url + '/languages'
        req = urllib2.Request(languages_url)
        req.add_header('Accept', 'application/json')
        req.add_header('User-Agent', 'PPF/1.0')
        resp = urllib2.urlopen(req, timeout=_libretranslate_timeout)
        try:
            langs = json.loads(resp.read())
        finally:
            # Always release the connection, even if decoding fails
            resp.close()
        # Find English entry to get available target languages
        en_targets = set()
        for lang in langs:
            if lang.get('code') == 'en':
                en_targets = set(lang.get('targets') or [])
                break
        if en_targets:
            _libretranslate_langs = _normalize_lang_codes(en_targets)
        else:
            # Fallback: every language code the API reported
            _libretranslate_langs = _normalize_lang_codes(
                [lang.get('code') for lang in langs]
            )
        _log('LibreTranslate languages: %s' % ', '.join(sorted(_libretranslate_langs)), 'debug')
    except Exception as e:
        # Best effort: without a language list the API is simply not used
        _log('failed to fetch LibreTranslate languages: %s' % str(e), 'warn')
        _libretranslate_langs = set()
def _load_cache():
    """Load translation cache from disk.

    Reads ``_cache_file`` (if it exists) into ``_translation_cache``. Both
    the old layout (the file is the translations mapping itself) and the new
    layout (a dict with '_meta' and 'translations') are accepted. An
    unreadable, undecodable or structurally unexpected file results in an
    empty cache and a debug log line; nothing is raised.
    """
    global _translation_cache
    if not os.path.exists(_cache_file):
        return
    try:
        import io
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with io.open(_cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (IOError, OSError, ValueError) as e:
        _log('cache load failed: %s' % str(e), 'debug')
        _translation_cache = {}
        return
    # Handle both old format (just translations) and new format (with metadata)
    if isinstance(data, dict) and '_meta' in data:
        data = data.get('translations', {})
    if not isinstance(data, dict):
        # Valid JSON but not a mapping (e.g. a list): using it as the cache
        # would break later key assignment, so start empty instead.
        _log('cache load failed: %s' % 'unexpected cache format', 'debug')
        data = {}
    _translation_cache = data
def _save_cache():
    """Save translation cache to disk.

    Writes ``_translation_cache`` to ``_cache_file`` as UTF-8 JSON, wrapped
    with a '_meta' section (format version, update time, entry count).
    Saving is best effort: failures are logged at debug level and never
    raised to the caller.
    """
    import io
    data = {
        '_meta': {
            'version': 1,
            'updated': int(time.time()),
            'count': len(_translation_cache)
        },
        'translations': _translation_cache
    }
    try:
        payload = json.dumps(data, ensure_ascii=False, indent=2)
        # With ensure_ascii=False Python 2 may return either a byte string
        # or unicode; normalise to text so the UTF-8 writer below accepts it.
        # (Writing unicode chunks straight to a byte-mode file, as
        # json.dump() would, raises UnicodeEncodeError on non-ASCII text.)
        if isinstance(payload, bytes):
            payload = payload.decode('utf-8')
        with io.open(_cache_file, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (IOError, OSError, ValueError, TypeError) as e:
        # ValueError covers Unicode errors; TypeError covers unserialisable
        # cache values.
        _log('cache save failed: %s' % str(e), 'debug')
def translate_libretranslate(text, target_lang, source_lang='en', api_url=None):
    """Translate text using LibreTranslate API.

    Results are served from ``_translation_cache`` while fresh (younger than
    ``_cache_max_age``); recent failures (within ``_failed_cache_ttl``) are
    not retried. A successful API response is cached and persisted.

    Args:
        text: Text to translate
        target_lang: Target language code (e.g., 'ru', 'zh')
        source_lang: Source language code (default: 'en')
        api_url: LibreTranslate API URL (uses configured default if None)

    Returns:
        Translated text or None on failure (API disabled, language not
        offered by the API, recent failure, request error, or a response
        without 'translatedText')
    """
    if not _libretranslate_enabled:
        return None
    if api_url is None:
        api_url = _libretranslate_url
    # Map language codes (e.g., zh -> zh-Hans for the API)
    # NOTE(review): assumes the server names Simplified Chinese 'zh-Hans';
    # the normalised language set no longer records the server's own code.
    api_target = 'zh-Hans' if target_lang == 'zh' else target_lang
    # Check if target language is available (fetch if not checked yet)
    if not _libretranslate_langs_checked:
        _fetch_available_languages()
    if target_lang not in _libretranslate_langs:
        return None
    cache_key = '%s:%s:%s' % (source_lang, target_lang, text)
    # Check cache first
    if cache_key in _translation_cache:
        cached = _translation_cache[cache_key]
        # Handle both old format (string) and new format (dict with timestamp)
        if isinstance(cached, dict):
            if time.time() - cached.get('time', 0) < _cache_max_age:
                return cached.get('text')
            # Expired entry: fall through and refresh it from the API
        else:
            return cached
    # Check if we recently failed this translation
    if cache_key in _failed_translations:
        if time.time() - _failed_translations[cache_key] < _failed_cache_ttl:
            return None
    try:
        import urllib2
        data = json.dumps({
            'q': text,
            'source': source_lang,
            'target': api_target,
            'format': 'text',
        })
        req = urllib2.Request(api_url, data)
        req.add_header('Content-Type', 'application/json')
        req.add_header('User-Agent', 'PPF/1.0')
        resp = urllib2.urlopen(req, timeout=_libretranslate_timeout)
        try:
            result = json.loads(resp.read())
        finally:
            # Always release the connection, even if decoding fails
            resp.close()
    except Exception as e:
        _failed_translations[cache_key] = time.time()
        _log('translation failed [%s->%s]: %s' % (source_lang, target_lang, str(e)), 'debug')
        return None

    translated = result.get('translatedText') if isinstance(result, dict) else None
    if translated is None:
        # The API answered but without a translation (e.g. an error
        # payload). Record it so the same request is not repeated on every
        # call until the failure TTL expires.
        _failed_translations[cache_key] = time.time()
        _log('translation failed [%s->%s]: %s' % (
            source_lang, target_lang, 'no translatedText in response'), 'debug')
        return None

    # Store with timestamp for cache expiry
    _translation_cache[cache_key] = {
        'text': translated,
        'time': int(time.time())
    }
    # A success supersedes any earlier failure record for this key
    _failed_translations.pop(cache_key, None)
    try:
        _save_cache()
        _log('translated [%s]: %s -> %s' % (target_lang, text, translated), 'debug')
    except Exception as e:
        # Persisting and logging are best effort: a problem here must not
        # turn a successful translation into a reported failure.
        _log('translation bookkeeping failed: %s' % str(e), 'debug')
    return translated
def get_cache_stats():
    """Summarise the current translation cache and API state.

    Returns:
        dict with the number of cached entries, the number of recorded
        failed translations, the cache file path, whether LibreTranslate is
        enabled, its URL, and how many API languages are known
    """
    stats = {}
    stats['entries'] = len(_translation_cache)
    stats['failed_pending'] = len(_failed_translations)
    stats['file'] = _cache_file
    stats['enabled'] = _libretranslate_enabled
    stats['url'] = _libretranslate_url
    stats['api_languages'] = len(_libretranslate_langs)
    return stats
def get_translated_term(term=None, lang=None, use_api=True):
    """Get a search term, optionally translated.

    Resolution order for a non-English language: the static table for that
    language (exact match on the English term), then the LibreTranslate API
    (when allowed and the language is offered), then an English fallback.

    Args:
        term: Specific term to translate (or random if None)
        lang: Target language (or random if None)
        use_api: Whether to use LibreTranslate API for missing translations

    Returns:
        (term, lang) tuple. ``lang`` is 'en' when no translation was
        available; in that case ``term`` is the caller's own term if one was
        given, otherwise a random entry of BASE_TERMS.
    """
    # Remember whether the caller asked for a specific term, so the English
    # fallback can honour it instead of substituting an unrelated one.
    term_requested = term is not None

    # Expand language list to include LibreTranslate-only languages
    all_langs = list(set(LANGUAGES) | _libretranslate_langs)
    if lang is None:
        lang = random.choice(all_langs) if all_langs else 'en'

    if lang == 'en':
        # For English, use full term list
        if term is None:
            term = random.choice(BASE_TERMS)
        return term, lang

    # For other languages, pick from translatable terms
    if term is None:
        term = random.choice(TRANSLATABLE_TERMS)

    # Try static translations first
    translations = STATIC_TRANSLATIONS.get(lang, {})
    if term in translations:
        return translations[term], lang

    # Try LibreTranslate API for missing translations
    if use_api and _libretranslate_enabled and lang in _libretranslate_langs:
        translated = translate_libretranslate(term, lang)
        if translated:
            return translated, lang

    # No translation available: fall back to English
    if term_requested:
        return term, 'en'
    # Random pick: widen to the full English term list
    return random.choice(BASE_TERMS), 'en'
def get_random_search_term():
    """Pick a random search term in a randomly chosen language.

    Returns:
        The term string only (the language code is discarded)
    """
    return get_translated_term()[0]
def get_all_terms_for_language(lang):
    """Return the search terms available for one language.

    Args:
        lang: Language code

    Returns:
        A new list: the static translations for ``lang`` when it has an
        entry in STATIC_TRANSLATIONS; otherwise (including 'en') a copy of
        BASE_TERMS
    """
    has_static_table = lang != 'en' and lang in STATIC_TRANSLATIONS
    if not has_static_table:
        return list(BASE_TERMS)
    return list(STATIC_TRANSLATIONS[lang].values())
def get_mixed_terms(count=5, english_weight=0.3):
    """Build a list of search terms drawn from several languages.

    Args:
        count: Number of terms to return
        english_weight: Probability that a given slot is filled directly
            from the English BASE_TERMS instead of a random-language term

    Returns:
        List of ``count`` search terms in various languages
    """
    def _pick_one():
        # Roll first, then draw from the chosen pool
        if random.random() < english_weight:
            return random.choice(BASE_TERMS)
        return get_random_search_term()

    return [_pick_one() for _ in range(count)]
# Load cache on module import: this is an import-time side effect that reads
# _cache_file (if present) into _translation_cache, so previously cached
# translations are available before the first lookup.
_load_cache()
if __name__ == '__main__':
    # Manual smoke test: prints configuration, sample static translations,
    # optional live API translations (--test-api) and a few mixed terms.
    import sys

    def _printable(value):
        """Return value unchanged if it is a str, else UTF-8 encode it."""
        if isinstance(value, str):
            return value
        return value.encode('utf-8')

    # Fetch available languages from API
    _fetch_available_languages()
    # Test output
    print('LibreTranslate: %s' % ('enabled' if _libretranslate_enabled else 'disabled'))
    print('API URL: %s' % _libretranslate_url)
    print('Static languages: %s' % ', '.join(sorted(STATIC_TRANSLATIONS.keys())))
    api_only = _libretranslate_langs - set(STATIC_TRANSLATIONS.keys())
    print('API-only languages: %s' % (', '.join(sorted(api_only)) if api_only else 'none'))
    print('')
    # Cache stats
    stats = get_cache_stats()
    print('Cache: %d entries in %s' % (stats['entries'], stats['file']))
    print('')
    print('Sample static translations:')
    for lang in sorted(STATIC_TRANSLATIONS.keys())[:5]:
        term, _ = get_translated_term('free proxy list', lang, use_api=False)
        print(' [%s] %s' % (lang, _printable(term)))
    print('')
    # Test LibreTranslate if --test-api flag
    if '--test-api' in sys.argv:
        print('Testing LibreTranslate API...')
        # Use languages that are API-available but not in static translations
        test_langs = list(api_only)[:5] if api_only else ['fr', 'ar']
        for lang in test_langs:
            term, result_lang = get_translated_term('free proxy list', lang, use_api=True)
            print(' [%s] %s' % (result_lang, _printable(term)))
        print('')
        stats = get_cache_stats()
        print('Cache after API test: %d entries' % stats['entries'])
    else:
        print('Run with --test-api to test LibreTranslate API')
    print('')
    print('Random mixed terms:')
    for term in get_mixed_terms(10, english_weight=0.2):
        # Format into one string: without print_function, Python 2 treats
        # print(' ', x) as printing a tuple and shows its repr.
        print('  %s' % _printable(term))