refactor core modules, integrate network stats
ppf.py
@@ -11,7 +11,7 @@ from misc import _log
 from config import Config
 import fetch
 import sys
-from soup_parser import soupify, set_nobs
+from soup_parser import set_nobs
 import re
 import threading
 import random
@@ -49,15 +49,17 @@ def format_duration(seconds):
     return '%dd %dh' % (d, h) if h else '%dd' % d
 
 
-def import_from_file(fn, sqlite):
-    with open(fn, 'r') as f:
-        urls = [ url for url in f.read().split('\n') if url ]
-    cinc = 0
-    while True:
-        chunk = urls[cinc:cinc+200]
-        if chunk: dbs.insert_urls(chunk, 'import.txt', urldb)
-        else: break
-        cinc = cinc + 200
+def import_from_file(fn, urldb):
+    """Import URLs from a text file into the database."""
+    try:
+        with open(fn, 'r') as f:
+            urls = [url.strip() for url in f if url.strip()]
+    except IOError:
+        return  # File not found, silently skip
+    for i in range(0, len(urls), 200):
+        chunk = urls[i:i+200]
+        if chunk:
+            dbs.insert_urls(chunk, 'import.txt', urldb)
 
 
 def get_content_type(url, proxy):
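Note on the new import_from_file(): it batches URLs in chunks of 200 and hands each chunk to dbs.insert_urls(), which is not part of this diff. A minimal sketch of what such a helper could look like, assuming a sqlite3 connection and the 'uris' table referenced later in this commit; the 'source' column and table layout here are assumptions, not taken from the real dbs module:

    import sqlite3

    def insert_urls(urls, source, urldb):
        # Batched, duplicate-ignoring insert; urldb is assumed to be a sqlite3.Connection.
        cur = urldb.cursor()
        cur.executemany('INSERT OR IGNORE INTO uris (url, source) VALUES (?, ?)',
                        [(u, source) for u in urls])
        urldb.commit()

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE uris (url TEXT PRIMARY KEY, source TEXT)')
    insert_urls(['http://example.com/proxy-list.txt'], 'import.txt', conn)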
@@ -74,38 +76,6 @@ def is_good_content_type(string):
         if ct.lower() in string.lower(): return True
     return False
 
-def is_bad_url(uri, domain=None, samedomain=False):
-    # if uri needs to be from same domain and domains missmatch
-    if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
-        return True
-    for u in urignore:
-        if re.findall(u, uri): return True
-    return False
-
-def extract_urls(html, url):
-    mytime = int(time.time())
-    proto = url.split(':')[0]
-    domain = url.split('/')[2]
-    urls = []
-
-    soup = soupify(html, nohtml=True)
-
-    for a in soup.find_all('a', href=True):
-        item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
-        item = item.strip()
-
-        if item.startswith('www.'):
-            item = 'http://%s' % item
-        elif not item.startswith('http'):
-            if not item.startswith('/'): item = '/%s' % item
-            item = '%s://%s%s' % (proto,domain,item)
-
-        elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
-            continue
-        if not item in urls: urls.append(item)
-
-    if urls: dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
-
 def import_proxies_from_file(proxydb, fn):
     content = open(fn, 'r').read()
     # Detect protocol from filename (e.g., socks5.txt, http-proxies.txt)
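Note on the removed extract_urls(): it normalized relative hrefs by hand, prefixing 'proto://domain' to paths and 'http://' to bare 'www.' hosts, and skipped links matched by the urignore patterns or from foreign domains when extract_samedomain is set. For reference only (not part of this commit), the standard library gives the same absolute-URL join:

    try:
        from urlparse import urljoin       # Python 2, matching the module above
    except ImportError:
        from urllib.parse import urljoin   # Python 3

    base = 'http://example.com/free-proxy-lists'
    print(urljoin(base, '/socks5/page2.html'))  # -> http://example.com/socks5/page2.html
    # Bare 'www.' hosts are still treated as relative paths by urljoin,
    # which is why the removed code prefixed them with 'http://' explicitly.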
@@ -142,84 +112,97 @@ class Leechered(threading.Thread):
     def run(self):
         self.status = 'nok'
 
+        try:
             if not self.content_type: self.content_type = get_content_type(self.url, self.proxy)
 
             if is_good_content_type(self.content_type):
                 try:
                     content = fetch.fetch_contents(self.url, proxy=self.proxy)
                 except KeyboardInterrupt as e:
                     raise e
                 except Exception as e:
                     try:
                         err_msg = repr(e)
                         if isinstance(err_msg, unicode):
                             err_msg = err_msg.encode('ascii', 'backslashreplace')
                     except:
                         err_msg = type(e).__name__
                     _log('%s: fetch error: %s' % (self.url.split('/')[2], err_msg), 'error')
                     content = ''
             else:
                 content = ''
 
             # Detect protocol from source URL (e.g., .../socks5/list.txt)
             proto = fetch.detect_proto_from_path(self.url)
             unique = fetch.extract_proxies(content, filter_known=False, proto=proto)
 
             # Compute hash of all extracted proxies for change detection
             self.new_hash = dbs.compute_proxy_list_hash(unique)
 
             # Check if content unchanged (same proxies as last time)
             if self.new_hash and self.content_hash and self.new_hash == self.content_hash:
                 self.hash_unchanged = True
                 self.proxylist = []
                 self.stale_count += 1
                 next_check = config.ppf.checktime + (self.error + self.stale_count) * config.ppf.perfail_checktime
                 _log('%s: unchanged (hash match), next in %s' % (self.url.split('/')[2], format_duration(next_check)), 'stale')
                 # Content unchanged - increment stale_count, update check_time
                 self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
                 self.status = 'ok'
                 return
 
             # Content changed or first fetch - reset stale_count, proceed with normal processing
             self.stale_count = 0
             # unique is list of (address, proto) tuples; filter by address, keep tuple
             self.proxylist = [(addr, pr) for addr, pr in unique if not fetch.is_known_proxy(addr)]
             proxy_count = len(self.proxylist)
 
             if self.retrievals == 0: # new site
                 if content and not self.proxylist: # site works but has zero proxy addresses
                     self.error += 1
                     self.stale_count += 1
                 elif proxy_count:
                     self.error = 0
                     self.stale_count = 0
                 else:
                     self.error += 2
                     self.stale_count += 2
             else: # not a new site
                 # proxylist is empty
                 if not proxy_count:
                     self.stale_count += 1
                 # proxylist is not empty: site is working
                 else:
                     self.stale_count = 0
                     self.error = 0
                 # site has no content
                 if not content:
                     self.error += 1
                     self.stale_count += 1
-                #else:
-                #    self.retrievals += 1
-                #    self.error = 0
-                #    self.stale_count = 0
                 # site has proxies
                 if proxy_count:
                     self.error = 0
                     self.stale_count = 0
-            extract_urls(content, self.url)
 
             self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added+len(self.proxylist), self.content_type, self.url)
             self.status = 'ok'
+            return
+        except KeyboardInterrupt:
+            raise
+        except Exception as e:
+            try:
+                host = self.url.split('/')[2] if '/' in self.url else self.url
+                err_msg = repr(e)
+                if isinstance(err_msg, unicode):
+                    err_msg = err_msg.encode('ascii', 'backslashreplace')
+            except:
+                host = 'unknown'
+                err_msg = type(e).__name__
+            _log('%s: thread error: %s' % (host, err_msg), 'error')
+            # Set error state so site gets retried later
+            self.error += 1
+            self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
+            self.status = 'nok'
 
 
 def main():
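Note on the change-detection path in run(): dbs.compute_proxy_list_hash() is not shown in this diff; the method only relies on it returning a stable digest for the same set of (address, proto) tuples, so a source whose proxy list has not changed can be skipped and rescheduled. A minimal sketch under that assumption (the real implementation may differ):

    import hashlib

    def compute_proxy_list_hash(proxies):
        # proxies: list of (address, proto) tuples; order-insensitive.
        # Returning None for an empty list keeps the 'if self.new_hash and ...' guard false.
        if not proxies:
            return None
        joined = '\n'.join('%s,%s' % (addr, proto) for addr, proto in sorted(proxies))
        return hashlib.sha1(joined.encode('utf-8')).hexdigest()

    print(compute_proxy_list_hash([('1.2.3.4:8080', 'http'), ('5.6.7.8:1080', 'socks5')]))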
@@ -247,12 +230,15 @@ def main():
     else:
         watcherd = None
 
-    # start scraper if enabled
-    scraperd = None
+    # start scraper threads if enabled
+    scrapers = []
     if config.scraper.enabled:
         import scraper
-        scraperd = scraper.Scraper(config)
-        scraperd.start()
+        for i in range(config.scraper.threads):
+            s = scraper.Scraper(config)
+            s.start()
+            scrapers.append(s)
+        _log('started %d scraper thread(s)' % len(scrapers), 'info')
 
     qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type,content_hash FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM()'
     threads = []
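Note on the scheduling query: the WHERE clause in qurl applies the same backoff as next_check in run() above, i.e. a source is due when check_time + checktime + (error + stale_count) * perfail_checktime is in the past, so failing or stale sources are rechecked progressively less often. A worked example with assumed config values (the real checktime/perfail_checktime are not part of this commit):

    import time

    checktime = 3600           # base recheck interval in seconds (assumed)
    perfail_checktime = 1800   # extra delay per error/stale hit (assumed)

    def is_due(check_time, error, stale_count, now=None):
        now = now if now is not None else int(time.time())
        return check_time + checktime + (error + stale_count) * perfail_checktime < now

    now = int(time.time())
    print(is_due(now - 2 * 3600, error=0, stale_count=0, now=now))  # True: one base interval has passed
    print(is_due(now - 2 * 3600, error=0, stale_count=3, now=now))  # False: 3 stale hits push the next check out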
@@ -305,8 +291,8 @@ def main():
             t.start()
 
     except KeyboardInterrupt:
-        if scraperd:
-            scraperd.stop()
+        for s in scrapers:
+            s.stop()
         if watcherd:
             watcherd.stop()
             watcherd.finish()
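Note on shutdown: the KeyboardInterrupt handler now stops every thread in the scrapers list instead of a single scraperd instance. scraper.Scraper is not part of this diff; the loop only assumes each thread exposes a stop() method. A common pattern for that contract, sketched here as an assumption:

    import threading
    import time

    class Scraper(threading.Thread):
        # Hypothetical sketch of the stop() contract; the real scraper.Scraper may differ.
        def __init__(self, config):
            threading.Thread.__init__(self)
            self.config = config
            self._stop_event = threading.Event()

        def run(self):
            while not self._stop_event.is_set():
                # ... scrape one batch here ...
                time.sleep(0.1)

        def stop(self):
            self._stop_event.set()

    s = Scraper(config=None)
    s.start()
    s.stop()
    s.join()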
@@ -328,7 +314,7 @@ if __name__ == '__main__':
     if config.args.nobs:
         set_nobs(True)
 
-    if config.args.profile:
+    if config.args.profile or config.common.profiling:
         _log('profiling enabled, output to data/profile.stats', 'info')
         _profiler = cProfile.Profile()
         try:
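Note on profiling: this branch only creates the cProfile.Profile object and logs that output goes to data/profile.stats; the enable/dump calls fall outside the hunk. The usual cProfile pattern, consistent with that path, looks like this (sketch, not the exact code from ppf.py):

    import cProfile

    _profiler = cProfile.Profile()
    _profiler.enable()
    try:
        sum(range(1000000))    # stand-in for the real main()
    finally:
        _profiler.disable()
        _profiler.dump_stats('profile.stats')  # the commit logs data/profile.stats as the target

    # Inspect afterwards with:  python -m pstats profile.stats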