refactor core modules, integrate network stats
ppf.py
@@ -11,7 +11,7 @@ from misc import _log
 from config import Config
 import fetch
 import sys
-from soup_parser import soupify, set_nobs
+from soup_parser import set_nobs
 import re
 import threading
 import random
@@ -49,15 +49,17 @@ def format_duration(seconds):
     return '%dd %dh' % (d, h) if h else '%dd' % d
 
 
-def import_from_file(fn, sqlite):
-    with open(fn, 'r') as f:
-        urls = [ url for url in f.read().split('\n') if url ]
-    cinc = 0
-    while True:
-        chunk = urls[cinc:cinc+200]
-        if chunk: dbs.insert_urls(chunk, 'import.txt', urldb)
-        else: break
-        cinc = cinc + 200
+def import_from_file(fn, urldb):
+    """Import URLs from a text file into the database."""
+    try:
+        with open(fn, 'r') as f:
+            urls = [url.strip() for url in f if url.strip()]
+    except IOError:
+        return  # File not found, silently skip
+    for i in range(0, len(urls), 200):
+        chunk = urls[i:i+200]
+        if chunk:
+            dbs.insert_urls(chunk, 'import.txt', urldb)
 
 
 def get_content_type(url, proxy):
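Note on the new import_from_file(): it batches URLs in chunks of 200 and hands each chunk to dbs.insert_urls(), which is not part of this diff. A minimal sketch of what such a helper could look like, assuming a sqlite3 connection and the 'uris' table referenced later in this commit; the 'source' column and table layout here are assumptions, not taken from the real dbs module:

    import sqlite3

    def insert_urls(urls, source, urldb):
        # Batched, duplicate-ignoring insert; urldb is assumed to be a sqlite3.Connection.
        cur = urldb.cursor()
        cur.executemany('INSERT OR IGNORE INTO uris (url, source) VALUES (?, ?)',
                        [(u, source) for u in urls])
        urldb.commit()

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE uris (url TEXT PRIMARY KEY, source TEXT)')
    insert_urls(['http://example.com/proxy-list.txt'], 'import.txt', conn)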
@@ -74,38 +76,6 @@ def is_good_content_type(string):
         if ct.lower() in string.lower(): return True
     return False
 
-def is_bad_url(uri, domain=None, samedomain=False):
-    # if uri needs to be from same domain and domains missmatch
-    if samedomain and str(uri.split('/')[2]).lower() != str(domain).lower():
-        return True
-    for u in urignore:
-        if re.findall(u, uri): return True
-    return False
-
-def extract_urls(html, url):
-    mytime = int(time.time())
-    proto = url.split(':')[0]
-    domain = url.split('/')[2]
-    urls = []
-
-    soup = soupify(html, nohtml=True)
-
-    for a in soup.find_all('a', href=True):
-        item = a['href'].encode('utf-8') if isinstance(a['href'], unicode) else a['href']
-        item = item.strip()
-
-        if item.startswith('www.'):
-            item = 'http://%s' % item
-        elif not item.startswith('http'):
-            if not item.startswith('/'): item = '/%s' % item
-            item = '%s://%s%s' % (proto,domain,item)
-
-        elif is_bad_url(item, domain=domain, samedomain=config.ppf.extract_samedomain):
-            continue
-        if not item in urls: urls.append(item)
-
-    if urls: dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
-
 def import_proxies_from_file(proxydb, fn):
     content = open(fn, 'r').read()
     # Detect protocol from filename (e.g., socks5.txt, http-proxies.txt)
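Note on the removed extract_urls(): it normalized relative hrefs by hand, prefixing 'proto://domain' to paths and 'http://' to bare 'www.' hosts, and skipped links matched by the urignore patterns or from foreign domains when extract_samedomain is set. For reference only (not part of this commit), the standard library gives the same absolute-URL join:

    try:
        from urlparse import urljoin       # Python 2, matching the module above
    except ImportError:
        from urllib.parse import urljoin   # Python 3

    base = 'http://example.com/free-proxy-lists'
    print(urljoin(base, '/socks5/page2.html'))  # -> http://example.com/socks5/page2.html
    # Bare 'www.' hosts are still treated as relative paths by urljoin,
    # which is why the removed code prefixed them with 'http://' explicitly.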
@@ -142,84 +112,97 @@ class Leechered(threading.Thread):
     def run(self):
         self.status = 'nok'
 
+        try:
             if not self.content_type: self.content_type = get_content_type(self.url, self.proxy)
 
             if is_good_content_type(self.content_type):
                 try:
                     content = fetch.fetch_contents(self.url, proxy=self.proxy)
                 except KeyboardInterrupt as e:
                     raise e
                 except Exception as e:
                     try:
                         err_msg = repr(e)
                         if isinstance(err_msg, unicode):
                             err_msg = err_msg.encode('ascii', 'backslashreplace')
                     except:
                         err_msg = type(e).__name__
                     _log('%s: fetch error: %s' % (self.url.split('/')[2], err_msg), 'error')
                     content = ''
             else:
                 content = ''
 
             # Detect protocol from source URL (e.g., .../socks5/list.txt)
             proto = fetch.detect_proto_from_path(self.url)
             unique = fetch.extract_proxies(content, filter_known=False, proto=proto)
 
             # Compute hash of all extracted proxies for change detection
             self.new_hash = dbs.compute_proxy_list_hash(unique)
 
             # Check if content unchanged (same proxies as last time)
             if self.new_hash and self.content_hash and self.new_hash == self.content_hash:
                 self.hash_unchanged = True
                 self.proxylist = []
                 self.stale_count += 1
                 next_check = config.ppf.checktime + (self.error + self.stale_count) * config.ppf.perfail_checktime
                 _log('%s: unchanged (hash match), next in %s' % (self.url.split('/')[2], format_duration(next_check)), 'stale')
                 # Content unchanged - increment stale_count, update check_time
                 self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
                 self.status = 'ok'
                 return
 
             # Content changed or first fetch - reset stale_count, proceed with normal processing
             self.stale_count = 0
             # unique is list of (address, proto) tuples; filter by address, keep tuple
             self.proxylist = [(addr, pr) for addr, pr in unique if not fetch.is_known_proxy(addr)]
             proxy_count = len(self.proxylist)
 
             if self.retrievals == 0: # new site
                 if content and not self.proxylist: # site works but has zero proxy addresses
                     self.error += 1
                     self.stale_count += 1
                 elif proxy_count:
                     self.error = 0
                     self.stale_count = 0
                 else:
                     self.error += 2
                     self.stale_count += 2
             else: # not a new site
                 # proxylist is empty
                 if not proxy_count:
                     self.stale_count += 1
                 # proxylist is not empty: site is working
                 else:
                     self.stale_count = 0
                     self.error = 0
                 # site has no content
                 if not content:
                     self.error += 1
                     self.stale_count += 1
-                #else:
-                #    self.retrievals += 1
-                #    self.error = 0
-                #    self.stale_count = 0
                 # site has proxies
                 if proxy_count:
                     self.error = 0
                     self.stale_count = 0
-            extract_urls(content, self.url)
 
             self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added+len(self.proxylist), self.content_type, self.url)
             self.status = 'ok'
+            return
+        except KeyboardInterrupt:
+            raise
+        except Exception as e:
+            try:
+                host = self.url.split('/')[2] if '/' in self.url else self.url
+                err_msg = repr(e)
+                if isinstance(err_msg, unicode):
+                    err_msg = err_msg.encode('ascii', 'backslashreplace')
+            except:
+                host = 'unknown'
+                err_msg = type(e).__name__
+            _log('%s: thread error: %s' % (host, err_msg), 'error')
+            # Set error state so site gets retried later
+            self.error += 1
+            self.execute = (self.error, self.stale_count, int(time.time()), self.retrievals, self.proxies_added, self.content_type, self.url)
+            self.status = 'nok'
 
 
 def main():
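Note on the change-detection path in run(): dbs.compute_proxy_list_hash() is not shown in this diff; the method only relies on it returning a stable digest for the same set of (address, proto) tuples, so a source whose proxy list has not changed can be skipped and rescheduled. A minimal sketch under that assumption (the real implementation may differ):

    import hashlib

    def compute_proxy_list_hash(proxies):
        # proxies: list of (address, proto) tuples; order-insensitive.
        # Returning None for an empty list keeps the 'if self.new_hash and ...' guard false.
        if not proxies:
            return None
        joined = '\n'.join('%s,%s' % (addr, proto) for addr, proto in sorted(proxies))
        return hashlib.sha1(joined.encode('utf-8')).hexdigest()

    print(compute_proxy_list_hash([('1.2.3.4:8080', 'http'), ('5.6.7.8:1080', 'socks5')]))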
@@ -247,12 +230,15 @@ def main():
     else:
         watcherd = None
 
-    # start scraper if enabled
-    scraperd = None
+    # start scraper threads if enabled
+    scrapers = []
     if config.scraper.enabled:
         import scraper
-        scraperd = scraper.Scraper(config)
-        scraperd.start()
+        for i in range(config.scraper.threads):
+            s = scraper.Scraper(config)
+            s.start()
+            scrapers.append(s)
+        _log('started %d scraper thread(s)' % len(scrapers), 'info')
 
     qurl = 'SELECT url,stale_count,error,retrievals,proxies_added,content_type,content_hash FROM uris WHERE error < ? and (check_time+?+((error+stale_count)*?) <?) ORDER BY RANDOM()'
     threads = []
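Note on the scheduling query: the WHERE clause in qurl applies the same backoff as next_check in run() above, i.e. a source is due when check_time + checktime + (error + stale_count) * perfail_checktime is in the past, so failing or stale sources are rechecked progressively less often. A worked example with assumed config values (the real checktime/perfail_checktime are not part of this commit):

    import time

    checktime = 3600           # base recheck interval in seconds (assumed)
    perfail_checktime = 1800   # extra delay per error/stale hit (assumed)

    def is_due(check_time, error, stale_count, now=None):
        now = now if now is not None else int(time.time())
        return check_time + checktime + (error + stale_count) * perfail_checktime < now

    now = int(time.time())
    print(is_due(now - 2 * 3600, error=0, stale_count=0, now=now))  # True: one base interval has passed
    print(is_due(now - 2 * 3600, error=0, stale_count=3, now=now))  # False: 3 stale hits push the next check out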
@@ -305,8 +291,8 @@ def main():
             t.start()
 
     except KeyboardInterrupt:
-        if scraperd:
-            scraperd.stop()
+        for s in scrapers:
+            s.stop()
         if watcherd:
             watcherd.stop()
             watcherd.finish()
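Note on shutdown: the KeyboardInterrupt handler now stops every thread in the scrapers list instead of a single scraperd instance. scraper.Scraper is not part of this diff; the loop only assumes each thread exposes a stop() method. A common pattern for that contract, sketched here as an assumption:

    import threading
    import time

    class Scraper(threading.Thread):
        # Hypothetical sketch of the stop() contract; the real scraper.Scraper may differ.
        def __init__(self, config):
            threading.Thread.__init__(self)
            self.config = config
            self._stop_event = threading.Event()

        def run(self):
            while not self._stop_event.is_set():
                # ... scrape one batch here ...
                time.sleep(0.1)

        def stop(self):
            self._stop_event.set()

    s = Scraper(config=None)
    s.start()
    s.stop()
    s.join()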
@@ -328,7 +314,7 @@ if __name__ == '__main__':
     if config.args.nobs:
         set_nobs(True)
 
-    if config.args.profile:
+    if config.args.profile or config.common.profiling:
         _log('profiling enabled, output to data/profile.stats', 'info')
         _profiler = cProfile.Profile()
         try:
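Note on profiling: this branch only creates the cProfile.Profile object and logs that output goes to data/profile.stats; the enable/dump calls fall outside the hunk. The usual cProfile pattern, consistent with that path, looks like this (sketch, not the exact code from ppf.py):

    import cProfile

    _profiler = cProfile.Profile()
    _profiler.enable()
    try:
        sum(range(1000000))    # stand-in for the real main()
    finally:
        _profiler.disable()
        _profiler.dump_stats('profile.stats')  # the commit logs data/profile.stats as the target

    # Inspect afterwards with:  python -m pstats profile.stats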