fetch, dbs: minor refactoring
This commit is contained in:
2
dbs.py
2
dbs.py
@@ -693,7 +693,7 @@ def get_database_stats(sqlite):
|
||||
row = sqlite.execute('SELECT COUNT(*) FROM proxylist').fetchone()
|
||||
stats['proxy_count'] = row[0] if row else 0
|
||||
|
||||
row = sqlite.execute('SELECT COUNT(*) FROM proxylist WHERE failed=0').fetchone()
|
||||
row = sqlite.execute('SELECT COUNT(*) FROM proxylist WHERE failed=0 AND tested IS NOT NULL').fetchone()
|
||||
stats['working_count'] = row[0] if row else 0
|
||||
|
||||
row = sqlite.execute('SELECT COUNT(*) FROM uris').fetchone()
|
||||
|
||||
25
fetch.py
25
fetch.py
@@ -5,7 +5,7 @@ import rocksock
|
||||
import network_stats
|
||||
from http2 import RsHttp, _parse_url
|
||||
from soup_parser import soupify
|
||||
from misc import _log
|
||||
from misc import _log, tor_proxy_url
|
||||
|
||||
config = None
|
||||
|
||||
@@ -14,14 +14,6 @@ _proxy_valid_cache = {}
|
||||
_proxy_valid_cache_max = 10000
|
||||
|
||||
|
||||
def tor_proxy_url(torhost):
    """Build a SOCKS5 proxy URL for *torhost* with freshly generated credentials.

    Every call produces a new random username and password (8 characters
    each, drawn from lowercase ASCII letters and digits). The random
    credentials are what give each URL its own Tor circuit (circuit
    isolation).

    torhost -- the "host:port" part placed after the '@' in the URL.

    Returns a string of the form 'socks5://<user>:<passwd>@<torhost>'.
    """
    alphabet = string.ascii_lowercase + string.digits

    def _token():
        # One 8-character credential component.
        return ''.join([random.choice(alphabet) for _ in range(8)])

    # Username is generated first, then the password.
    credentials = (_token(), _token())
    return 'socks5://%s:%s@%s' % (credentials + (torhost,))
|
||||
|
||||
|
||||
class FetchSession(object):
|
||||
"""Reusable fetch session with persistent Tor circuit.
|
||||
|
||||
@@ -927,18 +919,3 @@ def extract_proxies(content, proxydb=None, filter_known=True, proto=None):
|
||||
add_known_proxies([p])
|
||||
|
||||
return len(uniques), new
|
||||
|
||||
def extract_urls(content, urls = None, urignore=None):
    """Collect the hrefs of ``rel="noreferrer"`` anchors found in *content*.

    content  -- markup handed to soupify() for parsing.
    urls     -- optional list of already-known URLs; when non-empty it is
                extended in place and returned. When omitted or empty a
                new list is created.
    urignore -- optional iterable of regex patterns; an href matching any
                of them (re.search semantics) is skipped. None means
                "ignore nothing".

    Returns the list of URLs, without duplicates of entries already in it.
    """
    urls = [] if not urls else urls
    # The default of None used to be iterated directly and raised TypeError.
    patterns = urignore or ()

    soup = soupify(content)
    body = soup.body
    if body is None:
        # NOTE(review): assumes soupify() yields None for a missing <body>,
        # as BeautifulSoup does -- confirm against soup_parser.
        return urls

    for a in body.find_all('a'):
        attrs = a.attrs
        # Only anchors explicitly marked rel="noreferrer" are of interest.
        if 'rel' not in attrs or 'noreferrer' not in attrs['rel']:
            continue
        # An anchor without href used to raise KeyError; skip it instead.
        href = attrs.get('href')
        if href is None or href in urls:
            continue
        if any(re.search(pattern, href) for pattern in patterns):
            continue
        urls.append(href)
    return urls
|
||||
|
||||
|
||||
Reference in New Issue
Block a user