From c241f1a766718eab4340a8f92bdb9ce1a39a5845 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Serneels?=
Date: Wed, 1 May 2019 23:19:50 +0200
Subject: [PATCH] make use of dbs.insert_urls()

---
 ppf.py | 25 +++----------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/ppf.py b/ppf.py
index b9d1be3..306315f 100755
--- a/ppf.py
+++ b/ppf.py
@@ -15,13 +15,8 @@ config = Config()
 
 def import_from_file(fn, sqlite):
     with open(fn, 'r') as f:
-        for u in f.read().split('\n'):
-            if not len(u): continue
-            exists = [ i[0] for i in sqlite.execute('SELECT url FROM uris WHERE url=?',(u,)).fetchall() ]
-            if exists: continue
-            print('adding "%s"' % u)
-            sqlite.execute('INSERT INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', (int(time.time()),u,0,0,0,0,0))
-        sqlite.commit()
+        urls = [ url for url in f.read().split('\n') ]
+        dbs.insert_urls(urls, 'import.txt', urldb)
 
 
 def get_content_type(url):
@@ -97,22 +92,8 @@ def extract_urls(html, url):
             item = '%s://%s%s' % (proto,domain,item)
         if not item in urls: urls.append(item)
 
-        if len(urls) < 200: continue
-        insert_if_not_exists(urls)
-        urls = []
-
-    if len(urls): insert_if_not_exists(urls)
-
-def insert_if_not_exists(urls):
-    mytime = int(time.time())
-    query = 'SELECT url FROM uris WHERE %s' % ' OR '.join( [ 'url=?' for u in urls ] )
-    known = [ item[0] for item in urldb.execute(query, urls) ]
-    args = [ [mytime, u, (mytime - 3600), 1, 0,0,0] for u in urls if not u in known ]
-    if len(args):
-        print('new items: %s' % args)
-        urldb.executemany('INSERT OR IGNORE INTO uris (added,url,check_time,error,stale_count,proxies_added,retrievals) VALUES (?,?,?,?,?,?,?)', args)
-        urldb.commit()
+    if len(urls): dbs.insert_urls(urls, url, urldb) #insert_if_not_exists(urls)
 
 
 def import_proxies_from_file(proxydb, fn):
     content = open(fn, 'r').read()
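
Note: the dbs module itself is not part of this patch, so insert_urls() is not shown above. The sketch below is a rough guess at what such a helper could look like, inferred from the removed insert_if_not_exists() function and the old import_from_file() loop; the signature insert_urls(urls, source, db), the use of the source argument for logging only, and the column defaults are assumptions, not the actual dbs code.

    # Hypothetical sketch of dbs.insert_urls() -- signature and behaviour
    # assumed from the removed insert_if_not_exists() helper in this patch.
    import time

    def insert_urls(urls, source, db):
        """Insert URLs not yet present in the uris table.

        urls:   iterable of URL strings (blank entries from split('\n') are skipped)
        source: label used only for logging, e.g. 'import.txt' or the referring page
        db:     sqlite3.Connection whose uris table already exists
        """
        urls = [u for u in urls if u]              # drop empty lines
        if not urls:
            return

        now = int(time.time())
        # Skip URLs that are already known, as the removed helper did.
        placeholders = ' OR '.join('url=?' for _ in urls)
        known = {row[0] for row in
                 db.execute('SELECT url FROM uris WHERE %s' % placeholders, urls)}

        args = [(now, u, now - 3600, 1, 0, 0, 0) for u in urls if u not in known]
        if args:
            print('adding %d urls from %s' % (len(args), source))
            db.executemany(
                'INSERT OR IGNORE INTO uris '
                '(added,url,check_time,error,stale_count,proxies_added,retrievals) '
                'VALUES (?,?,?,?,?,?,?)', args)
            db.commit()

With a helper along these lines, both call sites in the patch (file import and extract_urls) share one dedup-and-insert path instead of two slightly different copies of the same SQL.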