fix imports

+ move soup_parser.py to includes
+ reorganize imports
+ remove duplicates
This commit is contained in:
mickael
2019-01-04 21:18:36 +00:00
parent 4926d6c383
commit 4a60a23df3
2 changed files with 3 additions and 7 deletions

36
includes/soup_parser.py Normal file
View File

@@ -0,0 +1,36 @@
from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
import sys
#import gumbo
# Name of the parser backend handed to BeautifulSoup. soupify_bs4 declares
# this name global and rebinds it to 'html.parser' on every call.
parser = 'lxml'
def soupify_bs4(html, nohtml=False):
    """Build a BeautifulSoup tree from *html* and return it.

    Unless *nohtml* is true, the markup is first wrapped in an
    ``<html><body>...</body></html>`` skeleton. The module-level ``parser``
    is rebound to ``'html.parser'`` before parsing; if BeautifulSoup raises
    ``FeatureNotFound`` the parse is retried, again with ``'html.parser'``.
    """
    global parser
    # NOTE(review): this unconditional rebinding means the module-level
    # default is never used by this function -- confirm it is intentional.
    parser = 'html.parser'
    if nohtml:
        markup = html
    else:
        markup = '<html><body>%s</body></html>' % html
    try:
        return BeautifulSoup(markup, parser)
    except FeatureNotFound:
        # Remember the fallback parser for later calls, then retry once.
        parser = 'html.parser'
        return BeautifulSoup(markup, parser)
def soupify_gumbo(html, nohtml=False):
    """Parse *html* with ``gumbo.soup_parse`` and return the resulting soup.

    Unless *nohtml* is true, the markup is first wrapped in an
    ``<html><body>...</body></html>`` skeleton.

    If the parsed soup has no ``body``, the input and ``repr`` of the soup
    are printed to stdout between ``AAAA``/``BBBB`` markers as a diagnostic;
    the soup is still returned. If parsing raises, the raw *html* is written
    to stdout and the exception is re-raised unchanged.

    Raises:
        ImportError: if the ``gumbo`` package is not installed.
    """
    # Imported here because the module-level import is commented out;
    # without it the gumbo reference below fails with NameError on every
    # call. Keeping it local leaves module import free of the dependency.
    import gumbo

    htm = html if nohtml else '<html><body>%s</body></html>' % html
    try:
        soup = gumbo.soup_parse(htm)
        if not soup.body:
            # Single-argument parenthesised prints give the same output on
            # Python 2 and are valid syntax on Python 3.
            print("AAAA")
            print(html)
            print("BBBB")
            print(repr(soup))
        return soup
    except Exception:
        # Dump the offending input before propagating the original error.
        sys.stdout.write(html)
        raise
def soupify(html, nohtml=False):
    """Turn *html* into a soup object using the BeautifulSoup backend.

    Both arguments are forwarded unchanged to ``soupify_bs4`` and its
    result is returned as-is.
    """
    # Alternative backend, currently disabled: soupify_gumbo(html, nohtml)
    soup = soupify_bs4(html, nohtml)
    return soup