from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound


def soupify(html, nohtml=False):
    """Parse *html* into a ``BeautifulSoup`` tree.

    The ``lxml`` parser is requested first. If BeautifulSoup raises
    ``FeatureNotFound`` for it, the markup is parsed again with
    ``html.parser`` instead.

    Args:
        html: Markup to parse.
        nohtml: When true, *html* is handed to BeautifulSoup untouched.
            Otherwise it is first rendered through ``'%s' % (html)``.

    Returns:
        The ``BeautifulSoup`` object built from the markup.
    """
    if nohtml:
        markup = html
    else:
        # NOTE(review): this template adds no wrapper around the input; the
        # flag name suggests one may have been intended -- confirm.
        markup = '%s' % (html)

    try:
        soup = BeautifulSoup(markup, 'lxml')
    except FeatureNotFound:
        # Requested parser is not available; retry with 'html.parser'.
        soup = BeautifulSoup(markup, 'html.parser')
    return soup