soup_parser: remove dead gumbo code
This commit is contained in:
@@ -1,36 +1,8 @@
|
||||
from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
|
||||
import sys
|
||||
#import gumbo
|
||||
|
||||
# Module-level parser name; soupify_bs4 declares it ``global`` and rebinds it
# before handing it to BeautifulSoup.
parser = 'lxml'
|
||||
def soupify_bs4(html, nohtml=False):
    """Parse *html* with BeautifulSoup and return the resulting soup.

    Unless *nohtml* is true, the markup is first wrapped in
    ``<html><body>...</body></html>``.

    The module-level ``parser`` name is rebound to ``'html.parser'`` at the
    start of every call, and once more if BeautifulSoup raises
    ``FeatureNotFound`` for it.

    NOTE(review): because of that unconditional rebinding, any other value
    stored in the global ``parser`` is never actually passed to
    BeautifulSoup here -- confirm whether that is intended.
    """
    global parser
    parser = 'html.parser'

    if nohtml:
        markup = html
    else:
        markup = '<html><body>%s</body></html>' % (html)

    try:
        return BeautifulSoup(markup, parser)
    except FeatureNotFound:
        # Requested parser is unavailable: retry with 'html.parser'.
        parser = 'html.parser'
        return BeautifulSoup(markup, parser)
|
||||
|
||||
def soupify_gumbo(html, nohtml=False):
    """Parse *html* with ``gumbo.soup_parse`` and return the soup.

    Unless *nohtml* is true, the markup is wrapped in
    ``<html><body>...</body></html>`` before parsing.  When the parsed
    result has no ``body``, the raw input and the soup's ``repr`` are
    printed between ``AAAA`` / ``BBBB`` markers as a debugging aid.

    Any exception raised while parsing is re-raised unchanged after the raw
    *html* has been written to stdout.

    NOTE(review): the file's ``import gumbo`` line is commented out, so
    unless ``gumbo`` is brought into scope some other way this call fails
    with ``NameError`` (which follows the same dump-and-re-raise path).
    """
    htm = html if nohtml else '<html><body>%s</body></html>' % (html)
    try:
        soup = gumbo.soup_parse(htm)
        if not soup.body:
            # Single-argument print(...) behaves identically on Python 2
            # and Python 3, unlike the bare print statement.
            print("AAAA")
            print(html)
            print("BBBB")
            print(repr(soup))
        return soup
    except Exception:
        # Dump the offending markup for debugging, then propagate.
        sys.stdout.write(html)
        raise
|
||||
|
||||
def soupify(html, nohtml=False):
    """Parse *html* into a BeautifulSoup tree and return it.

    Unless *nohtml* is true, the markup is first wrapped in
    ``<html><body>...</body></html>``.

    The ``'lxml'`` parser is tried first; if BeautifulSoup raises
    ``FeatureNotFound`` for it, the markup is parsed with ``'html.parser'``
    instead.
    """
    htm = html if nohtml else '<html><body>%s</body></html>' % (html)
    try:
        return BeautifulSoup(htm, 'lxml')
    except FeatureNotFound:
        # The 'lxml' feature could not be found; use 'html.parser' instead.
        return BeautifulSoup(htm, 'html.parser')
|
||||
|
||||
Reference in New Issue
Block a user