soup_parser: remove dead gumbo code
This commit is contained in:
@@ -1,36 +1,8 @@
|
|||||||
from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
|
from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
|
||||||
import sys
|
|
||||||
#import gumbo
|
|
||||||
|
|
||||||
# Name of the BeautifulSoup parser backend. soupify_bs4() declares this as a
# global and rebinds it on every call.
parser = 'lxml'
|
|
||||||
def soupify_bs4(html, nohtml=False):
    """Parse *html* with BeautifulSoup and return the resulting soup.

    Unless *nohtml* is true, the markup is first wrapped in
    ``<html><body>...</body></html>``.

    Side effect: the module-level ``parser`` global is rebound to
    ``'html.parser'`` on every call, so that is the backend passed to
    BeautifulSoup.
    """
    global parser
    parser = 'html.parser'

    if nohtml:
        markup = html
    else:
        markup = '<html><body>%s</body></html>' % html

    try:
        return BeautifulSoup(markup, parser)
    except FeatureNotFound:
        # bs4 rejected the requested backend: retry with 'html.parser'.
        parser = 'html.parser'
        return BeautifulSoup(markup, parser)
|
|
||||||
|
|
||||||
def soupify_gumbo(html, nohtml=False):
    """Parse *html* with ``gumbo.soup_parse`` and return the resulting soup.

    Unless *nohtml* is true, the markup is wrapped in
    ``<html><body>...</body></html>`` before parsing. When the parsed soup
    has no ``body``, the original *html* and the soup's repr are dumped to
    stdout between ``AAAA`` / ``BBBB`` markers; the soup is still returned.

    Any exception raised while parsing is re-raised unchanged after the raw
    *html* has been written to stdout.

    NOTE(review): relies on a module-level ``gumbo`` name -- confirm it is
    actually imported, otherwise every call fails with NameError.
    """
    htm = html if nohtml else '<html><body>%s</body></html>' % html
    try:
        soup = gumbo.soup_parse(htm)
        if not soup.body:
            # Single-argument print(...) gives the same output whether it is
            # parsed as a Python 2 statement or a Python 3 function call.
            print("AAAA")
            print(html)
            print("BBBB")
            print(repr(soup))
        return soup
    except Exception:
        # Dump the offending input for debugging, then propagate the error.
        sys.stdout.write(html)
        raise
|
|
||||||
|
|
||||||
def soupify(html, nohtml=False):
    """Return a BeautifulSoup tree for *html*.

    Unless *nohtml* is true, the markup is first wrapped in
    ``<html><body>...</body></html>``. The ``'lxml'`` parser is tried first;
    if bs4 raises ``FeatureNotFound`` the markup is parsed with
    ``'html.parser'`` instead.
    """
    if nohtml:
        markup = html
    else:
        markup = '<html><body>%s</body></html>' % html

    try:
        tree = BeautifulSoup(markup, 'lxml')
    except FeatureNotFound:
        tree = BeautifulSoup(markup, 'html.parser')
    return tree
|
||||||
|
|||||||
Reference in New Issue
Block a user