soup_parser: remove dead gumbo code

This commit is contained in:
Username
2025-12-20 16:46:08 +01:00
parent e7a8ff7df7
commit dc545494b9

View File

@@ -1,36 +1,8 @@
from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound from bs4 import BeautifulSoup, SoupStrainer, FeatureNotFound
import sys
#import gumbo
# Name of the BeautifulSoup tree builder used by soupify_bs4(). It starts as
# 'lxml' and is downgraded to 'html.parser' the first time 'lxml' turns out
# to be unavailable, so later calls do not repeat the failed attempt.
parser = 'lxml'


def soupify_bs4(html, nohtml=False):
    """Parse *html* with BeautifulSoup and return the resulting soup.

    html   -- markup to parse.
    nohtml -- when true, *html* is parsed as given; otherwise it is first
              wrapped in '<html><body>...</body></html>'.

    The module-level ``parser`` selects the tree builder. If BeautifulSoup
    raises FeatureNotFound for it, ``parser`` is rebound to 'html.parser'
    and the parse is retried once with that builder.
    """
    global parser
    htm = html if nohtml else '<html><body>%s</body></html>' % (html)
    try:
        return BeautifulSoup(htm, parser)
    except FeatureNotFound:
        if parser == 'html.parser':
            # Already on the fallback builder: retrying the identical call
            # cannot succeed, so let the error propagate.
            raise
        parser = 'html.parser'
        return BeautifulSoup(htm, parser)
def soupify_gumbo(html, nohtml=False):
    """Parse *html* through ``gumbo.soup_parse`` and return the soup.

    html   -- markup to parse.
    nohtml -- when true, *html* is parsed as given; otherwise it is first
              wrapped in '<html><body>...</body></html>'.

    When the parsed soup has no ``body``, the input and the soup's repr are
    printed between 'AAAA'/'BBBB' markers as a debugging aid. Any exception
    raised while parsing is re-raised after the offending *html* has been
    written to stdout.

    NOTE(review): the ``gumbo`` import at the top of this file is commented
    out, so unless ``gumbo`` is bound elsewhere this function raises
    NameError (after dumping *html* to stdout).
    """
    htm = html if nohtml else '<html><body>%s</body></html>' % (html)
    try:
        soup = gumbo.soup_parse(htm)
        if not soup.body:
            # Single-argument print() calls behave the same on Python 2 and
            # Python 3, unlike the print statement.
            print("AAAA")
            print(html)
            print("BBBB")
            print(repr(soup))
        return soup
    except Exception:
        # Dump the input that failed to parse, then let the error propagate.
        sys.stdout.write(html)
        raise
def soupify(html, nohtml=False):
    """Parse *html* with BeautifulSoup and return the resulting soup.

    html   -- markup to parse.
    nohtml -- when true, *html* is parsed as given; otherwise it is first
              wrapped in '<html><body>...</body></html>'.

    The 'lxml' tree builder is tried first; if BeautifulSoup raises
    FeatureNotFound for it, the markup is parsed with 'html.parser' instead.

    NOTE(review): the two builders may produce different trees for the same
    malformed markup, so results can depend on whether lxml is installed --
    confirm callers tolerate either.
    """
    htm = html if nohtml else '<html><body>%s</body></html>' % (html)
    try:
        return BeautifulSoup(htm, 'lxml')
    except FeatureNotFound:
        return BeautifulSoup(htm, 'html.parser')