52 lines
1.4 KiB
Python
52 lines
1.4 KiB
Python
#!/usr/bin/env python3
|
|
import sys
|
|
import json
|
|
import re
|
|
import urllib.request
|
|
from html.parser import HTMLParser
|
|
|
|
# The caller passes the tool arguments as a single JSON object on stdin;
# the "url" key names the page to fetch (empty string when absent).
args = json.load(sys.stdin)
url = args.get("url", "")
|
|
|
|
|
|
class TextExtractor(HTMLParser):
    """Collect the reader-visible text of an HTML document.

    Feed markup with ``feed()`` (and ``close()`` to flush buffered input);
    the extracted fragments accumulate in ``self.text`` and are meant to be
    joined with ``"".join(...)``.

    * Text inside ``<script>``, ``<style>`` and ``<noscript>`` is dropped.
    * A newline is emitted when a block-level element closes and wherever a
      ``<br>`` appears.
    """

    # Elements whose contents are never shown to the reader.
    _HIDDEN_TAGS = frozenset({"script", "style", "noscript"})
    # Elements whose closing tag ends a visual line.
    _BLOCK_TAGS = frozenset({"p", "div", "h1", "h2", "h3", "h4", "li", "tr"})

    def __init__(self):
        super().__init__()
        # Extracted text fragments, in document order.
        self.text = []
        # Number of hidden elements currently open. A counter (rather than a
        # flag) keeps text suppressed until the *outermost* hidden element
        # closes, e.g. for <noscript><style>...</style>still hidden</noscript>.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        """Track entry into hidden elements and turn ``<br>`` into a newline."""
        if tag in self._HIDDEN_TAGS:
            self._skip_depth += 1
        elif tag == "br":
            # <br> is a void element: a plain "<br>" never triggers
            # handle_endtag, so the line break has to be emitted here.
            # "<br/>" also arrives here via handle_startendtag.
            self.text.append("\n")

    def handle_endtag(self, tag):
        """Track exit from hidden elements and end the line after blocks."""
        if tag in self._HIDDEN_TAGS:
            # Clamp at zero so a stray closing tag cannot unbalance the count.
            self._skip_depth = max(0, self._skip_depth - 1)
        if tag in self._BLOCK_TAGS:
            self.text.append("\n")

    def handle_data(self, data):
        """Keep character data unless it sits inside a hidden element."""
        if not self._skip_depth:
            self.text.append(data)
|
|
|
|
|
|
MAX_FETCH_BYTES = 50_000  # upper bound on how much of the body is downloaded
MAX_OUTPUT_CHARS = 3000  # upper bound on how much text is printed


def decode_body(body, charset):
    """Decode response bytes to ``str``.

    Uses *charset* when given, otherwise UTF-8. Undecodable bytes become
    U+FFFD instead of raising (the body may have been cut mid-character by
    the download cap), and an unknown or malformed charset label falls back
    to UTF-8.
    """
    try:
        return body.decode(charset or "utf-8", errors="replace")
    except (LookupError, ValueError):
        return body.decode("utf-8", errors="replace")


def clean_text(text):
    """Collapse runs of 3+ newlines to 2, strip, and cap the length.

    Text longer than ``MAX_OUTPUT_CHARS`` is cut there and suffixed with a
    ``[truncated]`` marker line.
    """
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    if len(text) > MAX_OUTPUT_CHARS:
        text = text[:MAX_OUTPUT_CHARS] + "\n[truncated]"
    return text


def fetch_text(url):
    """Download *url* and return its content as cleaned-up plain text.

    HTML responses are reduced to their visible text with ``TextExtractor``;
    any other content type is returned as decoded text. The result may be an
    empty string.

    Raises:
        ValueError: if *url* is not an ``http://`` or ``https://`` URL.
        Exception: whatever ``urllib`` raises on network/HTTP failure.
    """
    # The URL comes from the caller's JSON input. urlopen() would also accept
    # schemes such as file: and ftp:, which would expose local files, so only
    # web URLs are let through.
    if not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
        raise ValueError(f"unsupported URL (only http/https allowed): {url!r}")

    req = urllib.request.Request(url, headers={"User-Agent": "fireclaw-agent"})
    with urllib.request.urlopen(req, timeout=15) as resp:
        content_type = resp.headers.get("Content-Type", "")
        # Honour the charset declared by the server instead of assuming UTF-8.
        charset = resp.headers.get_content_charset()
        body = resp.read(MAX_FETCH_BYTES)

    raw = decode_body(body, charset)

    # Media types are case-insensitive ("text/HTML" is valid).
    if "html" in content_type.lower():
        parser = TextExtractor()
        parser.feed(raw)
        # Flush anything the parser is still buffering, e.g. markup that was
        # cut in half by the download cap.
        parser.close()
        text = "".join(parser.text)
    else:
        text = raw

    return clean_text(text)


# Script boundary: every failure is reported on stdout in a fixed format
# rather than as a traceback.
try:
    print(fetch_text(url) or "[empty page]")
except Exception as e:
    print(f"[fetch error: {e}]")
|