#!/usr/bin/env python3
"""Fetch a URL and print its visible text content.

Reads a JSON object from stdin ({"url": "..."}), downloads up to 50 kB of
the response, strips HTML markup (dropping <script>/<style>/<noscript>
contents) when the response is HTML, collapses runs of blank lines, and
prints the result. Any failure is reported as a single
"[fetch error: ...]" line instead of a traceback.
"""
import json
import re
import sys
import urllib.request
from html.parser import HTMLParser

# Tags whose closing marks a visual line break in the extracted text.
_BREAK_TAGS = ("p", "br", "div", "h1", "h2", "h3", "h4", "li", "tr")
# Tags whose contents are never user-visible text.
_INVISIBLE_TAGS = ("script", "style", "noscript")


class TextExtractor(HTMLParser):
    """Accumulate the visible text of an HTML document in ``self.text``."""

    def __init__(self):
        super().__init__()
        self.text = []
        self._skip = False  # True while inside an invisible element

    def handle_starttag(self, tag, attrs):
        if tag in _INVISIBLE_TAGS:
            self._skip = True

    def handle_endtag(self, tag):
        if tag in _INVISIBLE_TAGS:
            self._skip = False
        if tag in _BREAK_TAGS:
            self.text.append("\n")

    def handle_data(self, data):
        if not self._skip:
            self.text.append(data)


def main():
    """Read the request from stdin, fetch the page, and print its text."""
    args = json.loads(sys.stdin.read())
    url = args.get("url", "")
    try:
        # This tool fetches untrusted URLs: refuse non-HTTP schemes, since
        # e.g. file:// would expose the local filesystem to the caller.
        if not url.lower().startswith(("http://", "https://")):
            raise ValueError(f"unsupported URL scheme in {url!r}")
        req = urllib.request.Request(url, headers={"User-Agent": "fireclaw-agent"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            content_type = resp.headers.get("Content-Type", "")
            # Honor the charset declared by the server instead of assuming
            # UTF-8; fall back to UTF-8 when none is declared.
            charset = resp.headers.get_content_charset() or "utf-8"
            raw = resp.read(50_000).decode(charset, errors="replace")
        if "html" in content_type:
            parser = TextExtractor()
            parser.feed(raw)
            parser.close()  # flush text buffered by convert_charrefs
            text = "".join(parser.text)
        else:
            text = raw
        text = re.sub(r"\n{3,}", "\n\n", text).strip()
        print(text or "[empty page]")
    except Exception as e:  # top-level boundary: report, never crash
        print(f"[fetch error: {e}]")


if __name__ == "__main__":
    main()