# fireclaw/skills/fetch_url/run.py

#!/usr/bin/env python3
import sys
import json
import re
import urllib.request
from html.parser import HTMLParser

# The skill's arguments arrive as one JSON object on stdin; "url" is the
# only key this script reads, and it defaults to an empty string.
args = json.load(sys.stdin)
url = args.get("url", "")


class TextExtractor(HTMLParser):
    """Collect the visible text of an HTML document.

    Fragments are appended to ``self.text`` in document order; callers
    obtain the result with ``"".join(parser.text)``.  Everything inside
    ``<script>``, ``<style>`` and ``<noscript>`` is discarded, and a
    newline is emitted at line breaks and at the end of block-level
    elements so that paragraphs do not run together.
    """

    # Elements whose content is never shown to a reader.
    _SKIPPED_TAGS = frozenset(("script", "style", "noscript"))
    # Elements whose closing tag ends a visual line.
    _BLOCK_TAGS = frozenset(
        ("p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr")
    )

    def __init__(self):
        super().__init__()
        self.text = []  # collected text fragments, in document order
        self._skip = False  # True while inside a skipped element

    def handle_starttag(self, tag, attrs):
        """Start suppressing text on skipped tags; emit a newline for <br>."""
        if tag in self._SKIPPED_TAGS:
            self._skip = True
        elif tag == "br" and not self._skip:
            # <br> is a void element: a plain "<br>" never produces an
            # end-tag event, so the line break has to be emitted here.
            # "<br/>" reaches this method too (via handle_startendtag), and
            # "br" is deliberately absent from _BLOCK_TAGS so it is not
            # counted twice.
            self.text.append("\n")

    def handle_endtag(self, tag):
        """Stop suppressing on skipped tags; end the line on block tags."""
        if tag in self._SKIPPED_TAGS:
            self._skip = False
        elif tag in self._BLOCK_TAGS and not self._skip:
            # No newline while skipping, so hidden markup (e.g. inside
            # <noscript>) leaves no stray blank lines in the output.
            self.text.append("\n")

    def handle_data(self, data):
        """Record a text node unless it lies inside a skipped element."""
        if not self._skip:
            self.text.append(data)


# Fetch `url`, reduce the response to plain text and print it (at most 3000
# characters plus a truncation marker).  Any failure is printed as
# "[fetch error: ...]" instead of propagating as a traceback.
try:
    # urllib.request also understands file:// and ftp:// URLs.  The URL comes
    # from the caller's JSON input, so only plain web fetches are allowed;
    # otherwise this skill could be used to read local files.
    if not url.strip().lower().startswith(("http://", "https://")):
        raise ValueError("only http:// and https:// URLs are supported")

    req = urllib.request.Request(url, headers={"User-Agent": "fireclaw-agent"})
    with urllib.request.urlopen(req, timeout=15) as resp:
        content_type = resp.headers.get("Content-Type", "")
        # Honour the charset declared by the server; fall back to UTF-8.
        charset = resp.headers.get_content_charset() or "utf-8"
        body = resp.read(50_000)  # read at most 50 000 bytes of the body

    try:
        raw = body.decode(charset, errors="replace")
    except LookupError:
        # The server declared a codec Python does not know.
        raw = body.decode("utf-8", errors="replace")

    # Media types are case-insensitive, so compare in lower case.
    if "html" in content_type.lower():
        parser = TextExtractor()
        parser.feed(raw)
        text = "".join(parser.text)
    else:
        text = raw

    # Collapse runs of three or more newlines into a single blank line.
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    if len(text) > 3000:
        text = text[:3000] + "\n[truncated]"
    print(text or "[empty page]")
except Exception as e:
    # Top-level boundary of the script: report every failure as text.
    print(f"[fetch error: {e}]")