Files
derp/plugins/archive.py
user e3bb793574 feat: add canary, tcping, archive, resolve plugins
canary: generate realistic fake credentials (token/aws/basic) for
planting as canary tripwires. Per-channel state persistence.

tcping: TCP connect latency probe through SOCKS5 proxy with
min/avg/max reporting. Proxy-compatible alternative to traceroute.

archive: save URLs to Wayback Machine via Save Page Now API,
routed through SOCKS5 proxy.

resolve: bulk DNS resolution (up to 10 hosts) via TCP DNS through
SOCKS5 proxy with concurrent asyncio.gather.

83 new tests (1010 total), docs updated.
2026-02-20 19:38:10 +01:00

106 lines
3.4 KiB
Python

"""Plugin: Wayback Machine Save Page Now (SOCKS5-proxied)."""
from __future__ import annotations
import asyncio
import logging
import urllib.error
import urllib.request
from derp.http import urlopen as _urlopen
from derp.plugin import command
log = logging.getLogger(__name__)
_SAVE_URL = "https://web.archive.org/save/"
_TIMEOUT = 30
_USER_AGENT = "derp/1.0"
def _save_page(url: str) -> dict:
    """Blocking GET to the Wayback Machine Save Page Now endpoint.

    Submits *url* to https://web.archive.org/save/ and extracts the
    archived snapshot URL, trying in order: the Content-Location
    header, the post-redirect final URL, and the Link header.

    Args:
        url: Absolute http(s) URL to archive.

    Returns:
        ``{"url": <archived url>}`` on success, or
        ``{"error": <short message>}`` on failure. Never raises.
    """
    target = f"{_SAVE_URL}{url}"
    req = urllib.request.Request(
        target,
        headers={"User-Agent": _USER_AGENT},
    )
    try:
        resp = _urlopen(req, timeout=_TIMEOUT)
        try:
            # The save endpoint redirects to the archived page. With
            # urllib3 pooled requests, redirects are followed
            # automatically, so the final URL may already be the snapshot.
            final_url = getattr(resp, "geturl", lambda: None)()
            headers = resp.headers if hasattr(resp, "headers") else {}
            # Guard on .get in case the response type exposes headers
            # without a mapping interface.
            if hasattr(headers, "get"):
                content_location = headers.get("Content-Location", "")
                link = headers.get("Link", "")
            else:
                content_location = ""
                link = ""
            # Drain the body so a pooled connection can be reused.
            resp.read()
        finally:
            # Always release the response — fixes a connection leak when
            # header access or read() raises. close() is guarded because
            # the response type comes from _urlopen and may vary.
            close = getattr(resp, "close", None)
            if callable(close):
                close()
        # Content-Location is the most reliable pointer to the snapshot.
        if content_location and "/web/" in content_location:
            if content_location.startswith("/"):
                return {"url": f"https://web.archive.org{content_location}"}
            return {"url": content_location}
        # Next best: the URL we ended up at after redirects.
        if final_url and "/web/" in final_url:
            return {"url": final_url}
        # Last resort: parse the Link header, e.g. <url>; rel="memento".
        if link and "/web/" in link:
            for part in link.split(","):
                part = part.strip()
                if "/web/" in part and "<" in part:
                    extracted = part.split("<", 1)[1].split(">", 1)[0]
                    return {"url": extracted}
        # Got a 200 but found no snapshot URL: report success with a
        # calendar-view link instead.
        return {"url": f"https://web.archive.org/web/*/{url}"}
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            return {"error": "rate limited -- try again later"}
        if exc.code == 523:
            return {"error": "origin unreachable"}
        return {"error": f"HTTP {exc.code}"}
    except (TimeoutError, OSError) as exc:
        return {"error": f"timeout: {exc}"}
    except Exception as exc:
        # Catch-all boundary: this helper must return a dict, not raise.
        return {"error": str(exc)[:100]}
@command("archive", help="Save to Wayback Machine: !archive <url>")
async def cmd_archive(bot, message):
    """Handle !archive: submit a URL to the Wayback Machine.

    The blocking Save Page Now request runs in the default executor so
    the event loop stays responsive while the archive completes.

    Usage:
        !archive https://example.com/page
    """
    tokens = message.text.split(None, 1)
    if len(tokens) != 2:
        await bot.reply(message, "Usage: !archive <url>")
        return
    target = tokens[1].strip()
    if not target.startswith(("http://", "https://")):
        await bot.reply(message, "URL must start with http:// or https://")
        return
    await bot.reply(message, f"Archiving {target}...")
    outcome = await asyncio.get_running_loop().run_in_executor(
        None, _save_page, target
    )
    reply = (
        f"Archive failed: {outcome['error']}"
        if "error" in outcome
        else f"Archived: {outcome['url']}"
    )
    await bot.reply(message, reply)