Files
derp/plugins/archive.py
user e3bb793574 feat: add canary, tcping, archive, resolve plugins
canary: generate realistic fake credentials (token/aws/basic) for
planting as canary tripwires. Per-channel state persistence.

tcping: TCP connect latency probe through SOCKS5 proxy with
min/avg/max reporting. Proxy-compatible alternative to traceroute.

archive: save URLs to Wayback Machine via Save Page Now API,
routed through SOCKS5 proxy.

resolve: bulk DNS resolution (up to 10 hosts) via TCP DNS through
SOCKS5 proxy with concurrent asyncio.gather.

83 new tests (1010 total), docs updated.
2026-02-20 19:38:10 +01:00

106 lines
3.4 KiB
Python

"""Plugin: Wayback Machine Save Page Now (SOCKS5-proxied)."""
from __future__ import annotations
import asyncio
import logging
import urllib.error
import urllib.request
from derp.http import urlopen as _urlopen
from derp.plugin import command
log = logging.getLogger(__name__)
_SAVE_URL = "https://web.archive.org/save/"
_TIMEOUT = 30
_USER_AGENT = "derp/1.0"
def _save_page(url: str) -> dict:
    """Blocking GET to the Wayback Machine Save Page Now endpoint.

    Submits *url* to https://web.archive.org/save/ and extracts the
    archived snapshot URL, trying in order: the Content-Location
    header, the post-redirect final URL, and the Link header.

    Args:
        url: Absolute http(s) URL to archive.

    Returns:
        ``{"url": <archived url>}`` on success, or
        ``{"error": <short message>}`` on failure. Never raises.
    """
    target = f"{_SAVE_URL}{url}"
    req = urllib.request.Request(
        target,
        headers={"User-Agent": _USER_AGENT},
    )
    try:
        resp = _urlopen(req, timeout=_TIMEOUT)
        try:
            # The save endpoint redirects to the archived page. With
            # urllib3 pooled requests, redirects are followed
            # automatically, so the final URL may already be the snapshot.
            final_url = getattr(resp, "geturl", lambda: None)()
            headers = resp.headers if hasattr(resp, "headers") else {}
            # Guard on .get in case the response type exposes headers
            # without a mapping interface.
            if hasattr(headers, "get"):
                content_location = headers.get("Content-Location", "")
                link = headers.get("Link", "")
            else:
                content_location = ""
                link = ""
            # Drain the body so a pooled connection can be reused.
            resp.read()
        finally:
            # Always release the response — fixes a connection leak when
            # header access or read() raises. close() is guarded because
            # the response type comes from _urlopen and may vary.
            close = getattr(resp, "close", None)
            if callable(close):
                close()
        # Content-Location is the most reliable pointer to the snapshot.
        if content_location and "/web/" in content_location:
            if content_location.startswith("/"):
                return {"url": f"https://web.archive.org{content_location}"}
            return {"url": content_location}
        # Next best: the URL we ended up at after redirects.
        if final_url and "/web/" in final_url:
            return {"url": final_url}
        # Last resort: parse the Link header, e.g. <url>; rel="memento".
        if link and "/web/" in link:
            for part in link.split(","):
                part = part.strip()
                if "/web/" in part and "<" in part:
                    extracted = part.split("<", 1)[1].split(">", 1)[0]
                    return {"url": extracted}
        # Got a 200 but found no snapshot URL: report success with a
        # calendar-view link instead.
        return {"url": f"https://web.archive.org/web/*/{url}"}
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            return {"error": "rate limited -- try again later"}
        if exc.code == 523:
            return {"error": "origin unreachable"}
        return {"error": f"HTTP {exc.code}"}
    except (TimeoutError, OSError) as exc:
        return {"error": f"timeout: {exc}"}
    except Exception as exc:
        # Catch-all boundary: this helper must return a dict, not raise.
        return {"error": str(exc)[:100]}
@command("archive", help="Save to Wayback Machine: !archive <url>")
async def cmd_archive(bot, message):
    """Handle !archive: submit a URL to the Wayback Machine.

    The blocking Save Page Now request runs in the default executor so
    the event loop stays responsive while the archive completes.

    Usage:
        !archive https://example.com/page
    """
    tokens = message.text.split(None, 1)
    if len(tokens) != 2:
        await bot.reply(message, "Usage: !archive <url>")
        return
    target = tokens[1].strip()
    if not target.startswith(("http://", "https://")):
        await bot.reply(message, "URL must start with http:// or https://")
        return
    await bot.reply(message, f"Archiving {target}...")
    outcome = await asyncio.get_running_loop().run_in_executor(
        None, _save_page, target
    )
    reply = (
        f"Archive failed: {outcome['error']}"
        if "error" in outcome
        else f"Archived: {outcome['url']}"
    )
    await bot.reply(message, reply)