feat: add DuckDuckGo and Google News backends to alert plugin

DuckDuckGo (dg) searches via HTML lite endpoint with HTMLParser,
resolves DDG redirect URLs to actual targets. Google News (gn)
queries public RSS feed, parses RFC 822 dates. Both routed through
SOCKS5 proxy via _urlopen.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-15 22:51:52 +01:00
parent e70c22a510
commit 80677343bf
3 changed files with 143 additions and 7 deletions

View File

@@ -345,11 +345,12 @@ No API credentials needed (uses public GQL endpoint).
!alert history <name> [n] # Show recent results (default 5) !alert history <name> [n] # Show recent results (default 5)
``` ```
Searches keywords across YouTube (yt), Twitch (tw), SearXNG (sx), Reddit (rd), Searches keywords across 7 backends: YouTube (yt), Twitch (tw), SearXNG (sx),
and Mastodon/Fediverse (ft) simultaneously. Names: lowercase alphanumeric + hyphens, Reddit (rd), Mastodon (ft), DuckDuckGo (dg), Google News (gn). Names: lowercase
1-20 chars. Keywords: 1-100 chars. Max 20 alerts/channel. Polls every 5min. alphanumeric + hyphens, 1-20 chars. Keywords: 1-100 chars. Max 20 alerts/channel.
Format: `[name/yt] Title -- URL`, `[name/rd] Title -- URL`, etc. Polls every 5min. Format: `[name/yt] Title -- URL`, etc. No API credentials needed.
No API credentials needed. Persists across restarts. History stored in `data/alert_history.db`. DuckDuckGo and Google News route through SOCKS5 proxy. Persists across restarts.
History stored in `data/alert_history.db`.
## SearX ## SearX

View File

@@ -687,13 +687,15 @@ Platforms searched:
- **SearXNG** (`sx`) -- Local SearXNG instance, searches general/news/videos/social media categories filtered to last 24h (no auth required) - **SearXNG** (`sx`) -- Local SearXNG instance, searches general/news/videos/social media categories filtered to last 24h (no auth required)
- **Reddit** (`rd`) -- JSON search API, sorted by new, past week (no auth required) - **Reddit** (`rd`) -- JSON search API, sorted by new, past week (no auth required)
- **Mastodon** (`ft`) -- Public hashtag timeline across 4 instances (no auth required) - **Mastodon** (`ft`) -- Public hashtag timeline across 4 instances (no auth required)
- **DuckDuckGo** (`dg`) -- HTML lite search endpoint via SOCKS5 proxy (no auth required)
- **Google News** (`gn`) -- Public RSS feed via SOCKS5 proxy (no auth required)
Polling and announcements: Polling and announcements:
- Alerts are polled every 5 minutes by default - Alerts are polled every 5 minutes by default
- On `add`, existing results are recorded without announcing (prevents flood) - On `add`, existing results are recorded without announcing (prevents flood)
- New results announced as `[name/yt] Title -- URL`, `[name/tw] Title -- URL`, - New results announced as `[name/<tag>] Title -- URL` where tag is `yt`, `tw`,
`[name/sx] Title -- URL`, `[name/rd] Title -- URL`, or `[name/ft] Title -- URL` `sx`, `rd`, `ft`, `dg`, or `gn`
- Titles are truncated to 80 characters - Titles are truncated to 80 characters
- Each platform maintains its own seen list (capped at 200 per platform) - Each platform maintains its own seen list (capped at 200 per platform)
- 5 consecutive errors doubles the poll interval (max 1 hour) - 5 consecutive errors doubles the poll interval (max 1 hour)

View File

@@ -40,6 +40,8 @@ _MASTODON_INSTANCES = [
"infosec.exchange", "infosec.exchange",
] ]
_MASTODON_TAG_TIMEOUT = 4 _MASTODON_TAG_TIMEOUT = 4
_DDG_URL = "https://html.duckduckgo.com/html/"
_GOOGLE_NEWS_RSS = "https://news.google.com/rss/search"
# -- Module-level tracking --------------------------------------------------- # -- Module-level tracking ---------------------------------------------------
@@ -155,6 +157,37 @@ _OG_TIMEOUT = 10
_OG_MAX_BYTES = 64 * 1024 # Only read first 64 KB (OG tags are in <head>) _OG_MAX_BYTES = 64 * 1024 # Only read first 64 KB (OG tags are in <head>)
class _DDGParser(HTMLParser):
"""Extract search results from DuckDuckGo HTML lite page."""
def __init__(self):
super().__init__()
self.results: list[tuple[str, str]] = [] # (url, title)
self._in_link = False
self._url = ""
self._title_parts: list[str] = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag != "a":
return
attr_map = dict(attrs)
if "result__a" in (attr_map.get("class") or ""):
self._in_link = True
self._url = attr_map.get("href", "")
self._title_parts = []
def handle_data(self, data: str) -> None:
if self._in_link:
self._title_parts.append(data)
def handle_endtag(self, tag: str) -> None:
if tag == "a" and self._in_link:
self._in_link = False
title = "".join(self._title_parts).strip()
if self._url and title:
self.results.append((self._url, title))
def _parse_date(raw: str) -> str: def _parse_date(raw: str) -> str:
"""Try to extract a YYYY-MM-DD date from a raw date string.""" """Try to extract a YYYY-MM-DD date from a raw date string."""
m = re.search(r"\d{4}-\d{2}-\d{2}", raw) m = re.search(r"\d{4}-\d{2}-\d{2}", raw)
@@ -490,6 +523,104 @@ def _search_mastodon(keyword: str) -> list[dict]:
return results return results
# -- DuckDuckGo search (blocking) -------------------------------------------
def _resolve_ddg_url(raw_url: str) -> str:
"""Resolve DuckDuckGo redirect URLs to actual target URLs."""
import urllib.parse
if "duckduckgo.com/l/" in raw_url:
parsed = urllib.parse.urlparse(raw_url)
params = urllib.parse.parse_qs(parsed.query)
uddg = params.get("uddg", [])
if uddg:
return uddg[0]
# Strip leading // scheme-relative URLs
if raw_url.startswith("//"):
return "https:" + raw_url
return raw_url
def _search_duckduckgo(keyword: str) -> list[dict]:
    """Search DuckDuckGo via the HTML lite endpoint. Blocking.

    POSTs the keyword to the lite endpoint, extracts result anchors with
    _DDGParser, resolves DDG redirect hrefs to their real targets, and
    de-duplicates by final URL (first occurrence wins).

    Returns a list of result dicts with keys: id, title, url, date, extra.
    date/extra are always empty -- the lite page carries no dates.
    """
    import urllib.parse

    body = urllib.parse.urlencode({"q": keyword}).encode()
    req = urllib.request.Request(_DDG_URL, data=body, method="POST")
    req.add_header("Content-Type", "application/x-www-form-urlencoded")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    try:
        raw = resp.read()
    finally:
        # Close even when read() raises, so the socket never leaks.
        resp.close()
    parser = _DDGParser()
    parser.feed(raw.decode("utf-8", errors="replace"))

    results: list[dict] = []
    seen_urls: set[str] = set()
    for raw_url, title in parser.results:
        url = _resolve_ddg_url(raw_url)
        # Skip unresolvable hrefs and duplicate targets.
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        results.append({
            "id": url,
            "title": title,
            "url": url,
            "date": "",
            "extra": "",
        })
    return results
# -- Google News search (blocking) ------------------------------------------
def _search_google_news(keyword: str) -> list[dict]:
    """Search Google News via its public RSS feed. Blocking.

    Fetches the RSS search feed (locale pinned to en-US) and emits one
    result dict per <item> with keys: id, title, url, date, extra.
    pubDate (RFC 822) is normalized to YYYY-MM-DD; on parse failure we
    fall back to _parse_date's regex scan, else date stays empty.
    """
    import urllib.parse
    import xml.etree.ElementTree as ET
    from email.utils import parsedate_to_datetime

    params = urllib.parse.urlencode({
        "q": keyword, "hl": "en", "gl": "US", "ceid": "US:en",
    })
    req = urllib.request.Request(f"{_GOOGLE_NEWS_RSS}?{params}", method="GET")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    try:
        raw = resp.read()
    finally:
        # Close even when read() raises, so the socket never leaks.
        resp.close()

    root = ET.fromstring(raw)
    results: list[dict] = []
    for item in root.iter("item"):
        title = (item.findtext("title") or "").strip()
        link = (item.findtext("link") or "").strip()
        if not link:
            # A result without a URL is unusable; skip it.
            continue
        pub_date = item.findtext("pubDate") or ""
        date = ""
        if pub_date:
            try:
                date = parsedate_to_datetime(pub_date).strftime("%Y-%m-%d")
            except (ValueError, TypeError):
                # Not valid RFC 822 -- try a looser YYYY-MM-DD scan.
                date = _parse_date(pub_date)
        results.append({
            "id": link,
            "title": title,
            "url": link,
            "date": date,
            "extra": "",
        })
    return results
# -- Backend registry ------------------------------------------------------- # -- Backend registry -------------------------------------------------------
_BACKENDS: dict[str, callable] = { _BACKENDS: dict[str, callable] = {
@@ -498,6 +629,8 @@ _BACKENDS: dict[str, callable] = {
"sx": _search_searx, "sx": _search_searx,
"rd": _search_reddit, "rd": _search_reddit,
"ft": _search_mastodon, "ft": _search_mastodon,
"dg": _search_duckduckgo,
"gn": _search_google_news,
} }