feat: add DuckDuckGo and Google News backends to alert plugin
DuckDuckGo (dg) searches via HTML lite endpoint with HTMLParser, resolves DDG redirect URLs to actual targets. Google News (gn) queries public RSS feed, parses RFC 822 dates. Both routed through SOCKS5 proxy via _urlopen. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
133
plugins/alert.py
133
plugins/alert.py
@@ -40,6 +40,8 @@ _MASTODON_INSTANCES = [
|
||||
"infosec.exchange",
|
||||
]
|
||||
_MASTODON_TAG_TIMEOUT = 4
|
||||
_DDG_URL = "https://html.duckduckgo.com/html/"
|
||||
_GOOGLE_NEWS_RSS = "https://news.google.com/rss/search"
|
||||
|
||||
# -- Module-level tracking ---------------------------------------------------
|
||||
|
||||
@@ -155,6 +157,37 @@ _OG_TIMEOUT = 10
|
||||
_OG_MAX_BYTES = 64 * 1024 # Only read first 64 KB (OG tags are in <head>)
|
||||
|
||||
|
||||
class _DDGParser(HTMLParser):
|
||||
"""Extract search results from DuckDuckGo HTML lite page."""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.results: list[tuple[str, str]] = [] # (url, title)
|
||||
self._in_link = False
|
||||
self._url = ""
|
||||
self._title_parts: list[str] = []
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||||
if tag != "a":
|
||||
return
|
||||
attr_map = dict(attrs)
|
||||
if "result__a" in (attr_map.get("class") or ""):
|
||||
self._in_link = True
|
||||
self._url = attr_map.get("href", "")
|
||||
self._title_parts = []
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
if self._in_link:
|
||||
self._title_parts.append(data)
|
||||
|
||||
def handle_endtag(self, tag: str) -> None:
|
||||
if tag == "a" and self._in_link:
|
||||
self._in_link = False
|
||||
title = "".join(self._title_parts).strip()
|
||||
if self._url and title:
|
||||
self.results.append((self._url, title))
|
||||
|
||||
|
||||
def _parse_date(raw: str) -> str:
|
||||
"""Try to extract a YYYY-MM-DD date from a raw date string."""
|
||||
m = re.search(r"\d{4}-\d{2}-\d{2}", raw)
|
||||
@@ -490,6 +523,104 @@ def _search_mastodon(keyword: str) -> list[dict]:
|
||||
return results
|
||||
|
||||
|
||||
# -- DuckDuckGo search (blocking) -------------------------------------------
|
||||
|
||||
def _resolve_ddg_url(raw_url: str) -> str:
|
||||
"""Resolve DuckDuckGo redirect URLs to actual target URLs."""
|
||||
import urllib.parse
|
||||
|
||||
if "duckduckgo.com/l/" in raw_url:
|
||||
parsed = urllib.parse.urlparse(raw_url)
|
||||
params = urllib.parse.parse_qs(parsed.query)
|
||||
uddg = params.get("uddg", [])
|
||||
if uddg:
|
||||
return uddg[0]
|
||||
# Strip leading // scheme-relative URLs
|
||||
if raw_url.startswith("//"):
|
||||
return "https:" + raw_url
|
||||
return raw_url
|
||||
|
||||
|
||||
def _search_duckduckgo(keyword: str) -> list[dict]:
    """Search DuckDuckGo via HTML lite endpoint. Blocking.

    Returns a list of result dicts with keys: id, title, url, date, extra.
    The HTML lite page carries no timestamps, so "date" is always empty.
    """
    import urllib.parse

    body = urllib.parse.urlencode({"q": keyword}).encode()

    req = urllib.request.Request(_DDG_URL, data=body, method="POST")
    req.add_header("Content-Type", "application/x-www-form-urlencoded")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")

    # try/finally: don't leak the connection if read() raises mid-transfer.
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    try:
        raw = resp.read()
    finally:
        resp.close()

    html = raw.decode("utf-8", errors="replace")
    parser = _DDGParser()
    parser.feed(html)

    results: list[dict] = []
    seen_urls: set[str] = set()  # de-duplicate after redirect resolution
    for raw_url, title in parser.results:
        url = _resolve_ddg_url(raw_url)
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        results.append({
            "id": url,  # resolved URL doubles as the stable dedup id
            "title": title,
            "url": url,
            "date": "",
            "extra": "",
        })
    return results
|
||||
|
||||
|
||||
# -- Google News search (blocking) ------------------------------------------
|
||||
|
||||
def _search_google_news(keyword: str) -> list[dict]:
    """Search Google News via public RSS feed. Blocking.

    Returns a list of result dicts with keys: id, title, url, date, extra.
    "date" is the item's RFC 822 pubDate normalised to YYYY-MM-DD, or ""
    when the feed gives no usable date.
    """
    import urllib.parse
    import xml.etree.ElementTree as ET
    from email.utils import parsedate_to_datetime

    params = urllib.parse.urlencode({
        "q": keyword, "hl": "en", "gl": "US", "ceid": "US:en",
    })
    url = f"{_GOOGLE_NEWS_RSS}?{params}"

    req = urllib.request.Request(url, method="GET")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")

    # try/finally: don't leak the connection if read() raises mid-transfer.
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    try:
        raw = resp.read()
    finally:
        resp.close()

    root = ET.fromstring(raw)
    results: list[dict] = []
    for item in root.iter("item"):
        title = (item.findtext("title") or "").strip()
        link = (item.findtext("link") or "").strip()
        if not link:
            continue
        pub_date = item.findtext("pubDate") or ""
        date = ""
        if pub_date:
            try:
                dt = parsedate_to_datetime(pub_date)
                date = dt.strftime("%Y-%m-%d")
            except (ValueError, TypeError):
                # Malformed RFC 822 date: fall back to a plain regex scan.
                date = _parse_date(pub_date)
        results.append({
            "id": link,  # article URL doubles as the stable dedup id
            "title": title,
            "url": link,
            "date": date,
            "extra": "",
        })
    return results
|
||||
|
||||
|
||||
# -- Backend registry -------------------------------------------------------
|
||||
|
||||
_BACKENDS: dict[str, callable] = {
|
||||
@@ -498,6 +629,8 @@ _BACKENDS: dict[str, callable] = {
|
||||
"sx": _search_searx,
|
||||
"rd": _search_reddit,
|
||||
"ft": _search_mastodon,
|
||||
"dg": _search_duckduckgo,
|
||||
"gn": _search_google_news,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user