diff --git a/docs/CHEATSHEET.md b/docs/CHEATSHEET.md index 7c75653..54549bc 100644 --- a/docs/CHEATSHEET.md +++ b/docs/CHEATSHEET.md @@ -345,11 +345,12 @@ No API credentials needed (uses public GQL endpoint). !alert history [n] # Show recent results (default 5) ``` -Searches keywords across YouTube (yt), Twitch (tw), SearXNG (sx), Reddit (rd), -and Mastodon/Fediverse (ft) simultaneously. Names: lowercase alphanumeric + hyphens, -1-20 chars. Keywords: 1-100 chars. Max 20 alerts/channel. Polls every 5min. -Format: `[name/yt] Title -- URL`, `[name/rd] Title -- URL`, etc. -No API credentials needed. Persists across restarts. History stored in `data/alert_history.db`. +Searches keywords across 7 backends: YouTube (yt), Twitch (tw), SearXNG (sx), +Reddit (rd), Mastodon (ft), DuckDuckGo (dg), Google News (gn). Names: lowercase +alphanumeric + hyphens, 1-20 chars. Keywords: 1-100 chars. Max 20 alerts/channel. +Polls every 5min. Format: `[name/yt] Title -- URL`, etc. No API credentials needed. +DuckDuckGo and Google News route through SOCKS5 proxy. Persists across restarts. +History stored in `data/alert_history.db`. 
## SearX diff --git a/docs/USAGE.md b/docs/USAGE.md index 9c8aa79..ee70fb1 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -687,13 +687,15 @@ Platforms searched: - **SearXNG** (`sx`) -- Local SearXNG instance, searches general/news/videos/social media categories filtered to last 24h (no auth required) - **Reddit** (`rd`) -- JSON search API, sorted by new, past week (no auth required) - **Mastodon** (`ft`) -- Public hashtag timeline across 4 instances (no auth required) +- **DuckDuckGo** (`dg`) -- HTML lite search endpoint via SOCKS5 proxy (no auth required) +- **Google News** (`gn`) -- Public RSS feed via SOCKS5 proxy (no auth required) Polling and announcements: - Alerts are polled every 5 minutes by default - On `add`, existing results are recorded without announcing (prevents flood) -- New results announced as `[name/yt] Title -- URL`, `[name/tw] Title -- URL`, - `[name/sx] Title -- URL`, `[name/rd] Title -- URL`, or `[name/ft] Title -- URL` +- New results announced as `[name/<tag>] Title -- URL` where tag is `yt`, `tw`, + `sx`, `rd`, `ft`, `dg`, or `gn` - Titles are truncated to 80 characters - Each platform maintains its own seen list (capped at 200 per platform) - 5 consecutive errors doubles the poll interval (max 1 hour) diff --git a/plugins/alert.py b/plugins/alert.py index b134776..4140d59 100644 --- a/plugins/alert.py +++ b/plugins/alert.py @@ -40,6 +40,8 @@ _MASTODON_INSTANCES = [ "infosec.exchange", ] _MASTODON_TAG_TIMEOUT = 4 +_DDG_URL = "https://html.duckduckgo.com/html/" +_GOOGLE_NEWS_RSS = "https://news.google.com/rss/search" # -- Module-level tracking --------------------------------------------------- @@ -155,6 +157,37 @@ _OG_TIMEOUT = 10 _OG_MAX_BYTES = 64 * 1024 # Only read first 64 KB (OG tags are in <head>) +class _DDGParser(HTMLParser): + """Extract search results from DuckDuckGo HTML lite page.""" + + def __init__(self): + super().__init__() + self.results: list[tuple[str, str]] = [] # (url, title) + self._in_link = False + self._url = "" + 
self._title_parts: list[str] = [] + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag != "a": + return + attr_map = dict(attrs) + if "result__a" in (attr_map.get("class") or ""): + self._in_link = True + self._url = attr_map.get("href", "") + self._title_parts = [] + + def handle_data(self, data: str) -> None: + if self._in_link: + self._title_parts.append(data) + + def handle_endtag(self, tag: str) -> None: + if tag == "a" and self._in_link: + self._in_link = False + title = "".join(self._title_parts).strip() + if self._url and title: + self.results.append((self._url, title)) + + def _parse_date(raw: str) -> str: """Try to extract a YYYY-MM-DD date from a raw date string.""" m = re.search(r"\d{4}-\d{2}-\d{2}", raw) @@ -490,6 +523,104 @@ def _search_mastodon(keyword: str) -> list[dict]: return results +# -- DuckDuckGo search (blocking) ------------------------------------------- + +def _resolve_ddg_url(raw_url: str) -> str: + """Resolve DuckDuckGo redirect URLs to actual target URLs.""" + import urllib.parse + + if "duckduckgo.com/l/" in raw_url: + parsed = urllib.parse.urlparse(raw_url) + params = urllib.parse.parse_qs(parsed.query) + uddg = params.get("uddg", []) + if uddg: + return uddg[0] + # Strip leading // scheme-relative URLs + if raw_url.startswith("//"): + return "https:" + raw_url + return raw_url + + +def _search_duckduckgo(keyword: str) -> list[dict]: + """Search DuckDuckGo via HTML lite endpoint. 
Blocking.""" + import urllib.parse + + body = urllib.parse.urlencode({"q": keyword}).encode() + + req = urllib.request.Request(_DDG_URL, data=body, method="POST") + req.add_header("Content-Type", "application/x-www-form-urlencoded") + req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)") + + resp = _urlopen(req, timeout=_FETCH_TIMEOUT) + raw = resp.read() + resp.close() + + html = raw.decode("utf-8", errors="replace") + parser = _DDGParser() + parser.feed(html) + + results: list[dict] = [] + seen_urls: set[str] = set() + for raw_url, title in parser.results: + url = _resolve_ddg_url(raw_url) + if not url or url in seen_urls: + continue + seen_urls.add(url) + results.append({ + "id": url, + "title": title, + "url": url, + "date": "", + "extra": "", + }) + return results + + +# -- Google News search (blocking) ------------------------------------------ + +def _search_google_news(keyword: str) -> list[dict]: + """Search Google News via public RSS feed. Blocking.""" + import urllib.parse + import xml.etree.ElementTree as ET + from email.utils import parsedate_to_datetime + + params = urllib.parse.urlencode({ + "q": keyword, "hl": "en", "gl": "US", "ceid": "US:en", + }) + url = f"{_GOOGLE_NEWS_RSS}?{params}" + + req = urllib.request.Request(url, method="GET") + req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)") + + resp = _urlopen(req, timeout=_FETCH_TIMEOUT) + raw = resp.read() + resp.close() + + root = ET.fromstring(raw) + results: list[dict] = [] + for item in root.iter("item"): + title = (item.findtext("title") or "").strip() + link = (item.findtext("link") or "").strip() + if not link: + continue + pub_date = item.findtext("pubDate") or "" + date = "" + if pub_date: + try: + dt = parsedate_to_datetime(pub_date) + date = dt.strftime("%Y-%m-%d") + except (ValueError, TypeError): + date = _parse_date(pub_date) + results.append({ + "id": link, + "title": title, + "url": link, + "date": date, + "extra": "", + }) + return results + + # -- 
Backend registry ------------------------------------------------------- _BACKENDS: dict[str, callable] = { @@ -498,6 +629,8 @@ _BACKENDS: dict[str, callable] = { "sx": _search_searx, "rd": _search_reddit, "ft": _search_mastodon, + "dg": _search_duckduckgo, + "gn": _search_google_news, }