feat: add 11 alert backends and fix PyPI/DEV.to search

Add Wikipedia, Stack Exchange, GitLab, npm, PyPI, Docker Hub,
arXiv, Lobsters, DEV.to, Medium, and Hugging Face backends to
the alert plugin (16 -> 27 total). Fix PyPI backend to use RSS
updates feed (web search now requires JS challenge). Fix DEV.to
to use public articles API (feed_content endpoint returns empty).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-16 20:07:01 +01:00
parent 34d5dd6f8d
commit 8e2b94fef0
5 changed files with 579 additions and 23 deletions

View File

@@ -64,6 +64,17 @@ _ODYSEE_API = "https://api.na-backend.odysee.com/api/v1/proxy"
_ARCHIVE_SEARCH_URL = "https://archive.org/advancedsearch.php"
_HN_SEARCH_URL = "https://hn.algolia.com/api/v1/search_by_date"
_GITHUB_SEARCH_URL = "https://api.github.com/search/repositories"
_WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
_STACKEXCHANGE_URL = "https://api.stackexchange.com/2.3/search"
_GITLAB_SEARCH_URL = "https://gitlab.com/api/v4/projects"
_NPM_SEARCH_URL = "https://registry.npmjs.org/-/v1/search"
_PYPI_RSS_URL = "https://pypi.org/rss/updates.xml"
_DOCKERHUB_SEARCH_URL = "https://hub.docker.com/v2/search/repositories/"
_ARXIV_API = "https://export.arxiv.org/api/query"
_LOBSTERS_SEARCH_URL = "https://lobste.rs/search"
_DEVTO_API = "https://dev.to/api/articles"
_MEDIUM_FEED_URL = "https://medium.com/feed/tag"
_HUGGINGFACE_API = "https://huggingface.co/api/models"
# -- Module-level tracking ---------------------------------------------------
@@ -1125,6 +1136,503 @@ def _search_github(keyword: str) -> list[dict]:
return results
# -- Wikipedia search (blocking) --------------------------------------------
def _search_wikipedia(keyword: str) -> list[dict]:
    """Search Wikipedia articles via the public MediaWiki API. Blocking.

    Returns a list of result dicts with "id" (page id), "title", "url",
    "date" (last-edit date, parsed by _parse_date) and "extra" keys.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "action": "query", "list": "search", "srsearch": keyword,
        "srlimit": "25", "format": "json",
    })
    request = urllib.request.Request(f"{_WIKIPEDIA_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    out: list[dict] = []
    for hit in (payload.get("query") or {}).get("search") or []:
        page_id = str(hit.get("pageid", ""))
        if not page_id:
            continue
        name = hit.get("title", "")
        out.append({
            "id": page_id,
            "title": name,
            # Article URLs use the title with spaces replaced by underscores.
            "url": f"https://en.wikipedia.org/wiki/{name.replace(' ', '_')}",
            "date": _parse_date(hit.get("timestamp", "")),
            "extra": "",
        })
    return out
# -- Stack Exchange search (blocking) ---------------------------------------
def _search_stackexchange(keyword: str) -> list[dict]:
    """Search Stack Overflow questions via public API. Blocking.

    Returns result dicts keyed by question id; vote score is appended to
    the title as " [Nv]" when non-zero.
    """
    import gzip
    import io
    import urllib.parse

    query = urllib.parse.urlencode({
        "order": "desc", "sort": "creation", "intitle": keyword,
        "site": "stackoverflow", "pagesize": "25",
    })
    request = urllib.request.Request(f"{_STACKEXCHANGE_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    request.add_header("Accept-Encoding", "gzip")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    # The SE API always gzips; fall back to the raw bytes if it did not.
    try:
        body = gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    except OSError:
        pass
    payload = json.loads(body)
    out: list[dict] = []
    for question in payload.get("items") or []:
        question_id = str(question.get("question_id", ""))
        if not question_id:
            continue
        label = _strip_html(question.get("title", ""))
        votes = question.get("score", 0)
        if votes:
            label = f"{label} [{votes}v]"
        created = question.get("creation_date")
        date = ""
        if created:
            # creation_date is a Unix timestamp (UTC).
            try:
                date = datetime.fromtimestamp(
                    int(created), tz=timezone.utc,
                ).strftime("%Y-%m-%d")
            except (ValueError, OSError):
                pass
        out.append({
            "id": question_id, "title": label,
            "url": question.get("link", ""),
            "date": date, "extra": "",
        })
    return out
# -- GitLab search (blocking) ----------------------------------------------
def _search_gitlab(keyword: str) -> list[dict]:
    """Search GitLab projects via public API. Blocking.

    Results are ordered by recent activity; description and star count
    are folded into the title when present.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "search": keyword, "order_by": "updated_at",
        "sort": "desc", "per_page": "25",
    })
    request = urllib.request.Request(f"{_GITLAB_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    # The projects endpoint returns a JSON array; anything else means an
    # error payload, which we treat as no results.
    if not isinstance(payload, list):
        return []
    out: list[dict] = []
    for project in payload:
        project_id = str(project.get("id", ""))
        if not project_id:
            continue
        label = project.get("path_with_namespace", "")
        blurb = project.get("description") or ""
        if blurb:
            label = f"{label}: {_truncate(blurb, 50)}"
        stars = project.get("star_count", 0)
        if stars:
            label = f"{label} [{stars}*]"
        out.append({
            "id": project_id,
            "title": label,
            "url": project.get("web_url", ""),
            "date": _parse_date(project.get("last_activity_at", "")),
            "extra": "",
        })
    return out
# -- npm search (blocking) -------------------------------------------------
def _search_npm(keyword: str) -> list[dict]:
    """Search npm packages via registry API. Blocking.

    Result ids are package names; titles are "name@version" plus a
    truncated description when one exists.
    """
    import urllib.parse

    query = urllib.parse.urlencode({"text": keyword, "size": "25"})
    request = urllib.request.Request(f"{_NPM_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    out: list[dict] = []
    for entry in payload.get("objects") or []:
        package = entry.get("package") or {}
        name = package.get("name", "")
        if not name:
            continue
        version = package.get("version", "")
        label = f"{name}@{version}" if version else name
        blurb = package.get("description") or ""
        if blurb:
            label = f"{label}: {_truncate(blurb, 50)}"
        # Prefer the registry-provided link; fall back to the canonical URL.
        page = (package.get("links") or {}).get(
            "npm", f"https://www.npmjs.com/package/{name}")
        out.append({
            "id": name,
            "title": label,
            "url": page,
            "date": _parse_date(package.get("date", "")),
            "extra": "",
        })
    return out
# -- PyPI search (blocking) ------------------------------------------------
def _search_pypi(keyword: str) -> list[dict]:
    """Search PyPI recent updates via RSS feed, filtered by keyword. Blocking.

    PyPI's web search sits behind a JS challenge, so this scans the public
    RSS updates feed and keeps items whose title or description contains
    the keyword (case-insensitive). Each item's RFC 822 <pubDate> is
    parsed into the "date" field, matching the other backends.
    """
    import xml.etree.ElementTree as ET
    from email.utils import parsedate_to_datetime

    req = urllib.request.Request(_PYPI_RSS_URL, method="GET")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    raw = resp.read()
    resp.close()
    root = ET.fromstring(raw)
    kw_lower = keyword.lower()
    results: list[dict] = []
    for item in root.findall(".//item"):
        title = (item.findtext("title") or "").strip()
        link = (item.findtext("link") or "").strip()
        desc = (item.findtext("description") or "").strip()
        if not title or not link:
            continue
        if kw_lower not in title.lower() and kw_lower not in desc.lower():
            continue
        # Feed titles look like "pkgname 1.2.3"; use the package name as id.
        pkg_name = title.split()[0] if title else ""
        display = title
        if desc:
            display += f": {_truncate(desc, 50)}"
        # pubDate is RFC 822 (e.g. "Mon, 16 Feb 2026 19:00:00 GMT").
        date = ""
        pub_date = (item.findtext("pubDate") or "").strip()
        if pub_date:
            try:
                date = parsedate_to_datetime(pub_date).strftime("%Y-%m-%d")
            except (ValueError, TypeError):
                pass
        results.append({
            "id": pkg_name or link,
            "title": display,
            "url": link,
            "date": date,
            "extra": "",
        })
    return results
# -- Docker Hub search (blocking) ------------------------------------------
def _search_dockerhub(keyword: str) -> list[dict]:
    """Search Docker Hub repositories via public API. Blocking.

    The API does not expose a useful timestamp, so "date" is left empty.
    """
    import urllib.parse

    query = urllib.parse.urlencode({"query": keyword, "page_size": "25"})
    request = urllib.request.Request(f"{_DOCKERHUB_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    out: list[dict] = []
    for repo in payload.get("results") or []:
        name = repo.get("repo_name", "")
        if not name:
            continue
        label = name
        blurb = repo.get("short_description") or ""
        if blurb:
            label = f"{label}: {_truncate(blurb, 50)}"
        stars = repo.get("star_count", 0)
        if stars:
            label = f"{label} [{stars}*]"
        # Official images ("nginx") live under /_/; namespaced ones under /r/.
        if "/" in name:
            page = f"https://hub.docker.com/r/{name}"
        else:
            page = f"https://hub.docker.com/_/{name}"
        out.append({
            "id": name, "title": label, "url": page,
            "date": "", "extra": "",
        })
    return out
# -- arXiv search (blocking) -----------------------------------------------
def _search_arxiv(keyword: str) -> list[dict]:
    """Search arXiv preprints via Atom API. Blocking.

    Newest submissions first; the id is the short arXiv identifier
    extracted from the entry's /abs/ URL when possible.
    """
    import urllib.parse
    import xml.etree.ElementTree as ET

    query = urllib.parse.urlencode({
        "search_query": f"all:{keyword}",
        "sortBy": "submittedDate", "sortOrder": "descending",
        "max_results": "25",
    })
    request = urllib.request.Request(f"{_ARXIV_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    atom = {"a": "http://www.w3.org/2005/Atom"}
    out: list[dict] = []
    for entry in ET.fromstring(body).findall("a:entry", atom):
        heading = (entry.findtext("a:title", "", atom) or "").strip()
        heading = " ".join(heading.split())  # titles wrap across feed lines
        if not heading:
            continue
        entry_id = (entry.findtext("a:id", "", atom) or "").strip()
        # Prefer the explicit HTML alternate link; fall back to the entry id.
        page = ""
        for link in entry.findall("a:link", atom):
            if link.get("type") == "text/html":
                page = link.get("href", "")
                break
        if not page:
            page = entry_id
        if "/abs/" in entry_id:
            short_id = entry_id.rsplit("/abs/", 1)[-1]
        else:
            short_id = entry_id
        out.append({
            "id": short_id,
            "title": heading,
            "url": page,
            "date": _parse_date(entry.findtext("a:published", "", atom) or ""),
            "extra": "",
        })
    return out
# -- Lobsters search (blocking) --------------------------------------------
class _LobstersParser(HTMLParser):
"""Extract story links from Lobsters search HTML."""
def __init__(self):
super().__init__()
self.results: list[tuple[str, str]] = []
self._in_link = False
self._url = ""
self._title_parts: list[str] = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag != "a":
return
attr_map = {k: (v or "") for k, v in attrs}
cls = attr_map.get("class", "")
if "u-url" in cls:
self._in_link = True
self._url = attr_map.get("href", "")
self._title_parts = []
def handle_data(self, data: str) -> None:
if self._in_link:
self._title_parts.append(data)
def handle_endtag(self, tag: str) -> None:
if tag == "a" and self._in_link:
self._in_link = False
title = "".join(self._title_parts).strip()
if self._url and title:
self.results.append((self._url, title))
def _search_lobsters(keyword: str) -> list[dict]:
    """Search Lobsters stories via HTML search page. Blocking.

    Lobsters has no JSON search endpoint, so the HTML results page is
    scraped with _LobstersParser; duplicate URLs are dropped.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "q": keyword, "what": "stories", "order": "newest",
    })
    request = urllib.request.Request(f"{_LOBSTERS_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    parser = _LobstersParser()
    parser.feed(body.decode("utf-8", errors="replace"))
    out: list[dict] = []
    seen: set[str] = set()
    for story_url, story_title in parser.results:
        if story_url in seen:
            continue
        seen.add(story_url)
        out.append({
            "id": story_url,
            "title": story_title,
            "url": story_url,
            "date": "",
            "extra": "",
        })
    return out
# -- DEV.to search (blocking) ----------------------------------------------
def _search_devto(keyword: str) -> list[dict]:
    """Search DEV.to articles via public articles API. Blocking.

    The keyword is reduced to an alphanumeric tag slug, since the public
    articles endpoint filters by tag rather than free text.
    """
    import urllib.parse

    tag = re.sub(r"[^a-zA-Z0-9]", "", keyword).lower()
    query = urllib.parse.urlencode({"per_page": "25", "tag": tag})
    request = urllib.request.Request(f"{_DEVTO_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    if not isinstance(payload, list):
        return []
    out: list[dict] = []
    for article in payload:
        article_id = str(article.get("id", ""))
        if not article_id:
            continue
        label = article.get("title", "")
        user = article.get("user", {})
        author = user.get("username", "") if isinstance(user, dict) else ""
        if author:
            label = f"{author}: {label}"
        out.append({
            "id": article_id,
            "title": label,
            "url": article.get("url", ""),
            "date": _parse_date(article.get("published_at", "")),
            "extra": "",
        })
    return out
# -- Medium tag feed search (blocking) -------------------------------------
def _search_medium(keyword: str) -> list[dict]:
    """Search Medium via tag RSS feed. Blocking.

    Medium exposes no public search API, so the keyword is slugified into
    a tag name and the per-tag RSS feed is fetched instead. Item guids are
    used as stable ids; the dc:creator is prefixed to the title.
    """
    import urllib.parse
    import xml.etree.ElementTree as ET
    # Hoisted out of the per-item loop: the import is loop-invariant.
    from email.utils import parsedate_to_datetime

    # Medium tags are lowercase slugs; collapse anything else to hyphens.
    tag = re.sub(r"[^a-zA-Z0-9-]", "-", keyword).lower().strip("-")
    if not tag:
        return []
    url = f"{_MEDIUM_FEED_URL}/{urllib.parse.quote(tag, safe='')}"
    req = urllib.request.Request(url, method="GET")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    raw = resp.read()
    resp.close()
    root = ET.fromstring(raw)
    results: list[dict] = []
    for item in root.iter("item"):
        title = (item.findtext("title") or "").strip()
        link = (item.findtext("link") or "").strip()
        if not link:
            continue
        guid = (item.findtext("guid") or link).strip()
        creator = item.findtext("{http://purl.org/dc/elements/1.1/}creator") or ""
        if creator:
            title = f"{creator}: {title}"
        pub_date = item.findtext("pubDate") or ""
        date = _parse_date(pub_date)
        if not date and pub_date:
            # RSS pubDate is RFC 822; fall back when _parse_date rejects it.
            try:
                date = parsedate_to_datetime(pub_date).strftime("%Y-%m-%d")
            except (ValueError, TypeError):
                pass
        results.append({
            "id": guid, "title": title, "url": link,
            "date": date, "extra": "",
        })
    return results
# -- Hugging Face search (blocking) ----------------------------------------
def _search_huggingface(keyword: str) -> list[dict]:
    """Search Hugging Face models via public API. Blocking.

    Models are sorted by last modification; download count (or, failing
    that, like count) is appended to the title.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "search": keyword, "sort": "lastModified",
        "direction": "-1", "limit": "25",
    })
    request = urllib.request.Request(f"{_HUGGINGFACE_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    if not isinstance(payload, list):
        return []
    out: list[dict] = []
    for model in payload:
        model_id = model.get("modelId") or model.get("id", "")
        if not model_id:
            continue
        label = model_id
        downloads = model.get("downloads", 0)
        likes = model.get("likes", 0)
        # Prefer downloads; only show likes when downloads is zero/absent.
        if downloads:
            label = f"{label} [{downloads} dl]"
        elif likes:
            label = f"{label} [{likes} likes]"
        out.append({
            "id": model_id,
            "title": label,
            "url": f"https://huggingface.co/{model_id}",
            "date": _parse_date(model.get("lastModified", "")),
            "extra": "",
        })
    return out
# -- Backend registry -------------------------------------------------------
_BACKENDS: dict[str, callable] = {
@@ -1144,6 +1652,17 @@ _BACKENDS: dict[str, callable] = {
"ia": _search_archive,
"hn": _search_hackernews,
"gh": _search_github,
"wp": _search_wikipedia,
"se": _search_stackexchange,
"gl": _search_gitlab,
"nm": _search_npm,
"pp": _search_pypi,
"dh": _search_dockerhub,
"ax": _search_arxiv,
"lb": _search_lobsters,
"dv": _search_devto,
"md": _search_medium,
"hf": _search_huggingface,
}