feat: add 11 alert backends and fix PyPI/DEV.to search

Add Wikipedia, Stack Exchange, GitLab, npm, PyPI, Docker Hub,
arXiv, Lobsters, DEV.to, Medium, and Hugging Face backends to
the alert plugin (16 -> 27 total). Fix PyPI backend to use RSS
updates feed (web search now requires JS challenge). Fix DEV.to
to use public articles API (feed_content endpoint returns empty).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-16 20:07:01 +01:00
parent 34d5dd6f8d
commit 8e2b94fef0
5 changed files with 579 additions and 23 deletions

View File

@@ -64,6 +64,17 @@ _ODYSEE_API = "https://api.na-backend.odysee.com/api/v1/proxy"
_ARCHIVE_SEARCH_URL = "https://archive.org/advancedsearch.php"
_HN_SEARCH_URL = "https://hn.algolia.com/api/v1/search_by_date"
_GITHUB_SEARCH_URL = "https://api.github.com/search/repositories"
_WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
_STACKEXCHANGE_URL = "https://api.stackexchange.com/2.3/search"
_GITLAB_SEARCH_URL = "https://gitlab.com/api/v4/projects"
_NPM_SEARCH_URL = "https://registry.npmjs.org/-/v1/search"
_PYPI_RSS_URL = "https://pypi.org/rss/updates.xml"
_DOCKERHUB_SEARCH_URL = "https://hub.docker.com/v2/search/repositories/"
_ARXIV_API = "https://export.arxiv.org/api/query"
_LOBSTERS_SEARCH_URL = "https://lobste.rs/search"
_DEVTO_API = "https://dev.to/api/articles"
_MEDIUM_FEED_URL = "https://medium.com/feed/tag"
_HUGGINGFACE_API = "https://huggingface.co/api/models"
# -- Module-level tracking ---------------------------------------------------
@@ -1125,6 +1136,503 @@ def _search_github(keyword: str) -> list[dict]:
return results
# -- Wikipedia search (blocking) --------------------------------------------
def _search_wikipedia(keyword: str) -> list[dict]:
    """Search Wikipedia articles via the public MediaWiki API. Blocking.

    Returns a list of result dicts with "id" (page id), "title", "url",
    "date" (last-edit date, parsed by _parse_date) and "extra" keys.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "action": "query", "list": "search", "srsearch": keyword,
        "srlimit": "25", "format": "json",
    })
    request = urllib.request.Request(f"{_WIKIPEDIA_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    out: list[dict] = []
    for hit in (payload.get("query") or {}).get("search") or []:
        page_id = str(hit.get("pageid", ""))
        if not page_id:
            continue
        name = hit.get("title", "")
        out.append({
            "id": page_id,
            "title": name,
            # Article URLs use the title with spaces replaced by underscores.
            "url": f"https://en.wikipedia.org/wiki/{name.replace(' ', '_')}",
            "date": _parse_date(hit.get("timestamp", "")),
            "extra": "",
        })
    return out
# -- Stack Exchange search (blocking) ---------------------------------------
def _search_stackexchange(keyword: str) -> list[dict]:
    """Search Stack Overflow questions via public API. Blocking.

    Returns result dicts keyed by question id; vote score is appended to
    the title as " [Nv]" when non-zero.
    """
    import gzip
    import io
    import urllib.parse

    query = urllib.parse.urlencode({
        "order": "desc", "sort": "creation", "intitle": keyword,
        "site": "stackoverflow", "pagesize": "25",
    })
    request = urllib.request.Request(f"{_STACKEXCHANGE_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    request.add_header("Accept-Encoding", "gzip")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    # The SE API always gzips; fall back to the raw bytes if it did not.
    try:
        body = gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    except OSError:
        pass
    payload = json.loads(body)
    out: list[dict] = []
    for question in payload.get("items") or []:
        question_id = str(question.get("question_id", ""))
        if not question_id:
            continue
        label = _strip_html(question.get("title", ""))
        votes = question.get("score", 0)
        if votes:
            label = f"{label} [{votes}v]"
        created = question.get("creation_date")
        date = ""
        if created:
            # creation_date is a Unix timestamp (UTC).
            try:
                date = datetime.fromtimestamp(
                    int(created), tz=timezone.utc,
                ).strftime("%Y-%m-%d")
            except (ValueError, OSError):
                pass
        out.append({
            "id": question_id, "title": label,
            "url": question.get("link", ""),
            "date": date, "extra": "",
        })
    return out
# -- GitLab search (blocking) ----------------------------------------------
def _search_gitlab(keyword: str) -> list[dict]:
    """Search GitLab projects via public API. Blocking.

    Results are ordered by recent activity; description and star count
    are folded into the title when present.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "search": keyword, "order_by": "updated_at",
        "sort": "desc", "per_page": "25",
    })
    request = urllib.request.Request(f"{_GITLAB_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    # The projects endpoint returns a JSON array; anything else means an
    # error payload, which we treat as no results.
    if not isinstance(payload, list):
        return []
    out: list[dict] = []
    for project in payload:
        project_id = str(project.get("id", ""))
        if not project_id:
            continue
        label = project.get("path_with_namespace", "")
        blurb = project.get("description") or ""
        if blurb:
            label = f"{label}: {_truncate(blurb, 50)}"
        stars = project.get("star_count", 0)
        if stars:
            label = f"{label} [{stars}*]"
        out.append({
            "id": project_id,
            "title": label,
            "url": project.get("web_url", ""),
            "date": _parse_date(project.get("last_activity_at", "")),
            "extra": "",
        })
    return out
# -- npm search (blocking) -------------------------------------------------
def _search_npm(keyword: str) -> list[dict]:
    """Search npm packages via registry API. Blocking.

    Result ids are package names; titles are "name@version" plus a
    truncated description when one exists.
    """
    import urllib.parse

    query = urllib.parse.urlencode({"text": keyword, "size": "25"})
    request = urllib.request.Request(f"{_NPM_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    out: list[dict] = []
    for entry in payload.get("objects") or []:
        package = entry.get("package") or {}
        name = package.get("name", "")
        if not name:
            continue
        version = package.get("version", "")
        label = f"{name}@{version}" if version else name
        blurb = package.get("description") or ""
        if blurb:
            label = f"{label}: {_truncate(blurb, 50)}"
        # Prefer the registry-provided link; fall back to the canonical URL.
        page = (package.get("links") or {}).get(
            "npm", f"https://www.npmjs.com/package/{name}")
        out.append({
            "id": name,
            "title": label,
            "url": page,
            "date": _parse_date(package.get("date", "")),
            "extra": "",
        })
    return out
# -- PyPI search (blocking) ------------------------------------------------
def _search_pypi(keyword: str) -> list[dict]:
    """Search PyPI recent updates via RSS feed, filtered by keyword. Blocking.

    PyPI's web search sits behind a JS challenge, so this scans the public
    RSS updates feed and keeps items whose title or description contains
    the keyword (case-insensitive). Each item's RFC 822 <pubDate> is
    parsed into the "date" field, matching the other backends.
    """
    import xml.etree.ElementTree as ET
    from email.utils import parsedate_to_datetime

    req = urllib.request.Request(_PYPI_RSS_URL, method="GET")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    raw = resp.read()
    resp.close()
    root = ET.fromstring(raw)
    kw_lower = keyword.lower()
    results: list[dict] = []
    for item in root.findall(".//item"):
        title = (item.findtext("title") or "").strip()
        link = (item.findtext("link") or "").strip()
        desc = (item.findtext("description") or "").strip()
        if not title or not link:
            continue
        if kw_lower not in title.lower() and kw_lower not in desc.lower():
            continue
        # Feed titles look like "pkgname 1.2.3"; use the package name as id.
        pkg_name = title.split()[0] if title else ""
        display = title
        if desc:
            display += f": {_truncate(desc, 50)}"
        # pubDate is RFC 822 (e.g. "Mon, 16 Feb 2026 19:00:00 GMT").
        date = ""
        pub_date = (item.findtext("pubDate") or "").strip()
        if pub_date:
            try:
                date = parsedate_to_datetime(pub_date).strftime("%Y-%m-%d")
            except (ValueError, TypeError):
                pass
        results.append({
            "id": pkg_name or link,
            "title": display,
            "url": link,
            "date": date,
            "extra": "",
        })
    return results
# -- Docker Hub search (blocking) ------------------------------------------
def _search_dockerhub(keyword: str) -> list[dict]:
    """Search Docker Hub repositories via public API. Blocking.

    The API does not expose a useful timestamp, so "date" is left empty.
    """
    import urllib.parse

    query = urllib.parse.urlencode({"query": keyword, "page_size": "25"})
    request = urllib.request.Request(f"{_DOCKERHUB_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    out: list[dict] = []
    for repo in payload.get("results") or []:
        name = repo.get("repo_name", "")
        if not name:
            continue
        label = name
        blurb = repo.get("short_description") or ""
        if blurb:
            label = f"{label}: {_truncate(blurb, 50)}"
        stars = repo.get("star_count", 0)
        if stars:
            label = f"{label} [{stars}*]"
        # Official images ("nginx") live under /_/; namespaced ones under /r/.
        if "/" in name:
            page = f"https://hub.docker.com/r/{name}"
        else:
            page = f"https://hub.docker.com/_/{name}"
        out.append({
            "id": name, "title": label, "url": page,
            "date": "", "extra": "",
        })
    return out
# -- arXiv search (blocking) -----------------------------------------------
def _search_arxiv(keyword: str) -> list[dict]:
    """Search arXiv preprints via Atom API. Blocking.

    Newest submissions first; the id is the short arXiv identifier
    extracted from the entry's /abs/ URL when possible.
    """
    import urllib.parse
    import xml.etree.ElementTree as ET

    query = urllib.parse.urlencode({
        "search_query": f"all:{keyword}",
        "sortBy": "submittedDate", "sortOrder": "descending",
        "max_results": "25",
    })
    request = urllib.request.Request(f"{_ARXIV_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    atom = {"a": "http://www.w3.org/2005/Atom"}
    out: list[dict] = []
    for entry in ET.fromstring(body).findall("a:entry", atom):
        heading = (entry.findtext("a:title", "", atom) or "").strip()
        heading = " ".join(heading.split())  # titles wrap across feed lines
        if not heading:
            continue
        entry_id = (entry.findtext("a:id", "", atom) or "").strip()
        # Prefer the explicit HTML alternate link; fall back to the entry id.
        page = ""
        for link in entry.findall("a:link", atom):
            if link.get("type") == "text/html":
                page = link.get("href", "")
                break
        if not page:
            page = entry_id
        if "/abs/" in entry_id:
            short_id = entry_id.rsplit("/abs/", 1)[-1]
        else:
            short_id = entry_id
        out.append({
            "id": short_id,
            "title": heading,
            "url": page,
            "date": _parse_date(entry.findtext("a:published", "", atom) or ""),
            "extra": "",
        })
    return out
# -- Lobsters search (blocking) --------------------------------------------
class _LobstersParser(HTMLParser):
"""Extract story links from Lobsters search HTML."""
def __init__(self):
super().__init__()
self.results: list[tuple[str, str]] = []
self._in_link = False
self._url = ""
self._title_parts: list[str] = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag != "a":
return
attr_map = {k: (v or "") for k, v in attrs}
cls = attr_map.get("class", "")
if "u-url" in cls:
self._in_link = True
self._url = attr_map.get("href", "")
self._title_parts = []
def handle_data(self, data: str) -> None:
if self._in_link:
self._title_parts.append(data)
def handle_endtag(self, tag: str) -> None:
if tag == "a" and self._in_link:
self._in_link = False
title = "".join(self._title_parts).strip()
if self._url and title:
self.results.append((self._url, title))
def _search_lobsters(keyword: str) -> list[dict]:
    """Search Lobsters stories via HTML search page. Blocking.

    Lobsters has no JSON search endpoint, so the HTML results page is
    scraped with _LobstersParser; duplicate URLs are dropped.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "q": keyword, "what": "stories", "order": "newest",
    })
    request = urllib.request.Request(f"{_LOBSTERS_SEARCH_URL}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    parser = _LobstersParser()
    parser.feed(body.decode("utf-8", errors="replace"))
    out: list[dict] = []
    seen: set[str] = set()
    for story_url, story_title in parser.results:
        if story_url in seen:
            continue
        seen.add(story_url)
        out.append({
            "id": story_url,
            "title": story_title,
            "url": story_url,
            "date": "",
            "extra": "",
        })
    return out
# -- DEV.to search (blocking) ----------------------------------------------
def _search_devto(keyword: str) -> list[dict]:
    """Search DEV.to articles via public articles API. Blocking.

    The keyword is reduced to an alphanumeric tag slug, since the public
    articles endpoint filters by tag rather than free text.
    """
    import urllib.parse

    tag = re.sub(r"[^a-zA-Z0-9]", "", keyword).lower()
    query = urllib.parse.urlencode({"per_page": "25", "tag": tag})
    request = urllib.request.Request(f"{_DEVTO_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    if not isinstance(payload, list):
        return []
    out: list[dict] = []
    for article in payload:
        article_id = str(article.get("id", ""))
        if not article_id:
            continue
        label = article.get("title", "")
        user = article.get("user", {})
        author = user.get("username", "") if isinstance(user, dict) else ""
        if author:
            label = f"{author}: {label}"
        out.append({
            "id": article_id,
            "title": label,
            "url": article.get("url", ""),
            "date": _parse_date(article.get("published_at", "")),
            "extra": "",
        })
    return out
# -- Medium tag feed search (blocking) -------------------------------------
def _search_medium(keyword: str) -> list[dict]:
    """Search Medium via tag RSS feed. Blocking.

    Medium exposes no public search API, so the keyword is slugified into
    a tag name and the per-tag RSS feed is fetched instead. Item guids are
    used as stable ids; the dc:creator is prefixed to the title.
    """
    import urllib.parse
    import xml.etree.ElementTree as ET
    # Hoisted out of the per-item loop: the import is loop-invariant.
    from email.utils import parsedate_to_datetime

    # Medium tags are lowercase slugs; collapse anything else to hyphens.
    tag = re.sub(r"[^a-zA-Z0-9-]", "-", keyword).lower().strip("-")
    if not tag:
        return []
    url = f"{_MEDIUM_FEED_URL}/{urllib.parse.quote(tag, safe='')}"
    req = urllib.request.Request(url, method="GET")
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    resp = _urlopen(req, timeout=_FETCH_TIMEOUT)
    raw = resp.read()
    resp.close()
    root = ET.fromstring(raw)
    results: list[dict] = []
    for item in root.iter("item"):
        title = (item.findtext("title") or "").strip()
        link = (item.findtext("link") or "").strip()
        if not link:
            continue
        guid = (item.findtext("guid") or link).strip()
        creator = item.findtext("{http://purl.org/dc/elements/1.1/}creator") or ""
        if creator:
            title = f"{creator}: {title}"
        pub_date = item.findtext("pubDate") or ""
        date = _parse_date(pub_date)
        if not date and pub_date:
            # RSS pubDate is RFC 822; fall back when _parse_date rejects it.
            try:
                date = parsedate_to_datetime(pub_date).strftime("%Y-%m-%d")
            except (ValueError, TypeError):
                pass
        results.append({
            "id": guid, "title": title, "url": link,
            "date": date, "extra": "",
        })
    return results
# -- Hugging Face search (blocking) ----------------------------------------
def _search_huggingface(keyword: str) -> list[dict]:
    """Search Hugging Face models via public API. Blocking.

    Models are sorted by last modification; download count (or, failing
    that, like count) is appended to the title.
    """
    import urllib.parse

    query = urllib.parse.urlencode({
        "search": keyword, "sort": "lastModified",
        "direction": "-1", "limit": "25",
    })
    request = urllib.request.Request(f"{_HUGGINGFACE_API}?{query}", method="GET")
    request.add_header("User-Agent", "Mozilla/5.0 (compatible; derp-bot)")
    response = _urlopen(request, timeout=_FETCH_TIMEOUT)
    body = response.read()
    response.close()
    payload = json.loads(body)
    if not isinstance(payload, list):
        return []
    out: list[dict] = []
    for model in payload:
        model_id = model.get("modelId") or model.get("id", "")
        if not model_id:
            continue
        label = model_id
        downloads = model.get("downloads", 0)
        likes = model.get("likes", 0)
        # Prefer downloads; only show likes when downloads is zero/absent.
        if downloads:
            label = f"{label} [{downloads} dl]"
        elif likes:
            label = f"{label} [{likes} likes]"
        out.append({
            "id": model_id,
            "title": label,
            "url": f"https://huggingface.co/{model_id}",
            "date": _parse_date(model.get("lastModified", "")),
            "extra": "",
        })
    return out
# -- Backend registry -------------------------------------------------------
_BACKENDS: dict[str, callable] = {
@@ -1144,6 +1652,17 @@ _BACKENDS: dict[str, callable] = {
"ia": _search_archive,
"hn": _search_hackernews,
"gh": _search_github,
"wp": _search_wikipedia,
"se": _search_stackexchange,
"gl": _search_gitlab,
"nm": _search_npm,
"pp": _search_pypi,
"dh": _search_dockerhub,
"ax": _search_arxiv,
"lb": _search_lobsters,
"dv": _search_devto,
"md": _search_medium,
"hf": _search_huggingface,
}