fix: track alert backend errors independently

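Per-backend error counts with exponential backoff: once a backend has
n >= 5 consecutive failures, it is polled only on every 2^(n-5)-th
cycle (capped at every 32nd cycle) and skipped on the cycles in between.
A successful poll resets that backend's count to zero. Working backends
are no longer penalized by one flaky backend doubling the entire poll
interval.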

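Migrates last_error (a single string) to last_errors (a dict mapping each
backend tag to its most recent error message; entries are removed when
the backend succeeds).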

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-17 10:51:42 +01:00
parent f2199f2bec
commit da908a45e4

View File

@@ -80,7 +80,8 @@ _HUGGINGFACE_API = "https://huggingface.co/api/models"
_pollers: dict[str, asyncio.Task] = {} _pollers: dict[str, asyncio.Task] = {}
_subscriptions: dict[str, dict] = {} _subscriptions: dict[str, dict] = {}
_errors: dict[str, int] = {} _errors: dict[str, dict[str, int]] = {}
_poll_count: dict[str, int] = {}
# -- History database -------------------------------------------------------- # -- History database --------------------------------------------------------
@@ -1711,17 +1712,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
now = datetime.now(timezone.utc).isoformat() now = datetime.now(timezone.utc).isoformat()
data["last_poll"] = now data["last_poll"] = now
had_error = False cycle = _poll_count[key] = _poll_count.get(key, 0) + 1
tag_errors = _errors.setdefault(key, {})
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
for tag, backend in _BACKENDS.items(): for tag, backend in _BACKENDS.items():
errs = tag_errors.get(tag, 0)
if errs >= 5:
skip = min(2 ** (errs - 5), 32)
if cycle % skip != 0:
continue
try: try:
items = await loop.run_in_executor(None, backend, keyword) items = await loop.run_in_executor(None, backend, keyword)
except Exception as exc: except Exception as exc:
data["last_error"] = f"{tag}: {exc}" tag_errors[tag] = errs + 1
had_error = True data.setdefault("last_errors", {})[tag] = str(exc)
continue continue
tag_errors[tag] = 0
data.setdefault("last_errors", {}).pop(tag, None)
seen_set = set(data.get("seen", {}).get(tag, [])) seen_set = set(data.get("seen", {}).get(tag, []))
seen_list = list(data.get("seen", {}).get(tag, [])) seen_list = list(data.get("seen", {}).get(tag, []))
new_items = [item for item in items if item["id"] not in seen_set] new_items = [item for item in items if item["id"] not in seen_set]
@@ -1795,12 +1806,6 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
seen_list = seen_list[-_MAX_SEEN:] seen_list = seen_list[-_MAX_SEEN:]
data.setdefault("seen", {})[tag] = seen_list data.setdefault("seen", {})[tag] = seen_list
if had_error:
_errors[key] = _errors.get(key, 0) + 1
else:
data["last_error"] = ""
_errors[key] = 0
_subscriptions[key] = data _subscriptions[key] = data
_save(bot, key, data) _save(bot, key, data)
@@ -1813,9 +1818,6 @@ async def _poll_loop(bot, key: str) -> None:
if data is None: if data is None:
return return
interval = data.get("interval", _DEFAULT_INTERVAL) interval = data.get("interval", _DEFAULT_INTERVAL)
errs = _errors.get(key, 0)
if errs >= 5:
interval = min(interval * 2, _MAX_INTERVAL)
await asyncio.sleep(interval) await asyncio.sleep(interval)
await _poll_once(bot, key, announce=True) await _poll_once(bot, key, announce=True)
except asyncio.CancelledError: except asyncio.CancelledError:
@@ -1837,7 +1839,8 @@ def _stop_poller(key: str) -> None:
if task and not task.done(): if task and not task.done():
task.cancel() task.cancel()
_subscriptions.pop(key, None) _subscriptions.pop(key, None)
_errors.pop(key, 0) _errors.pop(key, None)
_poll_count.pop(key, None)
# -- Restore on connect ----------------------------------------------------- # -- Restore on connect -----------------------------------------------------
@@ -1895,9 +1898,9 @@ async def cmd_alert(bot, message):
data = _load(bot, key) data = _load(bot, key)
if data: if data:
name = data["name"] name = data["name"]
err = data.get("last_error", "") errs = data.get("last_errors", {})
if err: if errs:
subs.append(f"{name} (error)") subs.append(f"{name} ({len(errs)} backend errors)")
else: else:
subs.append(name) subs.append(name)
if not subs: if not subs:
@@ -1924,8 +1927,10 @@ async def cmd_alert(bot, message):
_subscriptions[key] = data _subscriptions[key] = data
await _poll_once(bot, key, announce=True) await _poll_once(bot, key, announce=True)
data = _subscriptions.get(key, data) data = _subscriptions.get(key, data)
if data.get("last_error"): errs = data.get("last_errors", {})
await bot.reply(message, f"{name}: error -- {data['last_error']}") if errs:
tags = ", ".join(sorted(errs))
await bot.reply(message, f"{name}: errors on {tags}")
else: else:
await bot.reply(message, f"{name}: checked") await bot.reply(message, f"{name}: checked")
return return
@@ -2063,7 +2068,7 @@ async def cmd_alert(bot, message):
"added_by": message.nick, "added_by": message.nick,
"added_at": now, "added_at": now,
"last_poll": now, "last_poll": now,
"last_error": "", "last_errors": {},
"seen": {}, "seen": {},
} }
_save(bot, key, data) _save(bot, key, data)