fix: track alert backend errors independently

Per-backend error counts with exponential backoff: once a backend has
n >= 5 consecutive failures it is polled only once every 2^(n-5) cycles
(capped at 32) and skipped on the cycles in between.
Working backends are no longer penalized by one flaky backend doubling
the entire poll interval.

Migrates last_error (string) to last_errors (dict per backend).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
user
2026-02-17 10:51:42 +01:00
parent f2199f2bec
commit da908a45e4

View File

@@ -80,7 +80,8 @@ _HUGGINGFACE_API = "https://huggingface.co/api/models"
_pollers: dict[str, asyncio.Task] = {}
_subscriptions: dict[str, dict] = {}
_errors: dict[str, int] = {}
_errors: dict[str, dict[str, int]] = {}
_poll_count: dict[str, int] = {}
# -- History database --------------------------------------------------------
@@ -1711,17 +1712,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
now = datetime.now(timezone.utc).isoformat()
data["last_poll"] = now
had_error = False
cycle = _poll_count[key] = _poll_count.get(key, 0) + 1
tag_errors = _errors.setdefault(key, {})
loop = asyncio.get_running_loop()
for tag, backend in _BACKENDS.items():
errs = tag_errors.get(tag, 0)
if errs >= 5:
skip = min(2 ** (errs - 5), 32)
if cycle % skip != 0:
continue
try:
items = await loop.run_in_executor(None, backend, keyword)
except Exception as exc:
data["last_error"] = f"{tag}: {exc}"
had_error = True
tag_errors[tag] = errs + 1
data.setdefault("last_errors", {})[tag] = str(exc)
continue
tag_errors[tag] = 0
data.setdefault("last_errors", {}).pop(tag, None)
seen_set = set(data.get("seen", {}).get(tag, []))
seen_list = list(data.get("seen", {}).get(tag, []))
new_items = [item for item in items if item["id"] not in seen_set]
@@ -1795,12 +1806,6 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
seen_list = seen_list[-_MAX_SEEN:]
data.setdefault("seen", {})[tag] = seen_list
if had_error:
_errors[key] = _errors.get(key, 0) + 1
else:
data["last_error"] = ""
_errors[key] = 0
_subscriptions[key] = data
_save(bot, key, data)
@@ -1813,9 +1818,6 @@ async def _poll_loop(bot, key: str) -> None:
if data is None:
return
interval = data.get("interval", _DEFAULT_INTERVAL)
errs = _errors.get(key, 0)
if errs >= 5:
interval = min(interval * 2, _MAX_INTERVAL)
await asyncio.sleep(interval)
await _poll_once(bot, key, announce=True)
except asyncio.CancelledError:
@@ -1837,7 +1839,8 @@ def _stop_poller(key: str) -> None:
if task and not task.done():
task.cancel()
_subscriptions.pop(key, None)
_errors.pop(key, 0)
_errors.pop(key, None)
_poll_count.pop(key, None)
# -- Restore on connect -----------------------------------------------------
@@ -1895,9 +1898,9 @@ async def cmd_alert(bot, message):
data = _load(bot, key)
if data:
name = data["name"]
err = data.get("last_error", "")
if err:
subs.append(f"{name} (error)")
errs = data.get("last_errors", {})
if errs:
subs.append(f"{name} ({len(errs)} backend errors)")
else:
subs.append(name)
if not subs:
@@ -1924,8 +1927,10 @@ async def cmd_alert(bot, message):
_subscriptions[key] = data
await _poll_once(bot, key, announce=True)
data = _subscriptions.get(key, data)
if data.get("last_error"):
await bot.reply(message, f"{name}: error -- {data['last_error']}")
errs = data.get("last_errors", {})
if errs:
tags = ", ".join(sorted(errs))
await bot.reply(message, f"{name}: errors on {tags}")
else:
await bot.reply(message, f"{name}: checked")
return
@@ -2063,7 +2068,7 @@ async def cmd_alert(bot, message):
"added_by": message.nick,
"added_at": now,
"last_poll": now,
"last_error": "",
"last_errors": {},
"seen": {},
}
_save(bot, key, data)