fix: track alert backend errors independently
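Per-backend error counts with exponential backoff: once a backend has n >= 5 consecutive failures it is polled only on every 2^(n-5)-th cycle (interval capped at 32 cycles) and skipped on the cycles in between, so the backoff first takes effect at the sixth consecutive failure; a successful poll resets that backend's count to zero. Working backends are no longer penalized by one flaky backend doubling the entire poll interval. Migrates last_error (string) to last_errors (dict keyed by backend tag). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>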
This commit is contained in:
@@ -80,7 +80,8 @@ _HUGGINGFACE_API = "https://huggingface.co/api/models"
|
||||
|
||||
_pollers: dict[str, asyncio.Task] = {}
|
||||
_subscriptions: dict[str, dict] = {}
|
||||
_errors: dict[str, int] = {}
|
||||
_errors: dict[str, dict[str, int]] = {}
|
||||
_poll_count: dict[str, int] = {}
|
||||
|
||||
# -- History database --------------------------------------------------------
|
||||
|
||||
@@ -1711,17 +1712,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
data["last_poll"] = now
|
||||
|
||||
had_error = False
|
||||
cycle = _poll_count[key] = _poll_count.get(key, 0) + 1
|
||||
tag_errors = _errors.setdefault(key, {})
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
for tag, backend in _BACKENDS.items():
|
||||
errs = tag_errors.get(tag, 0)
|
||||
if errs >= 5:
|
||||
skip = min(2 ** (errs - 5), 32)
|
||||
if cycle % skip != 0:
|
||||
continue
|
||||
|
||||
try:
|
||||
items = await loop.run_in_executor(None, backend, keyword)
|
||||
except Exception as exc:
|
||||
data["last_error"] = f"{tag}: {exc}"
|
||||
had_error = True
|
||||
tag_errors[tag] = errs + 1
|
||||
data.setdefault("last_errors", {})[tag] = str(exc)
|
||||
continue
|
||||
|
||||
tag_errors[tag] = 0
|
||||
data.setdefault("last_errors", {}).pop(tag, None)
|
||||
|
||||
seen_set = set(data.get("seen", {}).get(tag, []))
|
||||
seen_list = list(data.get("seen", {}).get(tag, []))
|
||||
new_items = [item for item in items if item["id"] not in seen_set]
|
||||
@@ -1795,12 +1806,6 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
|
||||
seen_list = seen_list[-_MAX_SEEN:]
|
||||
data.setdefault("seen", {})[tag] = seen_list
|
||||
|
||||
if had_error:
|
||||
_errors[key] = _errors.get(key, 0) + 1
|
||||
else:
|
||||
data["last_error"] = ""
|
||||
_errors[key] = 0
|
||||
|
||||
_subscriptions[key] = data
|
||||
_save(bot, key, data)
|
||||
|
||||
@@ -1813,9 +1818,6 @@ async def _poll_loop(bot, key: str) -> None:
|
||||
if data is None:
|
||||
return
|
||||
interval = data.get("interval", _DEFAULT_INTERVAL)
|
||||
errs = _errors.get(key, 0)
|
||||
if errs >= 5:
|
||||
interval = min(interval * 2, _MAX_INTERVAL)
|
||||
await asyncio.sleep(interval)
|
||||
await _poll_once(bot, key, announce=True)
|
||||
except asyncio.CancelledError:
|
||||
@@ -1837,7 +1839,8 @@ def _stop_poller(key: str) -> None:
|
||||
if task and not task.done():
|
||||
task.cancel()
|
||||
_subscriptions.pop(key, None)
|
||||
_errors.pop(key, 0)
|
||||
_errors.pop(key, None)
|
||||
_poll_count.pop(key, None)
|
||||
|
||||
|
||||
# -- Restore on connect -----------------------------------------------------
|
||||
@@ -1895,9 +1898,9 @@ async def cmd_alert(bot, message):
|
||||
data = _load(bot, key)
|
||||
if data:
|
||||
name = data["name"]
|
||||
err = data.get("last_error", "")
|
||||
if err:
|
||||
subs.append(f"{name} (error)")
|
||||
errs = data.get("last_errors", {})
|
||||
if errs:
|
||||
subs.append(f"{name} ({len(errs)} backend errors)")
|
||||
else:
|
||||
subs.append(name)
|
||||
if not subs:
|
||||
@@ -1924,8 +1927,10 @@ async def cmd_alert(bot, message):
|
||||
_subscriptions[key] = data
|
||||
await _poll_once(bot, key, announce=True)
|
||||
data = _subscriptions.get(key, data)
|
||||
if data.get("last_error"):
|
||||
await bot.reply(message, f"{name}: error -- {data['last_error']}")
|
||||
errs = data.get("last_errors", {})
|
||||
if errs:
|
||||
tags = ", ".join(sorted(errs))
|
||||
await bot.reply(message, f"{name}: errors on {tags}")
|
||||
else:
|
||||
await bot.reply(message, f"{name}: checked")
|
||||
return
|
||||
@@ -2063,7 +2068,7 @@ async def cmd_alert(bot, message):
|
||||
"added_by": message.nick,
|
||||
"added_at": now,
|
||||
"last_poll": now,
|
||||
"last_error": "",
|
||||
"last_errors": {},
|
||||
"seen": {},
|
||||
}
|
||||
_save(bot, key, data)
|
||||
|
||||
Reference in New Issue
Block a user