fix: track alert backend errors independently
Per-backend error counts with exponential backoff: once a backend has n >= 5 consecutive failures it is polled only once every 2^(n-5) cycles (capped at 32) and skipped on the cycles in between, so actual skipping starts with the 6th consecutive failure. Working backends are no longer penalized by one flaky backend doubling the entire poll interval. Migrates last_error (string) to last_errors (dict per backend). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -80,7 +80,8 @@ _HUGGINGFACE_API = "https://huggingface.co/api/models"
|
|||||||
|
|
||||||
_pollers: dict[str, asyncio.Task] = {}
|
_pollers: dict[str, asyncio.Task] = {}
|
||||||
_subscriptions: dict[str, dict] = {}
|
_subscriptions: dict[str, dict] = {}
|
||||||
_errors: dict[str, int] = {}
|
_errors: dict[str, dict[str, int]] = {}
|
||||||
|
_poll_count: dict[str, int] = {}
|
||||||
|
|
||||||
# -- History database --------------------------------------------------------
|
# -- History database --------------------------------------------------------
|
||||||
|
|
||||||
@@ -1711,17 +1712,27 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
|
|||||||
now = datetime.now(timezone.utc).isoformat()
|
now = datetime.now(timezone.utc).isoformat()
|
||||||
data["last_poll"] = now
|
data["last_poll"] = now
|
||||||
|
|
||||||
had_error = False
|
cycle = _poll_count[key] = _poll_count.get(key, 0) + 1
|
||||||
|
tag_errors = _errors.setdefault(key, {})
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
for tag, backend in _BACKENDS.items():
|
for tag, backend in _BACKENDS.items():
|
||||||
|
errs = tag_errors.get(tag, 0)
|
||||||
|
if errs >= 5:
|
||||||
|
skip = min(2 ** (errs - 5), 32)
|
||||||
|
if cycle % skip != 0:
|
||||||
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
items = await loop.run_in_executor(None, backend, keyword)
|
items = await loop.run_in_executor(None, backend, keyword)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
data["last_error"] = f"{tag}: {exc}"
|
tag_errors[tag] = errs + 1
|
||||||
had_error = True
|
data.setdefault("last_errors", {})[tag] = str(exc)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
tag_errors[tag] = 0
|
||||||
|
data.setdefault("last_errors", {}).pop(tag, None)
|
||||||
|
|
||||||
seen_set = set(data.get("seen", {}).get(tag, []))
|
seen_set = set(data.get("seen", {}).get(tag, []))
|
||||||
seen_list = list(data.get("seen", {}).get(tag, []))
|
seen_list = list(data.get("seen", {}).get(tag, []))
|
||||||
new_items = [item for item in items if item["id"] not in seen_set]
|
new_items = [item for item in items if item["id"] not in seen_set]
|
||||||
@@ -1795,12 +1806,6 @@ async def _poll_once(bot, key: str, announce: bool = True) -> None:
|
|||||||
seen_list = seen_list[-_MAX_SEEN:]
|
seen_list = seen_list[-_MAX_SEEN:]
|
||||||
data.setdefault("seen", {})[tag] = seen_list
|
data.setdefault("seen", {})[tag] = seen_list
|
||||||
|
|
||||||
if had_error:
|
|
||||||
_errors[key] = _errors.get(key, 0) + 1
|
|
||||||
else:
|
|
||||||
data["last_error"] = ""
|
|
||||||
_errors[key] = 0
|
|
||||||
|
|
||||||
_subscriptions[key] = data
|
_subscriptions[key] = data
|
||||||
_save(bot, key, data)
|
_save(bot, key, data)
|
||||||
|
|
||||||
@@ -1813,9 +1818,6 @@ async def _poll_loop(bot, key: str) -> None:
|
|||||||
if data is None:
|
if data is None:
|
||||||
return
|
return
|
||||||
interval = data.get("interval", _DEFAULT_INTERVAL)
|
interval = data.get("interval", _DEFAULT_INTERVAL)
|
||||||
errs = _errors.get(key, 0)
|
|
||||||
if errs >= 5:
|
|
||||||
interval = min(interval * 2, _MAX_INTERVAL)
|
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
await _poll_once(bot, key, announce=True)
|
await _poll_once(bot, key, announce=True)
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
@@ -1837,7 +1839,8 @@ def _stop_poller(key: str) -> None:
|
|||||||
if task and not task.done():
|
if task and not task.done():
|
||||||
task.cancel()
|
task.cancel()
|
||||||
_subscriptions.pop(key, None)
|
_subscriptions.pop(key, None)
|
||||||
_errors.pop(key, 0)
|
_errors.pop(key, None)
|
||||||
|
_poll_count.pop(key, None)
|
||||||
|
|
||||||
|
|
||||||
# -- Restore on connect -----------------------------------------------------
|
# -- Restore on connect -----------------------------------------------------
|
||||||
@@ -1895,9 +1898,9 @@ async def cmd_alert(bot, message):
|
|||||||
data = _load(bot, key)
|
data = _load(bot, key)
|
||||||
if data:
|
if data:
|
||||||
name = data["name"]
|
name = data["name"]
|
||||||
err = data.get("last_error", "")
|
errs = data.get("last_errors", {})
|
||||||
if err:
|
if errs:
|
||||||
subs.append(f"{name} (error)")
|
subs.append(f"{name} ({len(errs)} backend errors)")
|
||||||
else:
|
else:
|
||||||
subs.append(name)
|
subs.append(name)
|
||||||
if not subs:
|
if not subs:
|
||||||
@@ -1924,8 +1927,10 @@ async def cmd_alert(bot, message):
|
|||||||
_subscriptions[key] = data
|
_subscriptions[key] = data
|
||||||
await _poll_once(bot, key, announce=True)
|
await _poll_once(bot, key, announce=True)
|
||||||
data = _subscriptions.get(key, data)
|
data = _subscriptions.get(key, data)
|
||||||
if data.get("last_error"):
|
errs = data.get("last_errors", {})
|
||||||
await bot.reply(message, f"{name}: error -- {data['last_error']}")
|
if errs:
|
||||||
|
tags = ", ".join(sorted(errs))
|
||||||
|
await bot.reply(message, f"{name}: errors on {tags}")
|
||||||
else:
|
else:
|
||||||
await bot.reply(message, f"{name}: checked")
|
await bot.reply(message, f"{name}: checked")
|
||||||
return
|
return
|
||||||
@@ -2063,7 +2068,7 @@ async def cmd_alert(bot, message):
|
|||||||
"added_by": message.nick,
|
"added_by": message.nick,
|
||||||
"added_at": now,
|
"added_at": now,
|
||||||
"last_poll": now,
|
"last_poll": now,
|
||||||
"last_error": "",
|
"last_errors": {},
|
||||||
"seen": {},
|
"seen": {},
|
||||||
}
|
}
|
||||||
_save(bot, key, data)
|
_save(bot, key, data)
|
||||||
|
|||||||
Reference in New Issue
Block a user