feat: add LLM retry with exponential backoff for transient errors

provider.chat() had no retry logic — a transient 429 rate limit,
502 gateway error, or network timeout would permanently fail the
entire message. For a system running cron jobs and heartbeats 24/7,
even a brief provider blip causes lost tasks.

Adds _chat_with_retry() that:
- Retries up to 3 times with 1s/2s/4s exponential backoff
- Only retries transient errors (429, 5xx, timeout, connection)
- Returns immediately on permanent errors (400, 401, etc.)
- Falls through to the final attempt if all retries are exhausted
This commit is contained in:
Kiplangatkorir
2026-03-04 11:20:50 +03:00
parent 1303cc6669
commit d0c6479186

View File

@@ -159,6 +159,33 @@ class AgentLoop:
if hasattr(tool, "set_context"): if hasattr(tool, "set_context"):
tool.set_context(channel, chat_id, *([message_id] if name == "message" else [])) tool.set_context(channel, chat_id, *([message_id] if name == "message" else []))
# Backoff schedule for transient LLM errors. One final attempt follows the
# last delay, so up to len(_RETRY_DELAYS) + 1 provider calls are made.
_RETRY_DELAYS = (1, 2, 4)  # seconds — exponential backoff

# Lowercased substrings that mark a provider error message as transient
# (rate limit, server-side failure, network problem) and worth retrying.
_TRANSIENT_MARKERS = (
    "429", "rate limit", "500", "502", "503", "504",
    "overloaded", "timeout", "connection", "server error",
)

async def _chat_with_retry(self, **kwargs: Any) -> Any:
    """Call ``provider.chat()`` with retry on transient errors.

    Only errors whose message looks transient (429/rate limit, 5xx,
    timeout, connection) are retried, with exponential backoff taken from
    ``_RETRY_DELAYS``. Permanent errors (400, 401, …) are returned
    immediately. If every attempt fails, the *most recent* response is
    returned so callers see the latest provider error, not a stale one.

    NOTE(review): exceptions *raised* by ``provider.chat()`` are not
    caught here — presumably the provider converts network failures into
    a ``finish_reason == "error"`` response; confirm upstream.

    Args:
        **kwargs: Forwarded verbatim to ``self.provider.chat()``.

    Returns:
        The provider response object (same type ``provider.chat`` returns).
    """
    total_attempts = len(self._RETRY_DELAYS) + 1  # initial call + retries
    response = None
    for attempt in range(total_attempts):
        response = await self.provider.chat(**kwargs)
        if response.finish_reason != "error":
            return response
        err = (response.content or "").lower()
        # Permanent errors (bad request, bad auth, …) won't improve on retry.
        if not any(marker in err for marker in self._TRANSIENT_MARKERS):
            return response
        if attempt + 1 < total_attempts:
            delay = self._RETRY_DELAYS[attempt]
            logger.warning(
                "LLM transient error (attempt {}/{}), retrying in {}s: {}",
                attempt + 1, total_attempts, delay, err[:120],
            )
            await asyncio.sleep(delay)
    # All attempts errored transiently. Return the latest response (bug fix:
    # the previous version returned an earlier, stale error response here
    # via a dead `last_response or response` expression).
    return response
@staticmethod @staticmethod
def _strip_think(text: str | None) -> str | None: def _strip_think(text: str | None) -> str | None:
"""Remove <think>…</think> blocks that some models embed in content.""" """Remove <think>…</think> blocks that some models embed in content."""
@@ -191,7 +218,7 @@ class AgentLoop:
while iteration < self.max_iterations: while iteration < self.max_iterations:
iteration += 1 iteration += 1
response = await self.provider.chat( response = await self._chat_with_retry(
messages=messages, messages=messages,
tools=self.tools.get_definitions(), tools=self.tools.get_definitions(),
model=self.model, model=self.model,