From d0c647918616f4d5f133f5bf07032d477de3c8f0 Mon Sep 17 00:00:00 2001
From: Kiplangatkorir
Date: Wed, 4 Mar 2026 11:20:50 +0300
Subject: [PATCH] feat: add LLM retry with exponential backoff for transient errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

provider.chat() had no retry logic — a transient 429 rate limit, 502
gateway error, or network timeout would permanently fail the entire
message. For a system running cron jobs and heartbeats 24/7, even a
brief provider blip causes lost tasks.

Adds _chat_with_retry() that:
- Retries up to 3 times with 1s/2s/4s exponential backoff
- Only retries transient errors (429, 5xx, timeout, connection)
- Returns immediately on permanent errors (400, 401, etc.)
- Falls through to the final attempt if all retries exhaust
---
Review fix-ups folded into this patch:
- Status codes are matched as whole numbers, so digits inside a larger
  number (e.g. "15000 tokens" in a 400 error) no longer trigger a retry.
- When every retry fails, the final attempt's response is returned
  instead of the error from an earlier attempt.
- The warning counts attempts out of the real total (delays + 1).
- The post-image blob id on the "index" line predates these fix-ups.

 nanobot/agent/loop.py | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index 65a62e5..9819a38 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -159,6 +159,50 @@ class AgentLoop:
                 if hasattr(tool, "set_context"):
                     tool.set_context(channel, chat_id, *([message_id] if name == "message" else []))
 
+    _RETRY_DELAYS = (1, 2, 4)  # seconds — exponential backoff for transient LLM errors
+
+    @staticmethod
+    def _is_transient_error(err: str) -> bool:
+        """Return True if a lowercased provider error message looks retryable."""
+        import re
+
+        # Match status codes as whole numbers: a permanent 400 such as
+        # "maximum context length is 15000 tokens" must not count as a 500.
+        if re.search(r"(?<!\d)(?:429|500|502|503|504)(?!\d)", err):
+            return True
+        return any(kw in err for kw in (
+            "rate limit", "overloaded", "timeout", "connection", "server error",
+        ))
+
+    async def _chat_with_retry(self, **kwargs: Any) -> Any:
+        """Call provider.chat(), retrying transient errors (429, 5xx, network).
+
+        A response whose finish_reason is "error" is retried only when its
+        content looks transient; any other response is returned immediately.
+        Up to len(_RETRY_DELAYS) + 1 calls are made, sleeping for the matching
+        delay between them, and the last call's response is returned as-is.
+
+        Args:
+            **kwargs: Forwarded unchanged to self.provider.chat().
+
+        Returns:
+            Whatever provider.chat() returned for the deciding attempt.
+        """
+        total = len(self._RETRY_DELAYS) + 1
+        for attempt, delay in enumerate(self._RETRY_DELAYS, start=1):
+            response = await self.provider.chat(**kwargs)
+            if response.finish_reason != "error":
+                return response
+            err = (response.content or "").lower()
+            if not self._is_transient_error(err):
+                return response  # permanent error (400, 401, etc.) — don't retry
+            logger.warning("LLM transient error (attempt {}/{}), retrying in {}s: {}",
+                           attempt, total, delay, err[:120])
+            await asyncio.sleep(delay)
+        # All retries exhausted — make one final attempt and report its outcome,
+        # so callers see the most recent error rather than a stale earlier one.
+        return await self.provider.chat(**kwargs)
+
     @staticmethod
     def _strip_think(text: str | None) -> str | None:
         """Remove blocks that some models embed in content."""
@@ -191,7 +235,7 @@ class AgentLoop:
         while iteration < self.max_iterations:
             iteration += 1
 
-            response = await self.provider.chat(
+            response = await self._chat_with_retry(
                 messages=messages,
                 tools=self.tools.get_definitions(),
                 model=self.model,