From d0c647918616f4d5f133f5bf07032d477de3c8f0 Mon Sep 17 00:00:00 2001
From: Kiplangatkorir <korirkiplangat22@gmail.com>
Date: Wed, 4 Mar 2026 11:20:50 +0300
Subject: [PATCH] feat: add LLM retry with exponential backoff for transient
 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

provider.chat() had no retry logic — a transient 429 rate limit,
502 gateway error, or network timeout would permanently fail the
entire message. For a system running cron jobs and heartbeats 24/7,
even a brief provider blip causes lost tasks.

Adds _chat_with_retry() that:
- Retries up to 3 times with 1s/2s/4s exponential backoff
- Only retries transient errors (429, 5xx, timeout, connection)
- Returns immediately on permanent errors (400, 401, etc.)
- Makes one last attempt after the final backoff delay if every earlier attempt failed
---
 nanobot/agent/loop.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index 65a62e5..9819a38 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -159,6 +159,33 @@ class AgentLoop:
                 if hasattr(tool, "set_context"):
                     tool.set_context(channel, chat_id, *([message_id] if name == "message" else []))
 
+    _RETRY_DELAYS = (1, 2, 4)  # seconds — exponential backoff for transient LLM errors
+
+    async def _chat_with_retry(self, **kwargs: Any) -> Any:
+        """Call ``provider.chat(**kwargs)``, retrying transient failures.
+
+        One initial attempt plus one retry per ``_RETRY_DELAYS`` entry. A reply
+        whose ``finish_reason`` is not ``"error"``, or whose error text matches
+        no transient keyword (429, 5xx, timeout, ...), is returned at once.
+        When every attempt fails, the most recent error response is returned.
+        """
+        # Trailing None marks the final attempt, which has no backoff after it.
+        schedule = (*self._RETRY_DELAYS, None)
+        for attempt, delay in enumerate(schedule, start=1):
+            response = await self.provider.chat(**kwargs)
+            if response.finish_reason != "error":
+                return response
+            err = (response.content or "").lower()
+            is_transient = any(kw in err for kw in (
+                "429", "rate limit", "500", "502", "503", "504",
+                "overloaded", "timeout", "connection", "server error",
+            ))
+            if delay is None or not is_transient:
+                return response  # permanent error or retries exhausted
+            logger.warning("LLM transient error (attempt {}/{}), retrying in {}s: {}",
+                           attempt, len(schedule), delay, err[:120])
+            await asyncio.sleep(delay)
+
     @staticmethod
     def _strip_think(text: str | None) -> str | None:
         """Remove <think>…</think> blocks that some models embed in content."""
@@ -191,7 +218,7 @@ class AgentLoop:
         while iteration < self.max_iterations:
             iteration += 1
 
-            response = await self.provider.chat(
+            response = await self._chat_with_retry(
                 messages=messages,
                 tools=self.tools.get_definitions(),
                 model=self.model,