feat: add LLM retry with exponential backoff for transient errors
provider.chat() had no retry logic — a transient 429 rate limit, 502 gateway error, or network timeout would permanently fail the entire message. For a system running cron jobs and heartbeats 24/7, even a brief provider blip causes lost tasks. Adds _chat_with_retry() that:
- Retries up to 3 times with 1s/2s/4s exponential backoff
- Only retries transient errors (429, 5xx, timeout, connection)
- Returns immediately on permanent errors (400, 401, etc.)
- Falls through to the final attempt if all retries exhaust
This commit is contained in:
@@ -159,6 +159,33 @@ class AgentLoop:
|
|||||||
if hasattr(tool, "set_context"):
|
if hasattr(tool, "set_context"):
|
||||||
tool.set_context(channel, chat_id, *([message_id] if name == "message" else []))
|
tool.set_context(channel, chat_id, *([message_id] if name == "message" else []))
|
||||||
|
|
||||||
|
_RETRY_DELAYS = (1, 2, 4)  # seconds — exponential backoff for transient LLM errors

async def _chat_with_retry(self, **kwargs: Any) -> Any:
    """Call provider.chat() with retry on transient errors (429, 5xx, network)."""
    from nanobot.providers.base import LLMResponse

    # Substrings in an error payload that suggest the failure may clear on retry.
    transient_markers = (
        "429", "rate limit", "500", "502", "503", "504",
        "overloaded", "timeout", "connection", "server error",
    )
    last_response: LLMResponse | None = None

    for attempt, delay in enumerate(self._RETRY_DELAYS):
        result = await self.provider.chat(**kwargs)
        if result.finish_reason != "error":
            return result
        # Inspect the error text to decide whether waiting could help.
        detail = (result.content or "").lower()
        if not any(marker in detail for marker in transient_markers):
            # Permanent failure (400, 401, etc.) — retrying won't change the outcome.
            return result
        last_response = result
        logger.warning("LLM transient error (attempt {}/{}), retrying in {}s: {}",
                       attempt + 1, len(self._RETRY_DELAYS), delay, detail[:120])
        await asyncio.sleep(delay)

    # Backoff budget spent — one final call; fall back to the last error seen.
    final = await self.provider.chat(**kwargs)
    return final if final.finish_reason != "error" else (last_response or final)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _strip_think(text: str | None) -> str | None:
|
def _strip_think(text: str | None) -> str | None:
|
||||||
"""Remove <think>…</think> blocks that some models embed in content."""
|
"""Remove <think>…</think> blocks that some models embed in content."""
|
||||||
@@ -191,7 +218,7 @@ class AgentLoop:
|
|||||||
while iteration < self.max_iterations:
|
while iteration < self.max_iterations:
|
||||||
iteration += 1
|
iteration += 1
|
||||||
|
|
||||||
response = await self.provider.chat(
|
response = await self._chat_with_retry(
|
||||||
messages=messages,
|
messages=messages,
|
||||||
tools=self.tools.get_definitions(),
|
tools=self.tools.get_definitions(),
|
||||||
model=self.model,
|
model=self.model,
|
||||||
|
|||||||
Reference in New Issue
Block a user