From 8f5c2d1a062dc85eb9d5521167df7b642fbb9bc3 Mon Sep 17 00:00:00 2001
From: Xubin Ren <xubinrencs@gmail.com>
Date: Mon, 23 Mar 2026 03:27:13 +0000
Subject: [PATCH 1/2] fix(cli): stop spinner after non-streaming interactive
 replies

---
 nanobot/cli/commands.py | 5 +++++
 nanobot/cli/stream.py   | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py
index db348ed..d0ec145 100644
--- a/nanobot/cli/commands.py
+++ b/nanobot/cli/commands.py
@@ -752,6 +752,7 @@ def agent(
                 on_stream_end=renderer.on_end,
             )
             if not renderer.streamed:
+                await renderer.close()
                 _print_agent_response(
                     response.content if response else "",
                     render_markdown=markdown,
@@ -873,9 +874,13 @@ def agent(
                         if turn_response:
                             content, meta = turn_response[0]
                             if content and not meta.get("_streamed"):
+                                if renderer:
+                                    await renderer.close()
                                 _print_agent_response(
                                     content, render_markdown=markdown, metadata=meta,
                                 )
+                        elif renderer and not renderer.streamed:
+                            await renderer.close()
                     except KeyboardInterrupt:
                         _restore_terminal()
                         console.print("\nGoodbye!")
diff --git a/nanobot/cli/stream.py b/nanobot/cli/stream.py
index 161d530..16586ec 100644
--- a/nanobot/cli/stream.py
+++ b/nanobot/cli/stream.py
@@ -119,3 +119,10 @@ class StreamRenderer:
             self._start_spinner()
         else:
             _make_console().print()
+
+    async def close(self) -> None:
+        """Stop spinner/live without rendering a final streamed round."""
+        if self._live:
+            self._live.stop()
+            self._live = None
+        self._stop_spinner()

From aba0b83a77eed0c2ba7536b4c7df35c6a4f8d8d9 Mon Sep 17 00:00:00 2001
From: Xubin Ren <xubinrencs@gmail.com>
Date: Mon, 23 Mar 2026 03:48:12 +0000
Subject: [PATCH 2/2] fix(memory): reserve completion headroom for
 consolidation

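Trigger token consolidation before the estimated prompt size reaches the
full context window. The trigger threshold is now the context window
minus the provider's max completion tokens minus a 1024-token safety
buffer, and consolidation targets half of that budget, so response
tokens and tokenizer estimation drift still fit safely within the
model's context window.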

Made-with: Cursor
---
 nanobot/agent/loop.py                   |  1 +
 nanobot/agent/memory.py                 | 15 ++++++++++++---
 tests/test_loop_consolidation_tokens.py |  3 +++
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index 6cf2ec3..a892d3d 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -115,6 +115,7 @@ class AgentLoop:
             context_window_tokens=context_window_tokens,
             build_messages=self.context.build_messages,
             get_tool_definitions=self.tools.get_definitions,
+            max_completion_tokens=provider.generation.max_tokens,
         )
         self._register_default_tools()
 
diff --git a/nanobot/agent/memory.py b/nanobot/agent/memory.py
index 5fdfa7a..aa2de92 100644
--- a/nanobot/agent/memory.py
+++ b/nanobot/agent/memory.py
@@ -224,6 +224,8 @@ class MemoryConsolidator:
 
     _MAX_CONSOLIDATION_ROUNDS = 5
 
+    _SAFETY_BUFFER = 1024  # extra headroom for tokenizer estimation drift
+
     def __init__(
         self,
         workspace: Path,
@@ -233,12 +235,14 @@ class MemoryConsolidator:
         context_window_tokens: int,
         build_messages: Callable[..., list[dict[str, Any]]],
         get_tool_definitions: Callable[[], list[dict[str, Any]]],
+        max_completion_tokens: int = 4096,
     ):
         self.store = MemoryStore(workspace)
         self.provider = provider
         self.model = model
         self.sessions = sessions
         self.context_window_tokens = context_window_tokens
+        self.max_completion_tokens = max_completion_tokens
         self._build_messages = build_messages
         self._get_tool_definitions = get_tool_definitions
         self._locks: weakref.WeakValueDictionary[str, asyncio.Lock] = weakref.WeakValueDictionary()
@@ -300,17 +304,22 @@ class MemoryConsolidator:
         return True
 
     async def maybe_consolidate_by_tokens(self, session: Session) -> None:
-        """Loop: archive old messages until prompt fits within half the context window."""
+        """Loop: archive old messages until prompt fits within half the safe budget.
+
+        The budget reserves space for completion tokens and a safety buffer
+        so the LLM request never exceeds the context window.
+        """
         if not session.messages or self.context_window_tokens <= 0:
             return
 
         lock = self.get_lock(session.key)
         async with lock:
-            target = self.context_window_tokens // 2
+            budget = self.context_window_tokens - self.max_completion_tokens - self._SAFETY_BUFFER
+            target = budget // 2
             estimated, source = self.estimate_session_prompt_tokens(session)
             if estimated <= 0:
                 return
-            if estimated < self.context_window_tokens:
+            if estimated < budget:
                 logger.debug(
                     "Token consolidation idle {}: {}/{} via {}",
                     session.key,
diff --git a/tests/test_loop_consolidation_tokens.py b/tests/test_loop_consolidation_tokens.py
index 87d8d29..2f9c2de 100644
--- a/tests/test_loop_consolidation_tokens.py
+++ b/tests/test_loop_consolidation_tokens.py
@@ -9,8 +9,10 @@ from nanobot.providers.base import LLMResponse
 
 
 def _make_loop(tmp_path, *, estimated_tokens: int, context_window_tokens: int) -> AgentLoop:
+    from nanobot.providers.base import GenerationSettings
     provider = MagicMock()
     provider.get_default_model.return_value = "test-model"
+    provider.generation = GenerationSettings(max_tokens=0)
     provider.estimate_prompt_tokens.return_value = (estimated_tokens, "test-counter")
     _response = LLMResponse(content="ok", tool_calls=[])
     provider.chat_with_retry = AsyncMock(return_value=_response)
@@ -24,6 +26,7 @@ def _make_loop(tmp_path, *, estimated_tokens: int, context_window_tokens: int) -
         context_window_tokens=context_window_tokens,
     )
     loop.tools.get_definitions = MagicMock(return_value=[])
+    loop.memory_consolidator._SAFETY_BUFFER = 0
     return loop