diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py
index 9e34729..c674513 100644
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -155,6 +155,7 @@ class AgentLoop:
             context_window_tokens=context_window_tokens,
             build_messages=self.context.build_messages,
             get_tool_definitions=self.tools.get_definitions,
+            max_completion_tokens=provider.generation.max_tokens,
         )
         self._register_default_tools()
 
diff --git a/nanobot/agent/memory.py b/nanobot/agent/memory.py
index ae003a0..0e051b4 100644
--- a/nanobot/agent/memory.py
+++ b/nanobot/agent/memory.py
@@ -228,6 +228,8 @@ class MemoryConsolidator:
 
     _MAX_CONSOLIDATION_ROUNDS = 5
 
+    _SAFETY_BUFFER = 1024  # extra headroom for tokenizer estimation drift
+
     def __init__(
         self,
         workspace: Path,
@@ -237,12 +239,14 @@ class MemoryConsolidator:
         context_window_tokens: int,
         build_messages: Callable[..., list[dict[str, Any]]],
         get_tool_definitions: Callable[[], list[dict[str, Any]]],
+        max_completion_tokens: int = 4096,
     ):
         self.workspace = workspace
         self.provider = provider
         self.model = model
         self.sessions = sessions
         self.context_window_tokens = context_window_tokens
+        self.max_completion_tokens = max_completion_tokens
         self._build_messages = build_messages
         self._get_tool_definitions = get_tool_definitions
         self._locks: weakref.WeakValueDictionary[str, asyncio.Lock] = weakref.WeakValueDictionary()
@@ -356,17 +360,22 @@ class MemoryConsolidator:
             return await self._archive_messages_locked(session, snapshot)
 
     async def maybe_consolidate_by_tokens(self, session: Session) -> None:
-        """Loop: archive old messages until prompt fits within half the context window."""
+        """Loop: archive old messages until prompt fits within safe budget.
+
+        The budget reserves space for completion tokens and a safety buffer
+        so the LLM request never exceeds the context window.
+        """
         if not session.messages or self.context_window_tokens <= 0:
             return
 
         lock = self.get_lock(session.key)
         async with lock:
-            target = self.context_window_tokens // 2
+            budget = self.context_window_tokens - self.max_completion_tokens - self._SAFETY_BUFFER
+            target = budget // 2
             estimated, source = self.estimate_session_prompt_tokens(session)
             if estimated <= 0:
                 return
-            if estimated < self.context_window_tokens:
+            if estimated < budget:
                 logger.debug(
                     "Token consolidation idle {}: {}/{} via {}",
                     session.key,
diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py
index 4b62130..b51b61d 100644
--- a/nanobot/cli/commands.py
+++ b/nanobot/cli/commands.py
@@ -785,6 +785,7 @@ def agent(
                 on_stream_end=renderer.on_end,
             )
             if not renderer.streamed:
+                await renderer.close()
                 _print_agent_response(
                     response.content if response else "",
                     render_markdown=markdown,
@@ -906,9 +907,13 @@ def agent(
                         if turn_response:
                             content, meta = turn_response[0]
                             if content and not meta.get("_streamed"):
+                                if renderer:
+                                    await renderer.close()
                                 _print_agent_response(
                                     content, render_markdown=markdown, metadata=meta,
                                 )
+                        elif renderer and not renderer.streamed:
+                            await renderer.close()
                     except KeyboardInterrupt:
                         _restore_terminal()
                         console.print("\nGoodbye!")
diff --git a/nanobot/cli/stream.py b/nanobot/cli/stream.py
index 161d530..16586ec 100644
--- a/nanobot/cli/stream.py
+++ b/nanobot/cli/stream.py
@@ -119,3 +119,10 @@ class StreamRenderer:
             self._start_spinner()
         else:
             _make_console().print()
+
+    async def close(self) -> None:
+        """Stop the live display (if any) and the spinner without rendering a final streamed round."""
+        if self._live:
+            self._live.stop()
+            self._live = None  # drop the reference so a repeated close() skips the stop above
+        self._stop_spinner()
diff --git a/tests/test_loop_consolidation_tokens.py b/tests/test_loop_consolidation_tokens.py
index 5151972..d0bc942 100644
--- a/tests/test_loop_consolidation_tokens.py
+++ b/tests/test_loop_consolidation_tokens.py
@@ -10,8 +10,10 @@ from nanobot.providers.base import LLMResponse
 
 
 def _make_loop(tmp_path, *, estimated_tokens: int, context_window_tokens: int) -> AgentLoop:
+    from nanobot.providers.base import GenerationSettings
     provider = MagicMock()
     provider.get_default_model.return_value = "test-model"
+    provider.generation = GenerationSettings(max_tokens=0)
     provider.estimate_prompt_tokens.return_value = (estimated_tokens, "test-counter")
     _response = LLMResponse(content="ok", tool_calls=[])
     provider.chat_with_retry = AsyncMock(return_value=_response)
@@ -25,6 +27,7 @@ def _make_loop(tmp_path, *, estimated_tokens: int, context_window_tokens: int) -
         context_window_tokens=context_window_tokens,
     )
     loop.tools.get_definitions = MagicMock(return_value=[])
+    loop.memory_consolidator._SAFETY_BUFFER = 0
     return loop