diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py index 6cf2ec3..a892d3d 100644 --- a/nanobot/agent/loop.py +++ b/nanobot/agent/loop.py @@ -115,6 +115,7 @@ class AgentLoop: context_window_tokens=context_window_tokens, build_messages=self.context.build_messages, get_tool_definitions=self.tools.get_definitions, + max_completion_tokens=provider.generation.max_tokens, ) self._register_default_tools() diff --git a/nanobot/agent/memory.py b/nanobot/agent/memory.py index 5fdfa7a..aa2de92 100644 --- a/nanobot/agent/memory.py +++ b/nanobot/agent/memory.py @@ -224,6 +224,8 @@ class MemoryConsolidator: _MAX_CONSOLIDATION_ROUNDS = 5 + _SAFETY_BUFFER = 1024 # extra headroom for tokenizer estimation drift + def __init__( self, workspace: Path, @@ -233,12 +235,14 @@ class MemoryConsolidator: context_window_tokens: int, build_messages: Callable[..., list[dict[str, Any]]], get_tool_definitions: Callable[[], list[dict[str, Any]]], + max_completion_tokens: int = 4096, ): self.store = MemoryStore(workspace) self.provider = provider self.model = model self.sessions = sessions self.context_window_tokens = context_window_tokens + self.max_completion_tokens = max_completion_tokens self._build_messages = build_messages self._get_tool_definitions = get_tool_definitions self._locks: weakref.WeakValueDictionary[str, asyncio.Lock] = weakref.WeakValueDictionary() @@ -300,17 +304,22 @@ class MemoryConsolidator: return True async def maybe_consolidate_by_tokens(self, session: Session) -> None: - """Loop: archive old messages until prompt fits within half the context window.""" + """Loop: archive old messages until prompt fits within safe budget. + + The budget reserves space for completion tokens and a safety buffer + so the LLM request never exceeds the context window. 
+        """