From 8f5c2d1a062dc85eb9d5521167df7b642fbb9bc3 Mon Sep 17 00:00:00 2001 From: Xubin Ren Date: Mon, 23 Mar 2026 03:27:13 +0000 Subject: [PATCH 1/2] fix(cli): stop spinner after non-streaming interactive replies --- nanobot/cli/commands.py | 5 +++++ nanobot/cli/stream.py | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py index db348ed..d0ec145 100644 --- a/nanobot/cli/commands.py +++ b/nanobot/cli/commands.py @@ -752,6 +752,7 @@ def agent( on_stream_end=renderer.on_end, ) if not renderer.streamed: + await renderer.close() _print_agent_response( response.content if response else "", render_markdown=markdown, @@ -873,9 +874,13 @@ def agent( if turn_response: content, meta = turn_response[0] if content and not meta.get("_streamed"): + if renderer: + await renderer.close() _print_agent_response( content, render_markdown=markdown, metadata=meta, ) + elif renderer and not renderer.streamed: + await renderer.close() except KeyboardInterrupt: _restore_terminal() console.print("\nGoodbye!") diff --git a/nanobot/cli/stream.py b/nanobot/cli/stream.py index 161d530..16586ec 100644 --- a/nanobot/cli/stream.py +++ b/nanobot/cli/stream.py @@ -119,3 +119,10 @@ class StreamRenderer: self._start_spinner() else: _make_console().print() + + async def close(self) -> None: + """Stop spinner/live without rendering a final streamed round.""" + if self._live: + self._live.stop() + self._live = None + self._stop_spinner() From aba0b83a77eed0c2ba7536b4c7df35c6a4f8d8d9 Mon Sep 17 00:00:00 2001 From: Xubin Ren Date: Mon, 23 Mar 2026 03:48:12 +0000 Subject: [PATCH 2/2] fix(memory): reserve completion headroom for consolidation Trigger token consolidation before prompt usage reaches the full context window so response tokens and tokenizer estimation drift still fit safely within the model budget. Made-with: Cursor --- nanobot/agent/loop.py | 1 + nanobot/agent/memory.py | 15 ++++++++++++--- tests/test_loop_consolidation_tokens.py | 3 +++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py index 6cf2ec3..a892d3d 100644 --- a/nanobot/agent/loop.py +++ b/nanobot/agent/loop.py @@ -115,6 +115,7 @@ class AgentLoop: context_window_tokens=context_window_tokens, build_messages=self.context.build_messages, get_tool_definitions=self.tools.get_definitions, + max_completion_tokens=provider.generation.max_tokens, ) self._register_default_tools() diff --git a/nanobot/agent/memory.py b/nanobot/agent/memory.py index 5fdfa7a..aa2de92 100644 --- a/nanobot/agent/memory.py +++ b/nanobot/agent/memory.py @@ -224,6 +224,8 @@ class MemoryConsolidator: _MAX_CONSOLIDATION_ROUNDS = 5 + _SAFETY_BUFFER = 1024 # extra headroom for tokenizer estimation drift + def __init__( self, workspace: Path, @@ -233,12 +235,14 @@ class MemoryConsolidator: context_window_tokens: int, build_messages: Callable[..., list[dict[str, Any]]], get_tool_definitions: Callable[[], list[dict[str, Any]]], + max_completion_tokens: int = 4096, ): self.store = MemoryStore(workspace) self.provider = provider self.model = model self.sessions = sessions self.context_window_tokens = context_window_tokens + self.max_completion_tokens = max_completion_tokens self._build_messages = build_messages self._get_tool_definitions = get_tool_definitions self._locks: weakref.WeakValueDictionary[str, asyncio.Lock] = weakref.WeakValueDictionary() @@ -300,17 +304,22 @@ class MemoryConsolidator: return True async def maybe_consolidate_by_tokens(self, session: Session) -> None: - """Loop: archive old messages until prompt fits within half the context window.""" + """Loop: archive old messages until prompt fits within safe budget. + + The budget reserves space for completion tokens and a safety buffer + so the LLM request never exceeds the context window. + """ if not session.messages or self.context_window_tokens <= 0: return lock = self.get_lock(session.key) async with lock: - target = self.context_window_tokens // 2 + budget = self.context_window_tokens - self.max_completion_tokens - self._SAFETY_BUFFER + target = budget // 2 estimated, source = self.estimate_session_prompt_tokens(session) if estimated <= 0: return - if estimated < self.context_window_tokens: + if estimated < budget: logger.debug( "Token consolidation idle {}: {}/{} via {}", session.key, diff --git a/tests/test_loop_consolidation_tokens.py b/tests/test_loop_consolidation_tokens.py index 87d8d29..2f9c2de 100644 --- a/tests/test_loop_consolidation_tokens.py +++ b/tests/test_loop_consolidation_tokens.py @@ -9,8 +9,10 @@ from nanobot.providers.base import LLMResponse def _make_loop(tmp_path, *, estimated_tokens: int, context_window_tokens: int) -> AgentLoop: + from nanobot.providers.base import GenerationSettings provider = MagicMock() provider.get_default_model.return_value = "test-model" + provider.generation = GenerationSettings(max_tokens=0) provider.estimate_prompt_tokens.return_value = (estimated_tokens, "test-counter") _response = LLMResponse(content="ok", tool_calls=[]) provider.chat_with_retry = AsyncMock(return_value=_response) @@ -24,6 +26,7 @@ def _make_loop(tmp_path, *, estimated_tokens: int, context_window_tokens: int) - context_window_tokens=context_window_tokens, ) loop.tools.get_definitions = MagicMock(return_value=[]) + loop.memory_consolidator._SAFETY_BUFFER = 0 return loop