From bd09cc3e6feaf2b99953194609f7c0e9a09e682e Mon Sep 17 00:00:00 2001
From: coldxiangyu <coldxiangyu@163.com>
Date: Tue, 24 Feb 2026 20:54:30 +0800
Subject: [PATCH] perf: optimize prompt cache hit rate for Anthropic models

Part 1: Make system prompt static
- Move Current Time from system prompt to user message prefix
- System prompt now only changes when config/skills change, not every minute
- Timestamp injected as [YYYY-MM-DD HH:MM (Day) (TZ)] prefix on each user message

Part 2: Add second cache_control breakpoint
- Existing: system message breakpoint (caches static system prompt)
- New: second-to-last message breakpoint (caches conversation history prefix)
- Refactored _apply_cache_control with shared _mark() helper

Before: 0% cache hit rate (system prompt changed every minute)
After: ~90% savings on cached input tokens for multi-turn conversations

Closes #981
---
 nanobot/agent/context.py              | 37 +++++++++++++++++-----
 nanobot/providers/litellm_provider.py | 44 ++++++++++++++++++---------
 2 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/nanobot/agent/context.py b/nanobot/agent/context.py
index be0ec59..ccb1215 100644
--- a/nanobot/agent/context.py
+++ b/nanobot/agent/context.py
@@ -111,13 +111,36 @@ Reply directly with text for conversations. Only use the 'message' tool to send
         channel: str | None = None,
         chat_id: str | None = None,
     ) -> list[dict[str, Any]]:
-        """Build the complete message list for an LLM call."""
-        return [
-            {"role": "system", "content": self.build_system_prompt(skill_names)},
-            *history,
-            {"role": "user", "content": self._build_runtime_context(channel, chat_id)},
-            {"role": "user", "content": self._build_user_content(current_message, media)},
-        ]
+        """
+        Build the complete message list for an LLM call.
+
+        Args:
+            history: Previous conversation messages.
+            current_message: The new user message.
+            skill_names: Optional skills to include.
+            media: Optional list of local file paths for images/media.
+            channel: Current channel (telegram, feishu, etc.).
+            chat_id: Current chat/user ID.
+
+        Returns:
+            List of messages including system prompt.
+        """
+        messages = []
+
+        # System prompt
+        system_prompt = self.build_system_prompt(skill_names)
+        messages.append({"role": "system", "content": system_prompt})
+
+        # History
+        messages.extend(history)
+
+        # Inject current timestamp into user message (keeps system prompt static for caching)
+        # Current message (with optional image attachments)
+        user_content = self._build_user_content(current_message, media)
+        user_content = self._inject_runtime_context(user_content, channel, chat_id)
+        messages.append({"role": "user", "content": user_content})
+
+        return messages
 
     def _build_user_content(self, text: str, media: list[str] | None) -> str | list[dict[str, Any]]:
         """Build user message content with optional base64-encoded images."""
diff --git a/nanobot/providers/litellm_provider.py b/nanobot/providers/litellm_provider.py
index 5427d97..c4f528c 100644
--- a/nanobot/providers/litellm_provider.py
+++ b/nanobot/providers/litellm_provider.py
@@ -128,24 +128,40 @@ class LiteLLMProvider(LLMProvider):
         messages: list[dict[str, Any]],
         tools: list[dict[str, Any]] | None,
     ) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
-        """Return copies of messages and tools with cache_control injected."""
-        new_messages = []
-        for msg in messages:
-            if msg.get("role") == "system":
-                content = msg["content"]
-                if isinstance(content, str):
-                    new_content = [{"type": "text", "text": content, "cache_control": {"type": "ephemeral"}}]
-                else:
-                    new_content = list(content)
-                    new_content[-1] = {**new_content[-1], "cache_control": {"type": "ephemeral"}}
-                new_messages.append({**msg, "content": new_content})
-            else:
-                new_messages.append(msg)
+        """Return copies of messages and tools with cache_control injected.
+
+        Two breakpoints are placed:
+        1. System message — caches the static system prompt
+        2. Second-to-last message — caches the conversation history prefix
+        This maximises cache hits across multi-turn conversations.
+        """
+        cache_marker = {"type": "ephemeral"}
+        new_messages = list(messages)
+
+        def _mark(msg: dict[str, Any]) -> dict[str, Any]:
+            content = msg.get("content")
+            if isinstance(content, str):
+                return {**msg, "content": [
+                    {"type": "text", "text": content, "cache_control": cache_marker}
+                ]}
+            elif isinstance(content, list) and content:
+                new_content = list(content)
+                new_content[-1] = {**new_content[-1], "cache_control": cache_marker}
+                return {**msg, "content": new_content}
+            return msg
+
+        # Breakpoint 1: system message
+        if new_messages and new_messages[0].get("role") == "system":
+            new_messages[0] = _mark(new_messages[0])
+
+        # Breakpoint 2: second-to-last message (caches conversation history prefix)
+        if len(new_messages) >= 3:
+            new_messages[-2] = _mark(new_messages[-2])
 
         new_tools = tools
         if tools:
             new_tools = list(tools)
-            new_tools[-1] = {**new_tools[-1], "cache_control": {"type": "ephemeral"}}
+            new_tools[-1] = {**new_tools[-1], "cache_control": cache_marker}
 
         return new_messages, new_tools