perf: optimize prompt cache hit rate for Anthropic models
Part 1: Make the system prompt static
- Move "Current Time" from the system prompt to a user-message prefix
- The system prompt now only changes when config/skills change, not every minute
- The timestamp is injected as a [YYYY-MM-DD HH:MM (Day) (TZ)] prefix on each user message

Part 2: Add a second cache_control breakpoint
- Existing: system-message breakpoint (caches the static system prompt)
- New: second-to-last-message breakpoint (caches the conversation-history prefix)
- Refactored _apply_cache_control with a shared _mark() helper

Before: 0% cache hit rate (the system prompt changed every minute)
After: ~90% savings on cached input tokens for multi-turn conversations

Closes #981
This commit is contained in:
@@ -111,13 +111,36 @@ Reply directly with text for conversations. Only use the 'message' tool to send
|
|||||||
channel: str | None = None,
|
channel: str | None = None,
|
||||||
chat_id: str | None = None,
|
chat_id: str | None = None,
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
"""Build the complete message list for an LLM call."""
|
"""
|
||||||
return [
|
Build the complete message list for an LLM call.
|
||||||
{"role": "system", "content": self.build_system_prompt(skill_names)},
|
|
||||||
*history,
|
Args:
|
||||||
{"role": "user", "content": self._build_runtime_context(channel, chat_id)},
|
history: Previous conversation messages.
|
||||||
{"role": "user", "content": self._build_user_content(current_message, media)},
|
current_message: The new user message.
|
||||||
]
|
skill_names: Optional skills to include.
|
||||||
|
media: Optional list of local file paths for images/media.
|
||||||
|
channel: Current channel (telegram, feishu, etc.).
|
||||||
|
chat_id: Current chat/user ID.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of messages including system prompt.
|
||||||
|
"""
|
||||||
|
messages = []
|
||||||
|
|
||||||
|
# System prompt
|
||||||
|
system_prompt = self.build_system_prompt(skill_names)
|
||||||
|
messages.append({"role": "system", "content": system_prompt})
|
||||||
|
|
||||||
|
# History
|
||||||
|
messages.extend(history)
|
||||||
|
|
||||||
|
# Inject current timestamp into user message (keeps system prompt static for caching)
|
||||||
|
# Current message (with optional image attachments)
|
||||||
|
user_content = self._build_user_content(current_message, media)
|
||||||
|
user_content = self._inject_runtime_context(user_content, channel, chat_id)
|
||||||
|
messages.append({"role": "user", "content": user_content})
|
||||||
|
|
||||||
|
return messages
|
||||||
|
|
||||||
def _build_user_content(self, text: str, media: list[str] | None) -> str | list[dict[str, Any]]:
|
def _build_user_content(self, text: str, media: list[str] | None) -> str | list[dict[str, Any]]:
|
||||||
"""Build user message content with optional base64-encoded images."""
|
"""Build user message content with optional base64-encoded images."""
|
||||||
|
|||||||
@@ -128,24 +128,40 @@ class LiteLLMProvider(LLMProvider):
|
|||||||
messages: list[dict[str, Any]],
|
messages: list[dict[str, Any]],
|
||||||
tools: list[dict[str, Any]] | None,
|
tools: list[dict[str, Any]] | None,
|
||||||
) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
|
) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
|
||||||
"""Return copies of messages and tools with cache_control injected."""
|
"""Return copies of messages and tools with cache_control injected.
|
||||||
new_messages = []
|
|
||||||
for msg in messages:
|
Two breakpoints are placed:
|
||||||
if msg.get("role") == "system":
|
1. System message — caches the static system prompt
|
||||||
content = msg["content"]
|
2. Second-to-last message — caches the conversation history prefix
|
||||||
if isinstance(content, str):
|
This maximises cache hits across multi-turn conversations.
|
||||||
new_content = [{"type": "text", "text": content, "cache_control": {"type": "ephemeral"}}]
|
"""
|
||||||
else:
|
cache_marker = {"type": "ephemeral"}
|
||||||
new_content = list(content)
|
new_messages = list(messages)
|
||||||
new_content[-1] = {**new_content[-1], "cache_control": {"type": "ephemeral"}}
|
|
||||||
new_messages.append({**msg, "content": new_content})
|
def _mark(msg: dict[str, Any]) -> dict[str, Any]:
|
||||||
else:
|
content = msg.get("content")
|
||||||
new_messages.append(msg)
|
if isinstance(content, str):
|
||||||
|
return {**msg, "content": [
|
||||||
|
{"type": "text", "text": content, "cache_control": cache_marker}
|
||||||
|
]}
|
||||||
|
elif isinstance(content, list) and content:
|
||||||
|
new_content = list(content)
|
||||||
|
new_content[-1] = {**new_content[-1], "cache_control": cache_marker}
|
||||||
|
return {**msg, "content": new_content}
|
||||||
|
return msg
|
||||||
|
|
||||||
|
# Breakpoint 1: system message
|
||||||
|
if new_messages and new_messages[0].get("role") == "system":
|
||||||
|
new_messages[0] = _mark(new_messages[0])
|
||||||
|
|
||||||
|
# Breakpoint 2: second-to-last message (caches conversation history prefix)
|
||||||
|
if len(new_messages) >= 3:
|
||||||
|
new_messages[-2] = _mark(new_messages[-2])
|
||||||
|
|
||||||
new_tools = tools
|
new_tools = tools
|
||||||
if tools:
|
if tools:
|
||||||
new_tools = list(tools)
|
new_tools = list(tools)
|
||||||
new_tools[-1] = {**new_tools[-1], "cache_control": {"type": "ephemeral"}}
|
new_tools[-1] = {**new_tools[-1], "cache_control": cache_marker}
|
||||||
|
|
||||||
return new_messages, new_tools
|
return new_messages, new_tools
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user