feat: add Anthropic prompt caching via cache_control

Inject cache_control: {"type": "ephemeral"} on the system message and
last tool definition for providers that support prompt caching. Adds
supports_prompt_caching flag to ProviderSpec (enabled for Anthropic only)
and skips caching when routing through a gateway.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
tercerapersona
2026-02-19 11:05:22 -03:00
parent 8de36d398f
commit 3b4763b3f9
2 changed files with 45 additions and 2 deletions
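
For context, the second changed file (the provider registry) is not shown below. A minimal sketch of what the supports_prompt_caching flag could look like, assuming a dataclass-based ProviderSpec and prefix matching inside find_by_model; everything here except the flag name, the Anthropic-only default, and the find_by_model entry point is illustrative:

from dataclasses import dataclass

@dataclass(frozen=True)
class ProviderSpec:
    name: str
    model_prefix: str                      # illustrative matching scheme
    supports_prompt_caching: bool = False  # new flag; True for Anthropic only

_SPECS = [
    ProviderSpec("anthropic", "claude-", supports_prompt_caching=True),
    ProviderSpec("openai", "gpt-"),
]

def find_by_model(model: str) -> ProviderSpec | None:
    """Return the first spec whose prefix matches the model name, if any."""
    return next((s for s in _SPECS if model.startswith(s.model_prefix)), None)

Keeping the capability on the spec keeps provider quirks in the registry rather than branching on model strings inside the provider class.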


@@ -93,6 +93,41 @@ class LiteLLMProvider(LLMProvider):
         return model
 
+    def _supports_cache_control(self, model: str) -> bool:
+        """Return True when the provider supports cache_control on content blocks."""
+        # Gateways proxy requests to other backends, so skip caching there.
+        if self._gateway is not None:
+            return False
+        spec = find_by_model(model)
+        return spec is not None and spec.supports_prompt_caching
+
+    def _apply_cache_control(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]] | None,
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
+        """Return copies of messages and tools with cache_control injected."""
+        # Transform the system message: string content becomes a single text
+        # block; list content gets the marker on its last block.
+        new_messages = []
+        for msg in messages:
+            if msg.get("role") == "system":
+                content = msg["content"]
+                if isinstance(content, str):
+                    new_content = [{"type": "text", "text": content, "cache_control": {"type": "ephemeral"}}]
+                else:
+                    new_content = list(content)
+                    new_content[-1] = {**new_content[-1], "cache_control": {"type": "ephemeral"}}
+                new_messages.append({**msg, "content": new_content})
+            else:
+                new_messages.append(msg)
+        # Add cache_control to the last tool definition
+        new_tools = tools
+        if tools:
+            new_tools = list(tools)
+            new_tools[-1] = {**new_tools[-1], "cache_control": {"type": "ephemeral"}}
+        return new_messages, new_tools
+
     def _apply_model_overrides(self, model: str, kwargs: dict[str, Any]) -> None:
         """Apply model-specific parameter overrides from the registry."""
         model_lower = model.lower()
@@ -124,8 +159,12 @@ class LiteLLMProvider(LLMProvider):
         Returns:
             LLMResponse with content and/or tool calls.
         """
-        model = self._resolve_model(model or self.default_model)
+        original_model = model or self.default_model
+        model = self._resolve_model(original_model)
+        if self._supports_cache_control(original_model):
+            messages, tools = self._apply_cache_control(messages, tools)
         # Clamp max_tokens to at least 1 — negative or zero values cause
         # LiteLLM to reject the request with "max_tokens must be at least 1".
         max_tokens = max(1, max_tokens)
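
To make the injection concrete, a standalone sketch with made-up message and tool payloads; it mirrors the _apply_cache_control logic above rather than importing the provider:

from typing import Any

def inject_cache_control(
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]] | None]:
    """Simplified copy of the new helper, for illustration only."""
    new_messages = []
    for msg in messages:
        if msg.get("role") == "system":
            content = msg["content"]
            if isinstance(content, str):
                # String prompts are wrapped in a single cached text block.
                blocks = [{"type": "text", "text": content, "cache_control": {"type": "ephemeral"}}]
            else:
                # Block lists get the marker on the final block.
                blocks = list(content)
                blocks[-1] = {**blocks[-1], "cache_control": {"type": "ephemeral"}}
            new_messages.append({**msg, "content": blocks})
        else:
            new_messages.append(msg)
    new_tools = tools
    if tools:
        new_tools = list(tools)
        new_tools[-1] = {**new_tools[-1], "cache_control": {"type": "ephemeral"}}
    return new_messages, new_tools

messages, tools = inject_cache_control(
    [{"role": "system", "content": "You are a helpful assistant."},
     {"role": "user", "content": "Hi"}],
    [{"name": "search", "input_schema": {"type": "object", "properties": {}}}],
)
assert messages[0]["content"][-1]["cache_control"] == {"type": "ephemeral"}
assert tools[-1]["cache_control"] == {"type": "ephemeral"}

Marking only the last tool is enough because a cache_control breakpoint caches the entire request prefix up to that block, so one marker at the end of the tool list covers every tool definition before it.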