From de0b5b3d91392263ebd061a3c3e365b0e823998d Mon Sep 17 00:00:00 2001
From: coldxiangyu
Date: Thu, 12 Mar 2026 08:17:42 +0800
Subject: [PATCH] fix: filter image_url for non-vision models at provider layer

- Add `supports_vision` field to ProviderSpec (default True)
- Add `_supports_vision` and `_filter_image_url` methods in LiteLLMProvider
- Filter image_url content blocks before sending to non-vision models
- Reverts session-layer filtering from original PR (wrong layer)

This fixes the issue where switching from Claude (vision-capable) to
non-vision models (e.g., Baidu Qianfan) causes API errors due to
unsupported image_url content blocks.

The provider layer is the correct place for this filtering because:
1. It has access to model/provider capabilities
2. It only affects non-vision models
3. It preserves session layer purity (storage should not know about
   model capabilities)
---
 nanobot/providers/litellm_provider.py | 30 +++++++++++++++++++++++++++
 nanobot/providers/registry.py         |  3 +++
 2 files changed, 33 insertions(+)

diff --git a/nanobot/providers/litellm_provider.py b/nanobot/providers/litellm_provider.py
index d14e4c0..3dece89 100644
--- a/nanobot/providers/litellm_provider.py
+++ b/nanobot/providers/litellm_provider.py
@@ -124,6 +124,32 @@ class LiteLLMProvider(LLMProvider):
         spec = find_by_model(model)
         return spec is not None and spec.supports_prompt_caching
 
+    def _supports_vision(self, model: str) -> bool:
+        """Return True when the provider supports vision/image inputs."""
+        if self._gateway is not None:
+            return self._gateway.supports_vision
+        spec = find_by_model(model)
+        return spec is None or spec.supports_vision  # default True for unknown providers
+
+    @staticmethod
+    def _filter_image_url(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Replace image_url content blocks with [image] placeholder for non-vision models."""
+        filtered = []
+        for msg in messages:
+            content = msg.get("content")
+            if isinstance(content, list):
+                new_content = []
+                for block in content:
+                    if isinstance(block, dict) and block.get("type") == "image_url":
+                        # Replace image with placeholder text
+                        new_content.append({"type": "text", "text": "[image]"})
+                    else:
+                        new_content.append(block)
+                filtered.append({**msg, "content": new_content})
+            else:
+                filtered.append(msg)
+        return filtered
+
     def _apply_cache_control(
         self,
         messages: list[dict[str, Any]],
@@ -234,6 +260,10 @@ class LiteLLMProvider(LLMProvider):
         model = self._resolve_model(original_model)
         extra_msg_keys = self._extra_msg_keys(original_model, model)
 
+        # Filter image_url for non-vision models
+        if not self._supports_vision(original_model):
+            messages = self._filter_image_url(messages)
+
         if self._supports_cache_control(original_model):
             messages, tools = self._apply_cache_control(messages, tools)
 
diff --git a/nanobot/providers/registry.py b/nanobot/providers/registry.py
index 42c1d24..a45f14a 100644
--- a/nanobot/providers/registry.py
+++ b/nanobot/providers/registry.py
@@ -61,6 +61,9 @@ class ProviderSpec:
     # Provider supports cache_control on content blocks (e.g. Anthropic prompt caching)
     supports_prompt_caching: bool = False
 
+    # Provider supports vision/image inputs (most modern models do)
+    supports_vision: bool = True
+
     @property
     def label(self) -> str:
         return self.display_name or self.name.title()