diff --git a/nanobot/agent/context.py b/nanobot/agent/context.py index 3fe11aa..71d3a3d 100644 --- a/nanobot/agent/context.py +++ b/nanobot/agent/context.py @@ -159,7 +159,11 @@ Reply directly with text for conversations. Only use the 'message' tool to send if not mime or not mime.startswith("image/"): continue b64 = base64.b64encode(raw).decode() - images.append({"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}) + images.append({ + "type": "image_url", + "image_url": {"url": f"data:{mime};base64,{b64}"}, + "_meta": {"path": str(p)}, + }) if not images: return text diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py index 34f5baa..1d85f62 100644 --- a/nanobot/agent/loop.py +++ b/nanobot/agent/loop.py @@ -480,7 +480,9 @@ class AgentLoop: continue # Strip runtime context from multimodal messages if (c.get("type") == "image_url" and c.get("image_url", {}).get("url", "").startswith("data:image/")): - filtered.append({"type": "text", "text": "[image]"}) + path = (c.get("_meta") or {}).get("path", "") + placeholder = f"[image: {path}]" if path else "[image]" + filtered.append({"type": "text", "text": placeholder}) else: filtered.append(c) if not filtered: diff --git a/nanobot/providers/base.py b/nanobot/providers/base.py index 8b6956c..8f9b2ba 100644 --- a/nanobot/providers/base.py +++ b/nanobot/providers/base.py @@ -89,14 +89,6 @@ class LLMProvider(ABC): "server error", "temporarily unavailable", ) - _IMAGE_UNSUPPORTED_MARKERS = ( - "image_url is only supported", - "does not support image", - "images are not supported", - "image input is not supported", - "image_url is not supported", - "unsupported image input", - ) _SENTINEL = object() @@ -107,11 +99,7 @@ class LLMProvider(ABC): @staticmethod def _sanitize_empty_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: - """Replace empty text content that causes provider 400 errors. - - Empty content can appear when MCP tools return nothing. Most providers - reject empty-string content or empty text blocks in list content. - """ + """Sanitize message content: fix empty blocks, strip internal _meta fields.""" result: list[dict[str, Any]] = [] for msg in messages: content = msg.get("content") @@ -123,18 +111,25 @@ class LLMProvider(ABC): continue if isinstance(content, list): - filtered = [ - item for item in content - if not ( + new_items: list[Any] = [] + changed = False + for item in content: + if ( isinstance(item, dict) and item.get("type") in ("text", "input_text", "output_text") and not item.get("text") - ) - ] - if len(filtered) != len(content): + ): + changed = True + continue + if isinstance(item, dict) and "_meta" in item: + new_items.append({k: v for k, v in item.items() if k != "_meta"}) + changed = True + else: + new_items.append(item) + if changed: clean = dict(msg) - if filtered: - clean["content"] = filtered + if new_items: + clean["content"] = new_items elif msg.get("role") == "assistant" and msg.get("tool_calls"): clean["content"] = None else: @@ -197,11 +192,6 @@ class LLMProvider(ABC): err = (content or "").lower() return any(marker in err for marker in cls._TRANSIENT_ERROR_MARKERS) - @classmethod - def _is_image_unsupported_error(cls, content: str | None) -> bool: - err = (content or "").lower() - return any(marker in err for marker in cls._IMAGE_UNSUPPORTED_MARKERS) - @staticmethod def _strip_image_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]] | None: """Replace image_url blocks with text placeholder. Returns None if no images found.""" @@ -213,7 +203,9 @@ class LLMProvider(ABC): new_content = [] for b in content: if isinstance(b, dict) and b.get("type") == "image_url": - new_content.append({"type": "text", "text": "[image omitted]"}) + path = (b.get("_meta") or {}).get("path", "") + placeholder = f"[image: {path}]" if path else "[image omitted]" + new_content.append({"type": "text", "text": placeholder}) found = True else: new_content.append(b) @@ -267,11 +259,10 @@ class LLMProvider(ABC): return response if not self._is_transient_error(response.content): - if self._is_image_unsupported_error(response.content): - stripped = self._strip_image_content(messages) - if stripped is not None: - logger.warning("Model does not support image input, retrying without images") - return await self._safe_chat(**{**kw, "messages": stripped}) + stripped = self._strip_image_content(messages) + if stripped is not None: + logger.warning("Non-transient LLM error with image content, retrying without images") + return await self._safe_chat(**{**kw, "messages": stripped}) return response logger.warning( diff --git a/tests/test_loop_save_turn.py b/tests/test_loop_save_turn.py index 25ba88b..aed7653 100644 --- a/tests/test_loop_save_turn.py +++ b/tests/test_loop_save_turn.py @@ -22,11 +22,30 @@ def test_save_turn_skips_multimodal_user_when_only_runtime_context() -> None: assert session.messages == [] -def test_save_turn_keeps_image_placeholder_after_runtime_strip() -> None: +def test_save_turn_keeps_image_placeholder_with_path_after_runtime_strip() -> None: loop = _mk_loop() session = Session(key="test:image") runtime = ContextBuilder._RUNTIME_CONTEXT_TAG + "\nCurrent Time: now (UTC)" + loop._save_turn( + session, + [{ + "role": "user", + "content": [ + {"type": "text", "text": runtime}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}, "_meta": {"path": "/media/feishu/photo.jpg"}}, + ], + }], + skip=0, + ) + assert session.messages[0]["content"] == [{"type": "text", "text": "[image: /media/feishu/photo.jpg]"}] + + +def test_save_turn_keeps_image_placeholder_without_meta() -> None: + loop = _mk_loop() + session = Session(key="test:image-no-meta") + runtime = ContextBuilder._RUNTIME_CONTEXT_TAG + "\nCurrent Time: now (UTC)" + loop._save_turn( session, [{ diff --git a/tests/test_provider_retry.py b/tests/test_provider_retry.py index 6f2c165..d732054 100644 --- a/tests/test_provider_retry.py +++ b/tests/test_provider_retry.py @@ -126,10 +126,17 @@ async def test_chat_with_retry_explicit_override_beats_defaults() -> None: # --------------------------------------------------------------------------- -# Image-unsupported fallback tests +# Image fallback tests # --------------------------------------------------------------------------- _IMAGE_MSG = [ + {"role": "user", "content": [ + {"type": "text", "text": "describe this"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}, "_meta": {"path": "/media/test.png"}}, + ]}, +] + +_IMAGE_MSG_NO_META = [ {"role": "user", "content": [ {"type": "text", "text": "describe this"}, {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}, @@ -138,13 +145,10 @@ _IMAGE_MSG = [ @pytest.mark.asyncio -async def test_image_unsupported_error_retries_without_images() -> None: - """If the model rejects image_url, retry once with images stripped.""" +async def test_non_transient_error_with_images_retries_without_images() -> None: + """Any non-transient error retries once with images stripped when images are present.""" provider = ScriptedProvider([ - LLMResponse( - content="Invalid content type. image_url is only supported by certain models", - finish_reason="error", - ), + LLMResponse(content="API调用参数有误,请检查文档", finish_reason="error"), LLMResponse(content="ok, no image"), ]) @@ -157,17 +161,14 @@ async def test_image_unsupported_error_retries_without_images() -> None: content = msg.get("content") if isinstance(content, list): assert all(b.get("type") != "image_url" for b in content) - assert any("[image omitted]" in (b.get("text") or "") for b in content) + assert any("[image: /media/test.png]" in (b.get("text") or "") for b in content) @pytest.mark.asyncio -async def test_image_unsupported_error_no_retry_without_image_content() -> None: - """If messages don't contain image_url blocks, don't retry on image error.""" +async def test_non_transient_error_without_images_no_retry() -> None: + """Non-transient errors without image content are returned immediately.""" provider = ScriptedProvider([ - LLMResponse( - content="image_url is only supported by certain models", - finish_reason="error", - ), + LLMResponse(content="401 unauthorized", finish_reason="error"), ]) response = await provider.chat_with_retry( @@ -179,31 +180,34 @@ async def test_image_unsupported_error_no_retry_without_image_content() -> None: @pytest.mark.asyncio -async def test_image_unsupported_fallback_returns_error_on_second_failure() -> None: +async def test_image_fallback_returns_error_on_second_failure() -> None: """If the image-stripped retry also fails, return that error.""" provider = ScriptedProvider([ - LLMResponse( - content="does not support image input", - finish_reason="error", - ), - LLMResponse(content="some other error", finish_reason="error"), + LLMResponse(content="some model error", finish_reason="error"), + LLMResponse(content="still failing", finish_reason="error"), ]) response = await provider.chat_with_retry(messages=_IMAGE_MSG) assert provider.calls == 2 - assert response.content == "some other error" + assert response.content == "still failing" assert response.finish_reason == "error" @pytest.mark.asyncio -async def test_non_image_error_does_not_trigger_image_fallback() -> None: - """Regular non-transient errors must not trigger image stripping.""" +async def test_image_fallback_without_meta_uses_default_placeholder() -> None: + """When _meta is absent, fallback placeholder is '[image omitted]'.""" provider = ScriptedProvider([ - LLMResponse(content="401 unauthorized", finish_reason="error"), + LLMResponse(content="error", finish_reason="error"), + LLMResponse(content="ok"), ]) - response = await provider.chat_with_retry(messages=_IMAGE_MSG) + response = await provider.chat_with_retry(messages=_IMAGE_MSG_NO_META) - assert provider.calls == 1 - assert response.content == "401 unauthorized" + assert response.content == "ok" + assert provider.calls == 2 + msgs_on_retry = provider.last_kwargs["messages"] + for msg in msgs_on_retry: + content = msg.get("content") + if isinstance(content, list): + assert any("[image omitted]" in (b.get("text") or "") for b in content)