fix: merge consecutive user messages into single message

Some LLM providers (Minimax, Dashscope) strictly reject consecutive messages with the same role. build_messages() was emitting two separate user messages back-to-back: the runtime context and the actual user content.

Merge them into a single user message, handling both plain text and multimodal (image) content. Update _save_turn() to strip the runtime context prefix from the merged message when persisting to session history.

Fixes #1414
Fixes #1344
2026-03-03 00:59:58 -03:00
parent 3c79404194
commit ad99d5aaa0
2 changed files with 27 additions and 9 deletions
--- a/nanobot/agent/context.py
+++ b/nanobot/agent/context.py
@@ -112,11 +112,20 @@ Reply directly with text for conversations. Only use the 'message' tool to send
        chat_id: str | None = None,
    ) -> list[dict[str, Any]]:
        """Build the complete message list for an LLM call."""
        runtime_ctx = self._build_runtime_context(channel, chat_id)
        user_content = self._build_user_content(current_message, media)
        # Merge runtime context and user content into a single user message
        # to avoid consecutive same-role messages that some providers reject.
        if isinstance(user_content, str):
            merged = f"{runtime_ctx}\n\n{user_content}"
        else:
            merged = [{"type": "text", "text": runtime_ctx}] + user_content
        return [
            {"role": "system", "content": self.build_system_prompt(skill_names)},
            *history,
-            {"role": "user", "content": self._build_runtime_context(channel, chat_id)},
+            {"role": "user", "content": merged},
-            {"role": "user", "content": self._build_user_content(current_message, media)},
        ]
    def _build_user_content(self, text: str, media: list[str] | None) -> str | list[dict[str, Any]]:
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -464,14 +464,23 @@ class AgentLoop:
                entry["content"] = content[:self._TOOL_RESULT_MAX_CHARS] + "\n... (truncated)"
            elif role == "user":
                if isinstance(content, str) and content.startswith(ContextBuilder._RUNTIME_CONTEXT_TAG):
-                    continue
+                    # Strip the runtime-context prefix, keep only the user text.
                    parts = content.split("\n\n", 1)
                    if len(parts) > 1 and parts[1].strip():
                        entry["content"] = parts[1]
                    else:
                        continue
                if isinstance(content, list):
-                    entry["content"] = [
+                    filtered = []
-                        {"type": "text", "text": "[image]"} if (
+                    for c in content:
-                            c.get("type") == "image_url"
+                        if c.get("type") == "text" and isinstance(c.get("text"), str) and c["text"].startswith(ContextBuilder._RUNTIME_CONTEXT_TAG):
-                            and c.get("image_url", {}).get("url", "").startswith("data:image/")
+                            continue  # Strip runtime context from multimodal messages
-                        ) else c for c in content
+                        if (c.get("type") == "image_url"
-                    ]
+                                and c.get("image_url", {}).get("url", "").startswith("data:image/")):
                            filtered.append({"type": "text", "text": "[image]"})
                        else:
                            filtered.append(c)
                    entry["content"] = filtered
            entry.setdefault("timestamp", datetime.now().isoformat())
            session.messages.append(entry)
        session.updated_at = datetime.now()