fix: merge consecutive user messages into single message
Some LLM providers (Minimax, Dashscope) strictly reject consecutive messages with the same role. build_messages() was emitting two separate user messages back-to-back: the runtime context and the actual user content. Merge them into a single user message, handling both plain text and multimodal (image) content. Update _save_turn() to strip the runtime context prefix from the merged message when persisting to session history. Fixes #1414 Fixes #1344
This commit is contained in:
@@ -112,11 +112,20 @@ Reply directly with text for conversations. Only use the 'message' tool to send
|
||||
chat_id: str | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Build the complete message list for an LLM call."""
|
||||
runtime_ctx = self._build_runtime_context(channel, chat_id)
|
||||
user_content = self._build_user_content(current_message, media)
|
||||
|
||||
# Merge runtime context and user content into a single user message
|
||||
# to avoid consecutive same-role messages that some providers reject.
|
||||
if isinstance(user_content, str):
|
||||
merged = f"{runtime_ctx}\n\n{user_content}"
|
||||
else:
|
||||
merged = [{"type": "text", "text": runtime_ctx}] + user_content
|
||||
|
||||
return [
|
||||
{"role": "system", "content": self.build_system_prompt(skill_names)},
|
||||
*history,
|
||||
{"role": "user", "content": self._build_runtime_context(channel, chat_id)},
|
||||
{"role": "user", "content": self._build_user_content(current_message, media)},
|
||||
{"role": "user", "content": merged},
|
||||
]
|
||||
|
||||
def _build_user_content(self, text: str, media: list[str] | None) -> str | list[dict[str, Any]]:
|
||||
|
||||
@@ -464,14 +464,23 @@ class AgentLoop:
|
||||
entry["content"] = content[:self._TOOL_RESULT_MAX_CHARS] + "\n... (truncated)"
|
||||
elif role == "user":
|
||||
if isinstance(content, str) and content.startswith(ContextBuilder._RUNTIME_CONTEXT_TAG):
|
||||
continue
|
||||
# Strip the runtime-context prefix, keep only the user text.
|
||||
parts = content.split("\n\n", 1)
|
||||
if len(parts) > 1 and parts[1].strip():
|
||||
entry["content"] = parts[1]
|
||||
else:
|
||||
continue
|
||||
if isinstance(content, list):
|
||||
entry["content"] = [
|
||||
{"type": "text", "text": "[image]"} if (
|
||||
c.get("type") == "image_url"
|
||||
and c.get("image_url", {}).get("url", "").startswith("data:image/")
|
||||
) else c for c in content
|
||||
]
|
||||
filtered = []
|
||||
for c in content:
|
||||
if c.get("type") == "text" and isinstance(c.get("text"), str) and c["text"].startswith(ContextBuilder._RUNTIME_CONTEXT_TAG):
|
||||
continue # Strip runtime context from multimodal messages
|
||||
if (c.get("type") == "image_url"
|
||||
and c.get("image_url", {}).get("url", "").startswith("data:image/")):
|
||||
filtered.append({"type": "text", "text": "[image]"})
|
||||
else:
|
||||
filtered.append(c)
|
||||
entry["content"] = filtered
|
||||
entry.setdefault("timestamp", datetime.now().isoformat())
|
||||
session.messages.append(entry)
|
||||
session.updated_at = datetime.now()
|
||||
|
||||
Reference in New Issue
Block a user