feat(voice): add persona-driven tts and qq local media upload

This commit is contained in:
Hua
2026-03-23 11:10:27 +08:00
parent b1a08f3bb9
commit d838a12b56
9 changed files with 882 additions and 96 deletions

View File

@@ -276,7 +276,6 @@ async def test_send_local_media_under_out_dir_uses_c2c_file_api(
"params": {"openid": "user123"},
"json": {
"file_type": 1,
"url": "https://files.example.com/out/demo.png",
"file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"),
"srv_send_msg": False,
},
@@ -338,7 +337,6 @@ async def test_send_local_media_in_nested_out_path_uses_relative_url(
"params": {"openid": "user123"},
"json": {
"file_type": 1,
"url": "https://files.example.com/qq-media/shots/github.png",
"file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"),
"srv_send_msg": False,
},
@@ -408,8 +406,7 @@ async def test_send_local_media_outside_out_falls_back_to_text_notice(
@pytest.mark.asyncio
async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upload_fails(
monkeypatch,
async def test_send_local_media_with_media_base_url_still_falls_back_to_text_notice_when_file_data_upload_fails(
tmp_path,
) -> None:
workspace = tmp_path / "workspace"
@@ -431,7 +428,6 @@ async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upl
)
channel._client = _FakeClient()
channel._client.api.raise_on_raw_file_upload = True
monkeypatch.setattr("nanobot.channels.qq.validate_url_target", lambda url: (True, ""))
await channel.send(
OutboundMessage(
@@ -443,20 +439,12 @@ async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upl
)
)
assert channel._client.api.c2c_file_calls == [
{
"openid": "user123",
"file_type": 1,
"url": "https://files.example.com/out/demo.png",
"srv_send_msg": False,
}
]
assert channel._client.api.c2c_file_calls == []
assert channel._client.api.c2c_calls == [
{
"openid": "user123",
"msg_type": 7,
"content": "hello",
"media": {"file_info": "c2c-file-info", "file_uuid": "c2c-file", "ttl": 60},
"msg_type": 0,
"content": "hello\n[Failed to send: demo.png - QQ local file_data upload failed]",
"msg_id": "msg1",
"msg_seq": 2,
}
@@ -596,7 +584,60 @@ async def test_send_non_image_media_from_out_falls_back_to_text_notice(
{
"openid": "user123",
"msg_type": 0,
"content": "hello\n[Failed to send: note.txt - local delivery media must be an image]",
"content": (
"hello\n[Failed to send: note.txt - local delivery media must be an image, .mp4 video, "
"or .silk voice]"
),
"msg_id": "msg1",
"msg_seq": 2,
}
]
@pytest.mark.asyncio
async def test_send_local_silk_voice_uses_file_type_three_direct_upload(tmp_path) -> None:
workspace = tmp_path / "workspace"
workspace.mkdir()
out_dir = workspace / "out"
out_dir.mkdir()
source = out_dir / "reply.silk"
source.write_bytes(b"fake-silk")
channel = QQChannel(
QQConfig(app_id="app", secret="secret", allow_from=["*"]),
MessageBus(),
workspace=workspace,
)
channel._client = _FakeClient()
await channel.send(
OutboundMessage(
channel="qq",
chat_id="user123",
content="hello",
media=[str(source)],
metadata={"message_id": "msg1"},
)
)
assert channel._client.api.raw_file_upload_calls == [
{
"method": "POST",
"path": "/v2/users/{openid}/files",
"params": {"openid": "user123"},
"json": {
"file_type": 3,
"file_data": b64encode(b"fake-silk").decode("ascii"),
"srv_send_msg": False,
},
}
]
assert channel._client.api.c2c_calls == [
{
"openid": "user123",
"msg_type": 7,
"content": "hello",
"media": {"file_info": "c2c-file-info", "file_uuid": "c2c-file", "ttl": 60},
"msg_id": "msg1",
"msg_seq": 2,
}

321
tests/test_voice_reply.py Normal file
View File

@@ -0,0 +1,321 @@
"""Tests for optional outbound voice replies."""
from __future__ import annotations
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from nanobot.bus.events import InboundMessage
from nanobot.config.schema import Config
from nanobot.providers.base import LLMResponse
from nanobot.providers.speech import OpenAISpeechProvider
def _make_loop(workspace: Path, *, channels_payload: dict | None = None):
    """Build an ``AgentLoop`` backed by a mocked LLM provider.

    Args:
        workspace: Directory handed to the loop as its workspace.
        channels_payload: Raw ``channels`` config mapping; an empty mapping
            is validated when this is ``None`` or empty.

    Returns:
        A ``(loop, provider)`` tuple so tests can tweak the mocked provider
        after construction.
    """
    # Imported lazily, as in the rest of this module's helpers.
    from nanobot.agent.loop import AgentLoop
    from nanobot.bus.queue import MessageBus

    message_bus = MessageBus()

    # The provider always answers "hello" with no tool calls.
    canned_reply = LLMResponse(content="hello", tool_calls=[])
    fake_provider = MagicMock(api_key="", api_base=None)
    fake_provider.get_default_model.return_value = "test-model"
    fake_provider.chat_with_retry = AsyncMock(return_value=canned_reply)

    validated = Config.model_validate({"channels": channels_payload or {}})

    # SubagentManager is patched out while the loop is constructed.
    with patch("nanobot.agent.loop.SubagentManager"):
        agent_loop = AgentLoop(
            bus=message_bus,
            provider=fake_provider,
            workspace=workspace,
            channels_config=validated.channels,
        )
    return agent_loop, fake_provider
def test_voice_reply_config_parses_camel_case() -> None:
    """camelCase ``voiceReply`` keys populate the snake_case config fields.

    Also checks that the ``url`` key is exposed as ``api_base``.
    """
    voice_reply_payload = {
        "enabled": True,
        "channels": ["telegram/main"],
        "model": "gpt-4o-mini-tts",
        "voice": "alloy",
        "instructions": "sound calm",
        "speed": 1.1,
        "responseFormat": "mp3",
        "apiKey": "tts-key",
        "url": "https://tts.example.com/v1",
    }

    parsed = Config.model_validate({"channels": {"voiceReply": voice_reply_payload}})
    settings = parsed.channels.voice_reply

    assert settings.enabled is True
    assert settings.channels == ["telegram/main"]
    assert settings.instructions == "sound calm"
    assert settings.speed == 1.1
    assert settings.response_format == "mp3"
    assert settings.api_key == "tts-key"
    assert settings.api_base == "https://tts.example.com/v1"
def test_openai_speech_provider_accepts_direct_endpoint_url() -> None:
    """An ``api_base`` that already names the speech endpoint is returned as-is."""
    endpoint = "https://tts.example.com/v1/audio/speech"
    speech = OpenAISpeechProvider(api_key="tts-key", api_base=endpoint)

    assert speech._speech_url() == "https://tts.example.com/v1/audio/speech"
@pytest.mark.asyncio
async def test_telegram_voice_reply_attaches_audio_for_multi_instance_route(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A ``telegram/main`` message gets a voice attachment when ``telegram`` is enabled.

    The synthesizer is replaced with a fake that records its inputs. The test
    then checks the attachment location and suffix, the credentials taken from
    the LLM provider, and the composed persona instructions.
    """
    (tmp_path / "SOUL.md").write_text("default soul voice", encoding="utf-8")

    voice_reply_settings = {
        "enabled": True,
        "channels": ["telegram"],
        "instructions": "keep the delivery warm",
        "speed": 1.05,
        "responseFormat": "opus",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": voice_reply_settings},
    )
    # No apiKey/url in the voiceReply config, so the provider's values are expected.
    provider.api_key = "provider-tts-key"
    provider.api_base = "https://provider.example.com/v1"

    captured: dict[str, str | float | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Write a recognizable payload, then record everything the caller supplied.
        destination = Path(output_path)
        destination.write_bytes(b"voice-bytes")
        captured.update(
            api_key=self.api_key,
            api_base=self.api_base,
            text=text,
            model=model,
            voice=voice,
            instructions=instructions,
            speed=speed,
            response_format=response_format,
        )
        return destination

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="telegram/main",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    # The "opus" response format is expected to land in an .ogg file under out/voice.
    media_path = Path(response.media[0])
    assert media_path.parent == tmp_path / "out" / "voice"
    assert media_path.suffix == ".ogg"
    assert media_path.read_bytes() == b"voice-bytes"

    expected_call = {
        "api_key": "provider-tts-key",
        "api_base": "https://provider.example.com/v1",
        "text": "hello",
        "model": "gpt-4o-mini-tts",
        "voice": "alloy",
        "instructions": (
            "Speak as the active persona 'default'. Match that persona's tone, attitude, pacing, "
            "and emotional style while keeping the reply natural and conversational. keep the "
            "delivery warm Persona guidance: default soul voice"
        ),
        "speed": 1.05,
        "response_format": "opus",
    }
    assert captured == expected_call
@pytest.mark.asyncio
async def test_persona_voice_settings_override_global_voice_profile(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A persona's ``VOICE.json`` overrides the global voice and speed.

    The instructions passed to the synthesizer must still contain the global
    instructions, the persona's voice instructions, and the persona's SOUL/USER text.
    """
    (tmp_path / "SOUL.md").write_text("default soul", encoding="utf-8")

    persona_dir = tmp_path / "personas" / "coder"
    persona_dir.mkdir(parents=True)
    persona_files = {
        "SOUL.md": "speak like a sharp engineer",
        "USER.md": "be concise and technical",
        "VOICE.json": (
            '{"voice":"nova","instructions":"use a crisp and confident delivery","speed":1.2}'
        ),
    }
    for filename, body in persona_files.items():
        (persona_dir / filename).write_text(body, encoding="utf-8")

    global_voice_settings = {
        "enabled": True,
        "channels": ["telegram"],
        "voice": "alloy",
        "instructions": "keep the pacing steady",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": global_voice_settings},
    )
    provider.api_key = "provider-tts-key"

    # Select the "coder" persona for the session this message will use.
    session = loop.sessions.get_or_create("telegram:chat-1")
    session.metadata["persona"] = "coder"
    loop.sessions.save(session)

    captured: dict[str, str | float | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Only the voice-profile fields matter for this test.
        destination = Path(output_path)
        destination.write_bytes(b"voice-bytes")
        captured.update(voice=voice, instructions=instructions, speed=speed)
        return destination

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="telegram",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert len(response.media) == 1
    assert captured["voice"] == "nova"
    assert captured["speed"] == 1.2

    spoken_instructions = captured["instructions"]
    assert isinstance(spoken_instructions, str)
    expected_fragments = (
        "active persona 'coder'",
        "keep the pacing steady",
        "use a crisp and confident delivery",
        "speak like a sharp engineer",
        "be concise and technical",
    )
    for fragment in expected_fragments:
        assert fragment in spoken_instructions
@pytest.mark.asyncio
async def test_qq_voice_reply_config_keeps_text_only(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """QQ voice replies configured without ``responseFormat`` stay text-only.

    The reply must carry no media and the synthesizer must never be awaited.
    """
    voice_reply_settings = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": voice_reply_settings},
    )
    provider.api_key = "provider-tts-key"

    # A spy in place of the real synthesizer lets us prove it was not used.
    synth_spy = AsyncMock()
    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", synth_spy)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert response.media == []
    synth_spy.assert_not_awaited()
@pytest.mark.asyncio
async def test_qq_voice_reply_uses_silk_when_configured(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``responseFormat`` set to ``silk``, a QQ reply carries one ``.silk`` attachment.

    The fake synthesizer records the requested format so the test can confirm
    that ``silk`` was passed through.
    """
    voice_reply_settings = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
        "responseFormat": "silk",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": voice_reply_settings},
    )
    provider.api_key = "provider-tts-key"

    captured: dict[str, str | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Write placeholder bytes and remember which format was requested.
        destination = Path(output_path)
        destination.write_bytes(b"fake-silk")
        captured["response_format"] = response_format
        return destination

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1
    assert Path(response.media[0]).suffix == ".silk"
    assert captured["response_format"] == "silk"