feat(voice): add persona-driven tts and qq local media upload
This commit is contained in:
@@ -276,7 +276,6 @@ async def test_send_local_media_under_out_dir_uses_c2c_file_api(
|
||||
"params": {"openid": "user123"},
|
||||
"json": {
|
||||
"file_type": 1,
|
||||
"url": "https://files.example.com/out/demo.png",
|
||||
"file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"),
|
||||
"srv_send_msg": False,
|
||||
},
|
||||
@@ -338,7 +337,6 @@ async def test_send_local_media_in_nested_out_path_uses_relative_url(
|
||||
"params": {"openid": "user123"},
|
||||
"json": {
|
||||
"file_type": 1,
|
||||
"url": "https://files.example.com/qq-media/shots/github.png",
|
||||
"file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"),
|
||||
"srv_send_msg": False,
|
||||
},
|
||||
@@ -408,8 +406,7 @@ async def test_send_local_media_outside_out_falls_back_to_text_notice(
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upload_fails(
|
||||
monkeypatch,
|
||||
async def test_send_local_media_with_media_base_url_still_falls_back_to_text_notice_when_file_data_upload_fails(
|
||||
tmp_path,
|
||||
) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
@@ -431,7 +428,6 @@ async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upl
|
||||
)
|
||||
channel._client = _FakeClient()
|
||||
channel._client.api.raise_on_raw_file_upload = True
|
||||
monkeypatch.setattr("nanobot.channels.qq.validate_url_target", lambda url: (True, ""))
|
||||
|
||||
await channel.send(
|
||||
OutboundMessage(
|
||||
@@ -443,20 +439,12 @@ async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upl
|
||||
)
|
||||
)
|
||||
|
||||
assert channel._client.api.c2c_file_calls == [
|
||||
{
|
||||
"openid": "user123",
|
||||
"file_type": 1,
|
||||
"url": "https://files.example.com/out/demo.png",
|
||||
"srv_send_msg": False,
|
||||
}
|
||||
]
|
||||
assert channel._client.api.c2c_file_calls == []
|
||||
assert channel._client.api.c2c_calls == [
|
||||
{
|
||||
"openid": "user123",
|
||||
"msg_type": 7,
|
||||
"content": "hello",
|
||||
"media": {"file_info": "c2c-file-info", "file_uuid": "c2c-file", "ttl": 60},
|
||||
"msg_type": 0,
|
||||
"content": "hello\n[Failed to send: demo.png - QQ local file_data upload failed]",
|
||||
"msg_id": "msg1",
|
||||
"msg_seq": 2,
|
||||
}
|
||||
@@ -596,7 +584,60 @@ async def test_send_non_image_media_from_out_falls_back_to_text_notice(
|
||||
{
|
||||
"openid": "user123",
|
||||
"msg_type": 0,
|
||||
"content": "hello\n[Failed to send: note.txt - local delivery media must be an image]",
|
||||
"content": (
|
||||
"hello\n[Failed to send: note.txt - local delivery media must be an image, .mp4 video, "
|
||||
"or .silk voice]"
|
||||
),
|
||||
"msg_id": "msg1",
|
||||
"msg_seq": 2,
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_send_local_silk_voice_uses_file_type_three_direct_upload(tmp_path) -> None:
|
||||
workspace = tmp_path / "workspace"
|
||||
workspace.mkdir()
|
||||
out_dir = workspace / "out"
|
||||
out_dir.mkdir()
|
||||
source = out_dir / "reply.silk"
|
||||
source.write_bytes(b"fake-silk")
|
||||
|
||||
channel = QQChannel(
|
||||
QQConfig(app_id="app", secret="secret", allow_from=["*"]),
|
||||
MessageBus(),
|
||||
workspace=workspace,
|
||||
)
|
||||
channel._client = _FakeClient()
|
||||
|
||||
await channel.send(
|
||||
OutboundMessage(
|
||||
channel="qq",
|
||||
chat_id="user123",
|
||||
content="hello",
|
||||
media=[str(source)],
|
||||
metadata={"message_id": "msg1"},
|
||||
)
|
||||
)
|
||||
|
||||
assert channel._client.api.raw_file_upload_calls == [
|
||||
{
|
||||
"method": "POST",
|
||||
"path": "/v2/users/{openid}/files",
|
||||
"params": {"openid": "user123"},
|
||||
"json": {
|
||||
"file_type": 3,
|
||||
"file_data": b64encode(b"fake-silk").decode("ascii"),
|
||||
"srv_send_msg": False,
|
||||
},
|
||||
}
|
||||
]
|
||||
assert channel._client.api.c2c_calls == [
|
||||
{
|
||||
"openid": "user123",
|
||||
"msg_type": 7,
|
||||
"content": "hello",
|
||||
"media": {"file_info": "c2c-file-info", "file_uuid": "c2c-file", "ttl": 60},
|
||||
"msg_id": "msg1",
|
||||
"msg_seq": 2,
|
||||
}
|
||||
|
||||
321
tests/test_voice_reply.py
Normal file
321
tests/test_voice_reply.py
Normal file
@@ -0,0 +1,321 @@
|
||||
"""Tests for optional outbound voice replies."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from nanobot.bus.events import InboundMessage
|
||||
from nanobot.config.schema import Config
|
||||
from nanobot.providers.base import LLMResponse
|
||||
from nanobot.providers.speech import OpenAISpeechProvider
|
||||
|
||||
|
||||
def _make_loop(workspace: Path, *, channels_payload: dict | None = None):
    """Build an ``AgentLoop`` around a mocked provider for voice-reply tests.

    Args:
        workspace: Directory handed to ``AgentLoop`` as its workspace.
        channels_payload: Raw ``channels`` section that is validated through
            ``Config.model_validate``; a falsy value is replaced by ``{}``.

    Returns:
        A ``(loop, provider)`` tuple. ``provider`` is the ``MagicMock`` whose
        ``chat_with_retry`` always answers with the text ``"hello"``.
    """
    from nanobot.agent.loop import AgentLoop
    from nanobot.bus.queue import MessageBus

    message_bus = MessageBus()

    # The mocked provider starts without TTS credentials; individual tests
    # overwrite api_key / api_base on the returned mock when they need them.
    canned_reply = LLMResponse(content="hello", tool_calls=[])
    fake_provider = MagicMock()
    fake_provider.get_default_model.return_value = "test-model"
    fake_provider.chat_with_retry = AsyncMock(return_value=canned_reply)
    fake_provider.api_key = ""
    fake_provider.api_base = None

    raw_channels = channels_payload if channels_payload else {}
    parsed_config = Config.model_validate({"channels": raw_channels})

    # SubagentManager is replaced by a mock while the loop is constructed.
    with patch("nanobot.agent.loop.SubagentManager"):
        agent_loop = AgentLoop(
            bus=message_bus,
            provider=fake_provider,
            workspace=workspace,
            channels_config=parsed_config.channels,
        )
    return agent_loop, fake_provider
|
||||
|
||||
|
||||
def test_voice_reply_config_parses_camel_case() -> None:
    """A camelCase ``voiceReply`` section is exposed through snake_case attributes."""
    raw_voice_reply = {
        "enabled": True,
        "channels": ["telegram/main"],
        "model": "gpt-4o-mini-tts",
        "voice": "alloy",
        "instructions": "sound calm",
        "speed": 1.1,
        "responseFormat": "mp3",
        "apiKey": "tts-key",
        "url": "https://tts.example.com/v1",
    }

    parsed = Config.model_validate({"channels": {"voiceReply": raw_voice_reply}})
    settings = parsed.channels.voice_reply

    assert settings.enabled is True
    assert settings.channels == ["telegram/main"]
    assert settings.instructions == "sound calm"
    assert settings.speed == 1.1
    assert settings.response_format == "mp3"
    assert settings.api_key == "tts-key"
    # The "url" key in the payload is read back through ``api_base``.
    assert settings.api_base == "https://tts.example.com/v1"
|
||||
|
||||
|
||||
def test_openai_speech_provider_accepts_direct_endpoint_url() -> None:
    """An ``api_base`` that is already the speech endpoint is returned unchanged."""
    endpoint = "https://tts.example.com/v1/audio/speech"

    speech = OpenAISpeechProvider(api_key="tts-key", api_base=endpoint)

    assert speech._speech_url() == endpoint
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_telegram_voice_reply_attaches_audio_for_multi_instance_route(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A message on ``telegram/main`` gets a voice attachment when ``telegram`` is enabled.

    The speech provider is stubbed so the test can check both the generated
    media path and the exact arguments passed to synthesis.
    """
    soul_file = tmp_path / "SOUL.md"
    soul_file.write_text("default soul voice", encoding="utf-8")

    voice_reply_settings = {
        "enabled": True,
        "channels": ["telegram"],
        "instructions": "keep the delivery warm",
        "speed": 1.05,
        "responseFormat": "opus",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": voice_reply_settings},
    )
    provider.api_key = "provider-tts-key"
    provider.api_base = "https://provider.example.com/v1"

    seen: dict[str, str | float | None] = {}

    async def record_synthesis(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Write a placeholder audio file and remember how synthesis was invoked.
        target = Path(output_path)
        target.write_bytes(b"voice-bytes")
        seen.update(
            api_key=self.api_key,
            api_base=self.api_base,
            text=text,
            model=model,
            voice=voice,
            instructions=instructions,
            speed=speed,
            response_format=response_format,
        )
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", record_synthesis)

    inbound = InboundMessage(
        channel="telegram/main",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    # The "opus" response format is expected to land on disk as an .ogg file.
    voice_file = Path(response.media[0])
    assert voice_file.parent == tmp_path / "out" / "voice"
    assert voice_file.suffix == ".ogg"
    assert voice_file.read_bytes() == b"voice-bytes"

    expected_instructions = (
        "Speak as the active persona 'default'. Match that persona's tone, attitude, pacing, "
        "and emotional style while keeping the reply natural and conversational. keep the "
        "delivery warm Persona guidance: default soul voice"
    )
    assert seen == {
        "api_key": "provider-tts-key",
        "api_base": "https://provider.example.com/v1",
        "text": "hello",
        "model": "gpt-4o-mini-tts",
        "voice": "alloy",
        "instructions": expected_instructions,
        "speed": 1.05,
        "response_format": "opus",
    }
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_persona_voice_settings_override_global_voice_profile(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A persona's ``VOICE.json`` wins over the global voice and speed settings.

    The session is switched to the ``coder`` persona; the synthesized voice and
    speed must come from that persona, while the instructions must contain both
    the global and the persona-specific guidance.
    """
    (tmp_path / "SOUL.md").write_text("default soul", encoding="utf-8")

    coder_dir = tmp_path / "personas" / "coder"
    coder_dir.mkdir(parents=True)
    persona_files = {
        "SOUL.md": "speak like a sharp engineer",
        "USER.md": "be concise and technical",
        "VOICE.json": (
            '{"voice":"nova","instructions":"use a crisp and confident delivery","speed":1.2}'
        ),
    }
    for file_name, file_text in persona_files.items():
        (coder_dir / file_name).write_text(file_text, encoding="utf-8")

    global_voice_reply = {
        "enabled": True,
        "channels": ["telegram"],
        "voice": "alloy",
        "instructions": "keep the pacing steady",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": global_voice_reply},
    )
    provider.api_key = "provider-tts-key"

    # Select the persona for this chat before the message is processed.
    chat_session = loop.sessions.get_or_create("telegram:chat-1")
    chat_session.metadata["persona"] = "coder"
    loop.sessions.save(chat_session)

    seen: dict[str, str | float | None] = {}

    async def record_synthesis(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Write a placeholder audio file and remember the voice profile used.
        target = Path(output_path)
        target.write_bytes(b"voice-bytes")
        seen.update(voice=voice, instructions=instructions, speed=speed)
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", record_synthesis)

    inbound = InboundMessage(
        channel="telegram",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert len(response.media) == 1
    assert seen["voice"] == "nova"
    assert seen["speed"] == 1.2

    spoken_style = seen["instructions"]
    assert isinstance(spoken_style, str)
    expected_fragments = (
        "active persona 'coder'",
        "keep the pacing steady",
        "use a crisp and confident delivery",
        "speak like a sharp engineer",
        "be concise and technical",
    )
    for fragment in expected_fragments:
        assert fragment in spoken_style
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_qq_voice_reply_config_keeps_text_only(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With no ``responseFormat`` set, a QQ reply stays text-only and skips synthesis."""
    qq_voice_reply = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": qq_voice_reply},
    )
    provider.api_key = "provider-tts-key"

    # Any attempt to synthesize speech would be recorded on this mock.
    synthesis_spy = AsyncMock()
    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", synthesis_spy)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert response.media == []
    synthesis_spy.assert_not_awaited()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_qq_voice_reply_uses_silk_when_configured(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A QQ reply carries one ``.silk`` attachment when ``responseFormat`` is ``silk``."""
    qq_voice_reply = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
        "responseFormat": "silk",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": qq_voice_reply},
    )
    provider.api_key = "provider-tts-key"

    seen: dict[str, str | None] = {}

    async def record_synthesis(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Write a placeholder silk file and remember the requested format.
        target = Path(output_path)
        target.write_bytes(b"fake-silk")
        seen["response_format"] = response_format
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", record_synthesis)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    attachment = Path(response.media[0])
    assert attachment.suffix == ".silk"
    assert seen["response_format"] == "silk"
|
||||
Reference in New Issue
Block a user