322 lines
9.7 KiB
Python
322 lines
9.7 KiB
Python
"""Tests for optional outbound voice replies."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from nanobot.bus.events import InboundMessage
|
|
from nanobot.config.schema import Config
|
|
from nanobot.providers.base import LLMResponse
|
|
from nanobot.providers.speech import OpenAISpeechProvider
|
|
|
|
|
|
def _make_loop(workspace: Path, *, channels_payload: dict | None = None):
    """Build an ``AgentLoop`` backed by a mocked provider for voice-reply tests.

    Args:
        workspace: Directory handed to the loop as its workspace.
        channels_payload: Raw ``channels`` config mapping; an empty mapping is
            validated when this is ``None`` (or otherwise falsy).

    Returns:
        A ``(loop, provider)`` tuple so callers can tweak the mocked provider
        (for example its ``api_key``) after construction.
    """
    from nanobot.agent.loop import AgentLoop
    from nanobot.bus.queue import MessageBus

    bus = MessageBus()

    # The mocked provider always answers "hello" with no tool calls.
    canned_reply = LLMResponse(content="hello", tool_calls=[])
    provider = MagicMock()
    provider.get_default_model.return_value = "test-model"
    provider.chat_with_retry = AsyncMock(return_value=canned_reply)
    provider.api_key = ""
    provider.api_base = None

    raw_config = {"channels": channels_payload or {}}
    config = Config.model_validate(raw_config)

    # SubagentManager is replaced by a mock while the loop is constructed.
    with patch("nanobot.agent.loop.SubagentManager"):
        loop = AgentLoop(
            bus=bus,
            provider=provider,
            workspace=workspace,
            channels_config=config.channels,
        )

    return loop, provider
|
|
|
|
|
|
def test_voice_reply_config_parses_camel_case() -> None:
    """Camel-case ``voiceReply`` keys are exposed as snake_case attributes."""
    voice_reply_payload = {
        "enabled": True,
        "channels": ["telegram/main"],
        "model": "gpt-4o-mini-tts",
        "voice": "alloy",
        "instructions": "sound calm",
        "speed": 1.1,
        "responseFormat": "mp3",
        "apiKey": "tts-key",
        "url": "https://tts.example.com/v1",
    }

    config = Config.model_validate({"channels": {"voiceReply": voice_reply_payload}})
    voice_reply = config.channels.voice_reply

    # ``enabled`` must be the real boolean, not merely something truthy.
    assert voice_reply.enabled is True

    # Note that the ``url`` key is read back through ``api_base``.
    expected_attributes = {
        "channels": ["telegram/main"],
        "instructions": "sound calm",
        "speed": 1.1,
        "response_format": "mp3",
        "api_key": "tts-key",
        "api_base": "https://tts.example.com/v1",
    }
    for attribute, expected in expected_attributes.items():
        assert getattr(voice_reply, attribute) == expected
|
|
|
|
|
|
def test_openai_speech_provider_accepts_direct_endpoint_url() -> None:
    """An ``api_base`` that already ends in ``/audio/speech`` is used as-is."""
    direct_endpoint = "https://tts.example.com/v1/audio/speech"

    provider = OpenAISpeechProvider(api_key="tts-key", api_base=direct_endpoint)

    assert provider._speech_url() == direct_endpoint
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_telegram_voice_reply_attaches_audio_for_multi_instance_route(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A ``telegram/main`` message gets one audio attachment when ``telegram`` is enabled.

    Also checks that the synthesizer receives the provider's credentials and
    the expected instructions, and that ``opus`` output is stored as ``.ogg``.
    """
    (tmp_path / "SOUL.md").write_text("default soul voice", encoding="utf-8")

    voice_reply_settings = {
        "enabled": True,
        "channels": ["telegram"],
        "instructions": "keep the delivery warm",
        "speed": 1.05,
        "responseFormat": "opus",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": voice_reply_settings},
    )
    provider.api_key = "provider-tts-key"
    provider.api_base = "https://provider.example.com/v1"

    captured: dict[str, str | float | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Write placeholder audio and record every argument for later checks.
        target = Path(output_path)
        target.write_bytes(b"voice-bytes")
        captured.update(
            api_key=self.api_key,
            api_base=self.api_base,
            text=text,
            model=model,
            voice=voice,
            instructions=instructions,
            speed=speed,
            response_format=response_format,
        )
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="telegram/main",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    audio_file = Path(response.media[0])
    assert audio_file.parent == tmp_path / "out" / "voice"
    assert audio_file.suffix == ".ogg"
    assert audio_file.read_bytes() == b"voice-bytes"

    expected_instructions = (
        "Speak as the active persona 'default'. Match that persona's tone, attitude, pacing, "
        "and emotional style while keeping the reply natural and conversational. keep the "
        "delivery warm Persona guidance: default soul voice"
    )
    assert captured == {
        "api_key": "provider-tts-key",
        "api_base": "https://provider.example.com/v1",
        "text": "hello",
        "model": "gpt-4o-mini-tts",
        "voice": "alloy",
        "instructions": expected_instructions,
        "speed": 1.05,
        "response_format": "opus",
    }
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_persona_voice_settings_override_global_voice_profile(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A persona's ``VOICE.json`` sets voice and speed; instruction texts are combined.

    The session is pinned to the ``coder`` persona, whose voice and speed must
    reach the synthesizer, while the instructions must contain both the global
    and the persona-level guidance.
    """
    (tmp_path / "SOUL.md").write_text("default soul", encoding="utf-8")

    coder_dir = tmp_path / "personas" / "coder"
    coder_dir.mkdir(parents=True)
    persona_files = {
        "SOUL.md": "speak like a sharp engineer",
        "USER.md": "be concise and technical",
        "VOICE.json": (
            '{"voice":"nova","instructions":"use a crisp and confident delivery","speed":1.2}'
        ),
    }
    for file_name, body in persona_files.items():
        (coder_dir / file_name).write_text(body, encoding="utf-8")

    global_voice_settings = {
        "enabled": True,
        "channels": ["telegram"],
        "voice": "alloy",
        "instructions": "keep the pacing steady",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": global_voice_settings},
    )
    provider.api_key = "provider-tts-key"

    # Select the persona on the session that the inbound message will use.
    session = loop.sessions.get_or_create("telegram:chat-1")
    session.metadata["persona"] = "coder"
    loop.sessions.save(session)

    captured: dict[str, str | float | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Only the persona-sensitive arguments matter for this test.
        target = Path(output_path)
        target.write_bytes(b"voice-bytes")
        captured.update(voice=voice, instructions=instructions, speed=speed)
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="telegram",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert len(response.media) == 1
    assert captured["voice"] == "nova"
    assert captured["speed"] == 1.2

    spoken_instructions = captured["instructions"]
    assert isinstance(spoken_instructions, str)
    expected_fragments = (
        "active persona 'coder'",
        "keep the pacing steady",
        "use a crisp and confident delivery",
        "speak like a sharp engineer",
        "be concise and technical",
    )
    for fragment in expected_fragments:
        assert fragment in spoken_instructions
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_qq_voice_reply_config_keeps_text_only(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With no ``responseFormat`` set, a ``qq`` reply stays text-only.

    The response must carry no media and the synthesizer must never be awaited.
    """
    qq_voice_settings = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": qq_voice_settings},
    )
    provider.api_key = "provider-tts-key"

    synthesize = AsyncMock()
    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", synthesize)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert response.media == []
    synthesize.assert_not_awaited()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_qq_voice_reply_uses_silk_when_configured(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``responseFormat`` set to ``silk``, a ``qq`` reply gets a ``.silk`` attachment."""
    qq_voice_settings = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
        "responseFormat": "silk",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": qq_voice_settings},
    )
    provider.api_key = "provider-tts-key"

    captured: dict[str, str | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        # Write placeholder audio and remember the format that was requested.
        target = Path(output_path)
        target.write_bytes(b"fake-silk")
        captured["response_format"] = response_format
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    attachment = Path(response.media[0])
    assert attachment.suffix == ".silk"
    assert captured["response_format"] == "silk"
|