# nanobot/tests/test_voice_reply.py

"""Tests for optional outbound voice replies."""

from __future__ import annotations

from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from nanobot.bus.events import InboundMessage
from nanobot.config.schema import Config
from nanobot.providers.base import LLMResponse
from nanobot.providers.speech import OpenAISpeechProvider


def _make_loop(workspace: Path, *, channels_payload: dict | None = None):
    """Build an ``AgentLoop`` around a mocked LLM provider.

    Args:
        workspace: Directory handed to the loop as its workspace.
        channels_payload: Raw ``channels`` config section to validate; a falsy
            value is replaced by an empty section.

    Returns:
        A ``(loop, provider)`` tuple so callers can tweak the mocked provider
        after the loop has been constructed.
    """
    from nanobot.agent.loop import AgentLoop
    from nanobot.bus.queue import MessageBus

    message_bus = MessageBus()

    # Stand-in provider: every chat call resolves to a plain "hello" reply.
    canned_reply = LLMResponse(content="hello", tool_calls=[])
    provider = MagicMock(api_key="", api_base=None)
    provider.get_default_model.return_value = "test-model"
    provider.chat_with_retry = AsyncMock(return_value=canned_reply)

    channels_section = channels_payload if channels_payload else {}
    parsed_config = Config.model_validate({"channels": channels_section})

    # SubagentManager is patched only while the loop is being constructed.
    with patch("nanobot.agent.loop.SubagentManager"):
        loop = AgentLoop(
            bus=message_bus,
            provider=provider,
            workspace=workspace,
            channels_config=parsed_config.channels,
        )

    return loop, provider


def test_voice_reply_config_parses_camel_case() -> None:
    """Check that a camelCase ``voiceReply`` section populates every field.

    The payload uses camelCase keys (``voiceReply``, ``responseFormat``,
    ``apiKey``) plus the ``url`` key, and the assertions read the parsed
    values back through the snake_case attributes of
    ``config.channels.voice_reply``.
    """
    config = Config.model_validate(
        {
            "channels": {
                "voiceReply": {
                    "enabled": True,
                    "channels": ["telegram/main"],
                    "model": "gpt-4o-mini-tts",
                    "voice": "alloy",
                    "instructions": "sound calm",
                    "speed": 1.1,
                    "responseFormat": "mp3",
                    "apiKey": "tts-key",
                    "url": "https://tts.example.com/v1",
                }
            }
        }
    )

    voice_reply = config.channels.voice_reply
    assert voice_reply.enabled is True
    assert voice_reply.channels == ["telegram/main"]
    # ``model`` and ``voice`` are part of the payload too; assert them so a
    # dropped or renamed field cannot go unnoticed.
    # NOTE(review): assumes the schema exposes these as ``.model`` / ``.voice``,
    # mirroring the other same-named keys — confirm against the schema.
    assert voice_reply.model == "gpt-4o-mini-tts"
    assert voice_reply.voice == "alloy"
    assert voice_reply.instructions == "sound calm"
    assert voice_reply.speed == 1.1
    assert voice_reply.response_format == "mp3"
    assert voice_reply.api_key == "tts-key"
    # The ``url`` key is expected to land on ``api_base``.
    assert voice_reply.api_base == "https://tts.example.com/v1"


def test_openai_speech_provider_accepts_direct_endpoint_url() -> None:
    """An ``api_base`` that already names the speech endpoint is returned unchanged."""
    direct_endpoint = "https://tts.example.com/v1/audio/speech"

    speech_provider = OpenAISpeechProvider(api_key="tts-key", api_base=direct_endpoint)

    assert speech_provider._speech_url() == direct_endpoint


@pytest.mark.asyncio
async def test_telegram_voice_reply_attaches_audio_for_multi_instance_route(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A ``telegram/main`` message gets one voice file when ``telegram`` is listed.

    Synthesis is replaced by a fake that writes fixed bytes and records its
    arguments, so the test can check both the attached media path and the
    exact parameters passed to the speech provider.
    """
    soul_file = tmp_path / "SOUL.md"
    soul_file.write_text("default soul voice", encoding="utf-8")

    voice_reply_settings = {
        "enabled": True,
        "channels": ["telegram"],
        "instructions": "keep the delivery warm",
        "speed": 1.05,
        "responseFormat": "opus",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": voice_reply_settings},
    )
    # No TTS credentials are configured above; the test expects the LLM
    # provider's key and base URL to reach the speech provider instead.
    provider.api_key = "provider-tts-key"
    provider.api_base = "https://provider.example.com/v1"

    captured: dict[str, str | float | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        """Write placeholder audio and record every argument received."""
        target = Path(output_path)
        target.write_bytes(b"voice-bytes")
        captured.update(
            api_key=self.api_key,
            api_base=self.api_base,
            text=text,
            model=model,
            voice=voice,
            instructions=instructions,
            speed=speed,
            response_format=response_format,
        )
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="telegram/main",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    media_path = Path(response.media[0])
    assert media_path.parent == tmp_path / "out" / "voice"
    assert media_path.suffix == ".ogg"
    assert media_path.read_bytes() == b"voice-bytes"

    # Expected prompt: persona preamble, configured instructions, then the
    # SOUL.md text, separated by single spaces.
    expected_instructions = " ".join(
        (
            "Speak as the active persona 'default'. Match that persona's tone, attitude, "
            "pacing, and emotional style while keeping the reply natural and conversational.",
            "keep the delivery warm",
            "Persona guidance: default soul voice",
        )
    )
    assert captured == dict(
        api_key="provider-tts-key",
        api_base="https://provider.example.com/v1",
        text="hello",
        model="gpt-4o-mini-tts",
        voice="alloy",
        instructions=expected_instructions,
        speed=1.05,
        response_format="opus",
    )


@pytest.mark.asyncio
async def test_persona_voice_settings_override_global_voice_profile(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The persona's ``VOICE.json`` voice and speed win over the global config.

    The session is pinned to the ``coder`` persona; the synthesized
    instructions are expected to mention the persona name and contain the
    global instructions, the persona voice instructions, and the persona's
    SOUL.md and USER.md texts.
    """
    (tmp_path / "SOUL.md").write_text("default soul", encoding="utf-8")

    persona_dir = tmp_path / "personas" / "coder"
    persona_dir.mkdir(parents=True)
    persona_files = {
        "SOUL.md": "speak like a sharp engineer",
        "USER.md": "be concise and technical",
        "VOICE.json": (
            '{"voice":"nova","instructions":"use a crisp and confident delivery","speed":1.2}'
        ),
    }
    for filename, body in persona_files.items():
        (persona_dir / filename).write_text(body, encoding="utf-8")

    global_voice_reply = {
        "enabled": True,
        "channels": ["telegram"],
        "voice": "alloy",
        "instructions": "keep the pacing steady",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": global_voice_reply},
    )
    provider.api_key = "provider-tts-key"

    # Select the "coder" persona for the chat the inbound message targets.
    chat_session = loop.sessions.get_or_create("telegram:chat-1")
    chat_session.metadata["persona"] = "coder"
    loop.sessions.save(chat_session)

    captured: dict[str, str | float | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        """Write placeholder audio and record the voice-profile arguments."""
        target = Path(output_path)
        target.write_bytes(b"voice-bytes")
        captured.update(voice=voice, instructions=instructions, speed=speed)
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="telegram",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert len(response.media) == 1
    assert captured["voice"] == "nova"
    assert captured["speed"] == 1.2

    spoken_instructions = captured["instructions"]
    assert isinstance(spoken_instructions, str)
    expected_fragments = (
        "active persona 'coder'",
        "keep the pacing steady",
        "use a crisp and confident delivery",
        "speak like a sharp engineer",
        "be concise and technical",
    )
    for fragment in expected_fragments:
        assert fragment in spoken_instructions


@pytest.mark.asyncio
async def test_qq_voice_reply_config_keeps_text_only(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``qq`` enabled but no ``responseFormat`` set, the reply has no media.

    The speech provider's ``synthesize_to_file`` is swapped for an
    ``AsyncMock`` so the test can also verify it is never awaited.
    """
    qq_voice_reply = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": qq_voice_reply},
    )
    provider.api_key = "provider-tts-key"

    synthesize_spy = AsyncMock()
    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", synthesize_spy)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert response.media == []
    synthesize_spy.assert_not_awaited()


@pytest.mark.asyncio
async def test_qq_voice_reply_uses_silk_when_configured(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``responseFormat`` set to ``silk``, a ``qq`` reply carries a ``.silk`` file.

    Synthesis is faked; the fake records the ``response_format`` it was
    called with so the test can confirm ``silk`` was requested.
    """
    silk_voice_reply = {
        "enabled": True,
        "channels": ["qq"],
        "apiKey": "tts-key",
        "responseFormat": "silk",
    }
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={"voiceReply": silk_voice_reply},
    )
    provider.api_key = "provider-tts-key"

    captured: dict[str, str | None] = {}

    async def fake_synthesize_to_file(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        """Write placeholder silk bytes and record the requested format."""
        target = Path(output_path)
        target.write_bytes(b"fake-silk")
        captured.update(response_format=response_format)
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    attached_audio = Path(response.media[0])
    assert attached_audio.suffix == ".silk"
    assert captured["response_format"] == "silk"