feat(voice): add persona-driven tts and qq local media upload

2026-03-23 11:10:27 +08:00
parent b1a08f3bb9
commit d838a12b56
9 changed files with 882 additions and 96 deletions
--- a/tests/test_qq_channel.py
+++ b/tests/test_qq_channel.py
@@ -276,7 +276,6 @@ async def test_send_local_media_under_out_dir_uses_c2c_file_api(
            "params": {"openid": "user123"},
            "json": {
                "file_type": 1,
-                "url": "https://files.example.com/out/demo.png",
                "file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"),
                "srv_send_msg": False,
            },
@@ -338,7 +337,6 @@ async def test_send_local_media_in_nested_out_path_uses_relative_url(
            "params": {"openid": "user123"},
            "json": {
                "file_type": 1,
-                "url": "https://files.example.com/qq-media/shots/github.png",
                "file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"),
                "srv_send_msg": False,
            },
@@ -408,8 +406,7 @@ async def test_send_local_media_outside_out_falls_back_to_text_notice(


@pytest.mark.asyncio
-async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upload_fails(
-    monkeypatch,
+async def test_send_local_media_with_media_base_url_still_falls_back_to_text_notice_when_file_data_upload_fails(
    tmp_path,
 ) -> None:
    workspace = tmp_path / "workspace"
@@ -431,7 +428,6 @@ async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upl
    )
    channel._client = _FakeClient()
    channel._client.api.raise_on_raw_file_upload = True
-    monkeypatch.setattr("nanobot.channels.qq.validate_url_target", lambda url: (True, ""))

    await channel.send(
        OutboundMessage(
@@ -443,20 +439,12 @@ async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upl
        )
    )

-    assert channel._client.api.c2c_file_calls == [
-        {
-            "openid": "user123",
-            "file_type": 1,
-            "url": "https://files.example.com/out/demo.png",
-            "srv_send_msg": False,
-        }
-    ]
+    assert channel._client.api.c2c_file_calls == []
    assert channel._client.api.c2c_calls == [
        {
            "openid": "user123",
-            "msg_type": 7,
-            "content": "hello",
-            "media": {"file_info": "c2c-file-info", "file_uuid": "c2c-file", "ttl": 60},
+            "msg_type": 0,
+            "content": "hello\n[Failed to send: demo.png - QQ local file_data upload failed]",
            "msg_id": "msg1",
            "msg_seq": 2,
        }
@@ -596,7 +584,60 @@ async def test_send_non_image_media_from_out_falls_back_to_text_notice(
        {
            "openid": "user123",
            "msg_type": 0,
-            "content": "hello\n[Failed to send: note.txt - local delivery media must be an image]",
+            "content": (
+                "hello\n[Failed to send: note.txt - local delivery media must be an image, .mp4 video, "
+                "or .silk voice]"
+            ),
+            "msg_id": "msg1",
+            "msg_seq": 2,
+        }
+    ]
+
+
+@pytest.mark.asyncio
+async def test_send_local_silk_voice_uses_file_type_three_direct_upload(tmp_path) -> None:
+    workspace = tmp_path / "workspace"
+    workspace.mkdir()
+    out_dir = workspace / "out"
+    out_dir.mkdir()
+    source = out_dir / "reply.silk"
+    source.write_bytes(b"fake-silk")
+
+    channel = QQChannel(
+        QQConfig(app_id="app", secret="secret", allow_from=["*"]),
+        MessageBus(),
+        workspace=workspace,
+    )
+    channel._client = _FakeClient()
+
+    await channel.send(
+        OutboundMessage(
+            channel="qq",
+            chat_id="user123",
+            content="hello",
+            media=[str(source)],
+            metadata={"message_id": "msg1"},
+        )
+    )
+
+    assert channel._client.api.raw_file_upload_calls == [
+        {
+            "method": "POST",
+            "path": "/v2/users/{openid}/files",
+            "params": {"openid": "user123"},
+            "json": {
+                "file_type": 3,
+                "file_data": b64encode(b"fake-silk").decode("ascii"),
+                "srv_send_msg": False,
+            },
+        }
+    ]
+    assert channel._client.api.c2c_calls == [
+        {
+            "openid": "user123",
+            "msg_type": 7,
+            "content": "hello",
+            "media": {"file_info": "c2c-file-info", "file_uuid": "c2c-file", "ttl": 60},
            "msg_id": "msg1",
            "msg_seq": 2,
        }
--- a/tests/test_voice_reply.py
+++ b/tests/test_voice_reply.py
@@ -0,0 +1,321 @@
+"""Tests for optional outbound voice replies."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from nanobot.bus.events import InboundMessage
+from nanobot.config.schema import Config
+from nanobot.providers.base import LLMResponse
+from nanobot.providers.speech import OpenAISpeechProvider
+
+
+def _make_loop(workspace: Path, *, channels_payload: dict | None = None):
+    """Create an AgentLoop with lightweight mocks and configurable channels."""
+    from nanobot.agent.loop import AgentLoop  # imported inside the helper, not at module import
+    from nanobot.bus.queue import MessageBus
+
+    bus = MessageBus()
+    provider = MagicMock()  # mock LLM provider; individual tests overwrite api_key/api_base
+    provider.get_default_model.return_value = "test-model"
+    provider.chat_with_retry = AsyncMock(return_value=LLMResponse(content="hello", tool_calls=[]))
+    provider.api_key = ""
+    provider.api_base = None
+
+    config = Config.model_validate({"channels": channels_payload or {}})  # None -> empty section
+
+    with patch("nanobot.agent.loop.SubagentManager"):  # patched only while the loop is built
+        loop = AgentLoop(
+            bus=bus,
+            provider=provider,
+            workspace=workspace,
+            channels_config=config.channels,
+        )
+    return loop, provider  # provider returned so tests can adjust the mock
+
+
+def test_voice_reply_config_parses_camel_case() -> None:
+    config = Config.model_validate(
+        {
+            "channels": {
+                "voiceReply": {  # keys below use camelCase spelling
+                    "enabled": True,
+                    "channels": ["telegram/main"],
+                    "model": "gpt-4o-mini-tts",
+                    "voice": "alloy",
+                    "instructions": "sound calm",
+                    "speed": 1.1,
+                    "responseFormat": "mp3",
+                    "apiKey": "tts-key",
+                    "url": "https://tts.example.com/v1",  # asserted below via the api_base attribute
+                }
+            }
+        }
+    )
+
+    voice_reply = config.channels.voice_reply  # parsed section is read through snake_case attributes
+    assert voice_reply.enabled is True
+    assert voice_reply.channels == ["telegram/main"]
+    assert voice_reply.instructions == "sound calm"  # NOTE(review): "model"/"voice" are set above but never asserted
+    assert voice_reply.speed == 1.1
+    assert voice_reply.response_format == "mp3"
+    assert voice_reply.api_key == "tts-key"
+    assert voice_reply.api_base == "https://tts.example.com/v1"
+
+
+def test_openai_speech_provider_accepts_direct_endpoint_url() -> None:
+    provider = OpenAISpeechProvider(
+        api_key="tts-key",
+        api_base="https://tts.example.com/v1/audio/speech",  # base already ends in /audio/speech
+    )
+
+    assert provider._speech_url() == "https://tts.example.com/v1/audio/speech"  # expected back unchanged
+
+
+@pytest.mark.asyncio
+async def test_telegram_voice_reply_attaches_audio_for_multi_instance_route(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    (tmp_path / "SOUL.md").write_text("default soul voice", encoding="utf-8")  # text expected in instructions below
+    loop, provider = _make_loop(
+        tmp_path,
+        channels_payload={
+            "voiceReply": {
+                "enabled": True,
+                "channels": ["telegram"],  # plain name; the inbound message below uses "telegram/main"
+                "instructions": "keep the delivery warm",
+                "speed": 1.05,
+                "responseFormat": "opus",
+            }
+        },
+    )
+    provider.api_key = "provider-tts-key"  # payload has no apiKey/url; captured values must equal these
+    provider.api_base = "https://provider.example.com/v1"
+
+    captured: dict[str, str | float | None] = {}  # filled by the fake synthesizer below
+
+    async def fake_synthesize_to_file(  # patched over OpenAISpeechProvider.synthesize_to_file
+        self,
+        text: str,
+        *,
+        model: str,
+        voice: str,
+        instructions: str | None,
+        speed: float | None,
+        response_format: str,
+        output_path: str | Path,
+    ) -> Path:
+        path = Path(output_path)
+        path.write_bytes(b"voice-bytes")  # written so the returned media path can be read back
+        captured["api_key"] = self.api_key
+        captured["api_base"] = self.api_base
+        captured["text"] = text
+        captured["model"] = model
+        captured["voice"] = voice
+        captured["instructions"] = instructions
+        captured["speed"] = speed
+        captured["response_format"] = response_format
+        return path
+
+    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)
+
+    response = await loop._process_message(
+        InboundMessage(
+            channel="telegram/main",
+            sender_id="user-1",
+            chat_id="chat-1",
+            content="hello",
+        )
+    )
+
+    assert response is not None
+    assert response.content == "hello"  # text reply is still delivered alongside the audio
+    assert len(response.media) == 1
+
+    media_path = Path(response.media[0])
+    assert media_path.parent == tmp_path / "out" / "voice"
+    assert media_path.suffix == ".ogg"  # payload requests "opus"; the file is expected with an .ogg suffix
+    assert media_path.read_bytes() == b"voice-bytes"
+
+    assert captured == {
+        "api_key": "provider-tts-key",
+        "api_base": "https://provider.example.com/v1",
+        "text": "hello",
+        "model": "gpt-4o-mini-tts",  # not set in channels_payload above
+        "voice": "alloy",  # not set in channels_payload above
+        "instructions": (
+            "Speak as the active persona 'default'. Match that persona's tone, attitude, pacing, "
+            "and emotional style while keeping the reply natural and conversational. keep the "
+            "delivery warm Persona guidance: default soul voice"
+        ),
+        "speed": 1.05,
+        "response_format": "opus",
+    }
+
+
+@pytest.mark.asyncio
+async def test_persona_voice_settings_override_global_voice_profile(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    (tmp_path / "SOUL.md").write_text("default soul", encoding="utf-8")
+    persona_dir = tmp_path / "personas" / "coder"  # files for the "coder" persona live here
+    persona_dir.mkdir(parents=True)
+    (persona_dir / "SOUL.md").write_text("speak like a sharp engineer", encoding="utf-8")
+    (persona_dir / "USER.md").write_text("be concise and technical", encoding="utf-8")
+    (persona_dir / "VOICE.json").write_text(  # persona-specific voice, instructions and speed
+        '{"voice":"nova","instructions":"use a crisp and confident delivery","speed":1.2}',
+        encoding="utf-8",
+    )
+
+    loop, provider = _make_loop(
+        tmp_path,
+        channels_payload={
+            "voiceReply": {
+                "enabled": True,
+                "channels": ["telegram"],
+                "voice": "alloy",  # global value; the test expects the persona's "nova" instead
+                "instructions": "keep the pacing steady",
+            }
+        },
+    )
+    provider.api_key = "provider-tts-key"
+
+    session = loop.sessions.get_or_create("telegram:chat-1")  # presumably "<channel>:<chat_id>" — TODO confirm
+    session.metadata["persona"] = "coder"  # select the persona before the message is processed
+    loop.sessions.save(session)
+
+    captured: dict[str, str | float | None] = {}  # filled by the fake synthesizer below
+
+    async def fake_synthesize_to_file(  # patched over OpenAISpeechProvider.synthesize_to_file
+        self,
+        text: str,
+        *,
+        model: str,
+        voice: str,
+        instructions: str | None,
+        speed: float | None,
+        response_format: str,
+        output_path: str | Path,
+    ) -> Path:
+        path = Path(output_path)
+        path.write_bytes(b"voice-bytes")
+        captured["voice"] = voice
+        captured["instructions"] = instructions
+        captured["speed"] = speed
+        return path
+
+    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)
+
+    response = await loop._process_message(
+        InboundMessage(
+            channel="telegram",
+            sender_id="user-1",
+            chat_id="chat-1",
+            content="hello",
+        )
+    )
+
+    assert response is not None
+    assert len(response.media) == 1
+    assert captured["voice"] == "nova"  # from VOICE.json, not the global "alloy"
+    assert captured["speed"] == 1.2  # from VOICE.json; the global payload sets no speed
+    assert isinstance(captured["instructions"], str)
+    assert "active persona 'coder'" in captured["instructions"]
+    assert "keep the pacing steady" in captured["instructions"]  # global voiceReply instructions
+    assert "use a crisp and confident delivery" in captured["instructions"]  # VOICE.json instructions
+    assert "speak like a sharp engineer" in captured["instructions"]  # personas/coder/SOUL.md
+    assert "be concise and technical" in captured["instructions"]  # personas/coder/USER.md
+
+
+@pytest.mark.asyncio
+async def test_qq_voice_reply_config_keeps_text_only(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    loop, provider = _make_loop(
+        tmp_path,
+        channels_payload={
+            "voiceReply": {
+                "enabled": True,
+                "channels": ["qq"],
+                "apiKey": "tts-key",  # note: this payload has no "responseFormat" key
+            }
+        },
+    )
+    provider.api_key = "provider-tts-key"
+
+    synthesize = AsyncMock()  # records awaits; asserted below to have none
+    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", synthesize)
+
+    response = await loop._process_message(
+        InboundMessage(
+            channel="qq",
+            sender_id="user-1",
+            chat_id="chat-1",
+            content="hello",
+        )
+    )
+
+    assert response is not None
+    assert response.content == "hello"
+    assert response.media == []  # no audio attachment on the reply
+    synthesize.assert_not_awaited()  # speech synthesis must not have been invoked
+
+
+@pytest.mark.asyncio
+async def test_qq_voice_reply_uses_silk_when_configured(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    loop, provider = _make_loop(
+        tmp_path,
+        channels_payload={
+            "voiceReply": {
+                "enabled": True,
+                "channels": ["qq"],
+                "apiKey": "tts-key",
+                "responseFormat": "silk",  # explicit silk format is what this test exercises
+            }
+        },
+    )
+    provider.api_key = "provider-tts-key"
+
+    captured: dict[str, str | None] = {}  # filled by the fake synthesizer below
+
+    async def fake_synthesize_to_file(  # patched over OpenAISpeechProvider.synthesize_to_file
+        self,
+        text: str,
+        *,
+        model: str,
+        voice: str,
+        instructions: str | None,
+        speed: float | None,
+        response_format: str,
+        output_path: str | Path,
+    ) -> Path:
+        path = Path(output_path)
+        path.write_bytes(b"fake-silk")  # placeholder bytes written to the requested output path
+        captured["response_format"] = response_format
+        return path
+
+    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", fake_synthesize_to_file)
+
+    response = await loop._process_message(
+        InboundMessage(
+            channel="qq",
+            sender_id="user-1",
+            chat_id="chat-1",
+            content="hello",
+        )
+    )
+
+    assert response is not None
+    assert response.content == "hello"
+    assert len(response.media) == 1  # one audio attachment alongside the text
+    assert Path(response.media[0]).suffix == ".silk"  # output file suffix matches the requested format
+    assert captured["response_format"] == "silk"  # format is passed through to the synthesizer