From d838a12b566f59877e1b8f507a888a3390e42a72 Mon Sep 17 00:00:00 2001 From: Hua Date: Mon, 23 Mar 2026 11:10:27 +0800 Subject: [PATCH] feat(voice): add persona-driven tts and qq local media upload --- AGENTS.md | 7 +- README.md | 61 ++++++- nanobot/agent/loop.py | 156 +++++++++++++++++- nanobot/agent/personas.py | 102 ++++++++++++ nanobot/channels/qq.py | 147 +++++++++-------- nanobot/config/schema.py | 21 ++- nanobot/providers/speech.py | 88 ++++++++++ tests/test_qq_channel.py | 75 +++++++-- tests/test_voice_reply.py | 321 ++++++++++++++++++++++++++++++++++++ 9 files changed, 882 insertions(+), 96 deletions(-) create mode 100644 nanobot/providers/speech.py create mode 100644 tests/test_voice_reply.py diff --git a/AGENTS.md b/AGENTS.md index 79fee02..781d05a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,10 +33,13 @@ Do not commit real API keys, tokens, chat logs, or workspace data. Keep local se - `/skill` currently supports `search`, `install`, `uninstall`, `list`, and `update`. Keep subcommand dispatch in `nanobot/agent/loop.py`. - `/mcp` supports the default `list` behavior (and explicit `/mcp list`) to show configured MCP servers and registered MCP tools. - `/status` should return plain-text runtime info for the active session and stay wired into `/help` plus Telegram's command menu/localization coverage. -- Agent runtime config should be hot-reloaded from the active `config.json` for safe in-process fields such as `tools.mcpServers`, `tools.web.*`, `tools.exec.*`, `tools.restrictToWorkspace`, `agents.defaults.model`, `agents.defaults.maxToolIterations`, `agents.defaults.contextWindowTokens`, `agents.defaults.maxTokens`, `agents.defaults.temperature`, `agents.defaults.reasoningEffort`, `channels.sendProgress`, and `channels.sendToolHints`. Channel connection settings and provider credentials still require a restart. 
+- Agent runtime config should be hot-reloaded from the active `config.json` for safe in-process fields such as `tools.mcpServers`, `tools.web.*`, `tools.exec.*`, `tools.restrictToWorkspace`, `agents.defaults.model`, `agents.defaults.maxToolIterations`, `agents.defaults.contextWindowTokens`, `agents.defaults.maxTokens`, `agents.defaults.temperature`, `agents.defaults.reasoningEffort`, `channels.sendProgress`, `channels.sendToolHints`, and `channels.voiceReply.*`. Channel connection settings and provider credentials still require a restart. - nanobot does not expose local files over HTTP. If a feature needs a public URL for local files, provide your own static file server and point config such as `mediaBaseUrl` at it. - Generated screenshots, downloads, and other temporary user-delivery artifacts should be written under `workspace/out`, not the workspace root. Treat that as the generic delivery-artifact root for tools, MCP servers, and skills. -- QQ outbound media sends remote `http(s)` image URLs directly. For local QQ images, try `file_data` upload first. If `mediaBaseUrl` is configured, keep the URL-based path available as a fallback for SDK/runtime compatibility; without it, there is no URL fallback. +- QQ outbound media can send remote rich-media URLs directly. For local QQ media under `workspace/out`, use direct `file_data` upload only; do not rely on URL fallback for local files. Supported local QQ rich media are images, `.mp4` video, and `.silk` voice. +- `channels.voiceReply` currently adds TTS attachments on supported outbound channels such as Telegram, and QQ when the configured TTS endpoint returns `silk`. Preserve plain-text fallback when QQ voice requirements are not met. +- Voice replies should follow the active session persona. 
Build TTS style instructions from the resolved persona's prompt files, and allow optional persona-local overrides from `VOICE.json` under the persona workspace (`<workspace>/VOICE.json` for default, `<workspace>/personas/<name>/VOICE.json` for custom personas). +- `channels.voiceReply.url` may override the TTS endpoint independently of the chat model provider. When omitted, fall back to the active conversation provider URL. Keep `apiBase` accepted as a compatibility alias. - `/skill` shells out to `npx clawhub@latest`; it requires Node.js/`npx` at runtime. - `/skill uninstall` runs in a non-interactive context, so keep passing `--yes` when shelling out to ClawHub. - Treat empty `/skill search` output as a user-visible "no results" case rather than a silent success. Surface npm/registry failures directly to the user. diff --git a/README.md b/README.md index 45c2852..ed6a84d 100644 --- a/README.md +++ b/README.md @@ -264,6 +264,57 @@ That's it! You have a working AI assistant in 2 minutes. `baseUrl` can point either to the SearXNG root (for example `http://localhost:8080`) or directly to `/search`. +### Optional: Voice Replies + +Enable `channels.voiceReply` when you want nanobot to attach a synthesized voice reply on +supported outbound channels such as Telegram. QQ voice replies are also supported when your TTS +endpoint can return `silk`. + +```json +{ + "channels": { + "voiceReply": { + "enabled": true, + "channels": ["telegram"], + "url": "https://your-tts-endpoint.example.com/v1", + "model": "gpt-4o-mini-tts", + "voice": "alloy", + "instructions": "keep the delivery calm and clear", + "speed": 1.0, + "responseFormat": "opus" + } + } +} +``` + +`voiceReply` currently adds a voice attachment while keeping the normal text reply. For QQ voice +delivery, use `responseFormat: "silk"` because QQ local voice upload expects `.silk`. If `apiKey` +and `apiBase` are omitted, nanobot falls back to the active provider credentials; use an +OpenAI-compatible TTS endpoint for this. 
+`voiceReply.url` is optional and can point either to a provider base URL such as +`https://api.openai.com/v1` or directly to an `/audio/speech` endpoint. If omitted, nanobot uses +the current conversation provider URL. `apiBase` remains supported as a legacy alias. + +Voice replies automatically follow the active session persona. nanobot builds TTS style +instructions from that persona's `SOUL.md` and `USER.md`, so switching `/persona` changes both the +text response style and the generated speech style together. + +If a specific persona needs a fixed voice or speaking pattern, add `VOICE.json` under the persona +workspace: + +- Default persona: `<workspace>/VOICE.json` +- Custom persona: `<workspace>/personas/<name>/VOICE.json` + +Example: + +```json +{ + "voice": "nova", + "instructions": "sound crisp, confident, and slightly faster than normal", + "speed": 1.15 +} +``` + ## 💬 Chat Apps Connect nanobot to your favorite chat platform. Want to build your own? See the [Channel Plugin Guide](./docs/CHANNEL_PLUGIN_GUIDE.md). @@ -708,10 +759,10 @@ Uses **botpy SDK** with WebSocket — no public IP required. Currently supports } ``` -`mediaBaseUrl` is optional. For local QQ images, nanobot will first try direct `file_data` upload -from generated delivery artifacts under `workspace/out`. Configuring `mediaBaseUrl` is still -recommended, because nanobot can then map those files onto your own static file server and fall -back to the URL-based rich-media flow when needed. +For local QQ media, nanobot uploads files directly with `file_data` from generated delivery +artifacts under `workspace/out`. Local uploads do not require `mediaBaseUrl`, and nanobot does not +fall back to URL-based upload for local files anymore. Supported local QQ rich media are images, +`.mp4` video, and `.silk` voice. Multi-bot example: @@ -1245,7 +1296,7 @@ Use `toolTimeout` to override the default 30s per-call timeout for slow servers: ``` MCP tools are automatically discovered and registered on startup. 
The LLM can use them alongside built-in tools — no extra configuration needed. -nanobot hot-reloads agent runtime config from the active `config.json` on the next message, including `tools.mcpServers`, `tools.web.*`, `tools.exec.*`, `tools.restrictToWorkspace`, `agents.defaults.model`, `agents.defaults.maxToolIterations`, `agents.defaults.contextWindowTokens`, `agents.defaults.maxTokens`, `agents.defaults.temperature`, `agents.defaults.reasoningEffort`, `channels.sendProgress`, and `channels.sendToolHints`. Channel connection settings and provider credentials still require a restart. +nanobot hot-reloads agent runtime config from the active `config.json` on the next message, including `tools.mcpServers`, `tools.web.*`, `tools.exec.*`, `tools.restrictToWorkspace`, `agents.defaults.model`, `agents.defaults.maxToolIterations`, `agents.defaults.contextWindowTokens`, `agents.defaults.maxTokens`, `agents.defaults.temperature`, `agents.defaults.reasoningEffort`, `channels.sendProgress`, `channels.sendToolHints`, and `channels.voiceReply.*`. Channel connection settings and provider credentials still require a restart. 
diff --git a/nanobot/agent/loop.py b/nanobot/agent/loop.py index b5321c2..57a66d4 100644 --- a/nanobot/agent/loop.py +++ b/nanobot/agent/loop.py @@ -28,6 +28,7 @@ from nanobot.agent.i18n import ( text, ) from nanobot.agent.memory import MemoryConsolidator +from nanobot.agent.personas import build_persona_voice_instructions, load_persona_voice_settings from nanobot.agent.skills import BUILTIN_SKILLS_DIR from nanobot.agent.subagent import SubagentManager from nanobot.agent.tools.cron import CronTool @@ -40,8 +41,9 @@ from nanobot.agent.tools.web import WebFetchTool, WebSearchTool from nanobot.bus.events import InboundMessage, OutboundMessage from nanobot.bus.queue import MessageBus from nanobot.providers.base import LLMProvider +from nanobot.providers.speech import OpenAISpeechProvider from nanobot.session.manager import Session, SessionManager -from nanobot.utils.helpers import build_status_content +from nanobot.utils.helpers import build_status_content, ensure_dir, safe_filename if TYPE_CHECKING: from nanobot.config.schema import ChannelsConfig, ExecToolConfig @@ -675,6 +677,137 @@ class AgentLoop: metadata={"render_as": "text"}, ) + @staticmethod + def _voice_reply_extension(response_format: str) -> str: + """Map TTS response formats to delivery file extensions.""" + return { + "opus": ".ogg", + "mp3": ".mp3", + "aac": ".aac", + "flac": ".flac", + "wav": ".wav", + "pcm": ".pcm", + "silk": ".silk", + }.get(response_format, f".{response_format}") + + @staticmethod + def _channel_base_name(channel: str) -> str: + """Normalize multi-instance channel routes such as telegram/main.""" + return channel.split("/", 1)[0].lower() + + def _voice_reply_enabled_for_channel(self, channel: str) -> bool: + """Return True when voice replies are enabled for the given channel.""" + cfg = getattr(self.channels_config, "voice_reply", None) + if not cfg or not getattr(cfg, "enabled", False): + return False + route_name = channel.lower() + base_name = self._channel_base_name(channel) + 
enabled_channels = { + name.lower() for name in getattr(cfg, "channels", []) if isinstance(name, str) + } + if route_name not in enabled_channels and base_name not in enabled_channels: + return False + if base_name == "qq": + return getattr(cfg, "response_format", "opus") == "silk" + return base_name in {"telegram", "qq"} + + def _voice_reply_profile( + self, + persona: str | None, + ) -> tuple[str, str | None, float | None]: + """Resolve voice, instructions, and speed for the active persona.""" + cfg = getattr(self.channels_config, "voice_reply", None) + persona_voice = load_persona_voice_settings(self.workspace, persona) + + extra_instructions = [ + value.strip() + for value in ( + getattr(cfg, "instructions", "") if cfg is not None else "", + persona_voice.instructions or "", + ) + if isinstance(value, str) and value.strip() + ] + instructions = build_persona_voice_instructions( + self.workspace, + persona, + extra_instructions=" ".join(extra_instructions) if extra_instructions else None, + ) + voice = persona_voice.voice or getattr(cfg, "voice", "alloy") + speed = ( + persona_voice.speed + if persona_voice.speed is not None + else getattr(cfg, "speed", None) if cfg is not None else None + ) + return voice, instructions, speed + + async def _maybe_attach_voice_reply( + self, + outbound: OutboundMessage | None, + *, + persona: str | None = None, + ) -> OutboundMessage | None: + """Optionally synthesize the final text reply into a voice attachment.""" + if ( + outbound is None + or not outbound.content + or not self._voice_reply_enabled_for_channel(outbound.channel) + ): + return outbound + + cfg = getattr(self.channels_config, "voice_reply", None) + if cfg is None: + return outbound + + api_key = (getattr(cfg, "api_key", "") or getattr(self.provider, "api_key", "") or "").strip() + if not api_key: + logger.warning( + "Voice reply enabled for {}, but no TTS api_key is configured", + outbound.channel, + ) + return outbound + + api_base = ( + getattr(cfg, 
"api_base", "") + or getattr(self.provider, "api_base", "") + or "https://api.openai.com/v1" + ).strip() + response_format = getattr(cfg, "response_format", "opus") + model = getattr(cfg, "model", "gpt-4o-mini-tts") + voice, instructions, speed = self._voice_reply_profile(persona) + media_dir = ensure_dir(self.workspace / "out" / "voice") + filename = safe_filename( + f"{outbound.channel}_{outbound.chat_id}_{int(time.time() * 1000)}" + ) + self._voice_reply_extension(response_format) + output_path = media_dir / filename + + try: + provider = OpenAISpeechProvider(api_key=api_key, api_base=api_base) + await provider.synthesize_to_file( + outbound.content, + model=model, + voice=voice, + instructions=instructions, + speed=speed, + response_format=response_format, + output_path=output_path, + ) + except Exception: + logger.exception( + "Failed to synthesize voice reply for {}:{}", + outbound.channel, + outbound.chat_id, + ) + return outbound + + return OutboundMessage( + channel=outbound.channel, + chat_id=outbound.chat_id, + content=outbound.content, + reply_to=outbound.reply_to, + media=[*(outbound.media or []), str(output_path)], + metadata=dict(outbound.metadata or {}), + ) + async def _run_agent_loop( self, initial_messages: list[dict], @@ -1072,8 +1205,14 @@ class AgentLoop: self._save_turn(session, all_msgs, 1 + len(history)) self.sessions.save(session) self._ensure_background_token_consolidation(session) - return OutboundMessage(channel=channel, chat_id=chat_id, - content=final_content or "Background task completed.") + return await self._maybe_attach_voice_reply( + OutboundMessage( + channel=channel, + chat_id=chat_id, + content=final_content or "Background task completed.", + ), + persona=persona, + ) preview = msg.content[:80] + "..." if len(msg.content) > 80 else msg.content logger.info("Processing message from {}:{}: {}", msg.channel, msg.sender_id, preview) @@ -1156,9 +1295,14 @@ class AgentLoop: preview = final_content[:120] + "..." 
if len(final_content) > 120 else final_content logger.info("Response to {}:{}: {}", msg.channel, msg.sender_id, preview) - return OutboundMessage( - channel=msg.channel, chat_id=msg.chat_id, content=final_content, - metadata=msg.metadata or {}, + return await self._maybe_attach_voice_reply( + OutboundMessage( + channel=msg.channel, + chat_id=msg.chat_id, + content=final_content, + metadata=msg.metadata or {}, + ), + persona=persona, ) @staticmethod diff --git a/nanobot/agent/personas.py b/nanobot/agent/personas.py index 3f2572f..73e38ea 100644 --- a/nanobot/agent/personas.py +++ b/nanobot/agent/personas.py @@ -2,12 +2,29 @@ from __future__ import annotations +import json import re +from dataclasses import dataclass from pathlib import Path +from loguru import logger + DEFAULT_PERSONA = "default" PERSONAS_DIRNAME = "personas" +PERSONA_VOICE_FILENAME = "VOICE.json" _VALID_PERSONA_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_-]{0,63}$") +_VOICE_MARKDOWN_RE = re.compile(r"(```[\s\S]*?```|`[^`]*`|!\[[^\]]*\]\([^)]+\)|[#>*_~-]+)") +_VOICE_WHITESPACE_RE = re.compile(r"\s+") +_VOICE_MAX_GUIDANCE_CHARS = 1200 + + +@dataclass(frozen=True) +class PersonaVoiceSettings: + """Optional persona-level voice synthesis overrides.""" + + voice: str | None = None + instructions: str | None = None + speed: float | None = None def normalize_persona_name(name: str | None) -> str | None: @@ -64,3 +81,88 @@ def persona_workspace(workspace: Path, persona: str | None) -> Path: if resolved in (None, DEFAULT_PERSONA): return workspace return personas_root(workspace) / resolved + + +def load_persona_voice_settings(workspace: Path, persona: str | None) -> PersonaVoiceSettings: + """Load optional persona voice overrides from VOICE.json.""" + path = persona_workspace(workspace, persona) / PERSONA_VOICE_FILENAME + if not path.exists(): + return PersonaVoiceSettings() + + try: + data = json.loads(path.read_text(encoding="utf-8")) + except (OSError, ValueError) as exc: + logger.warning("Failed to load 
persona voice config {}: {}", path, exc) + return PersonaVoiceSettings() + + if not isinstance(data, dict): + logger.warning("Ignoring persona voice config {} because it is not a JSON object", path) + return PersonaVoiceSettings() + + voice = data.get("voice") + if isinstance(voice, str): + voice = voice.strip() or None + else: + voice = None + + instructions = data.get("instructions") + if isinstance(instructions, str): + instructions = instructions.strip() or None + else: + instructions = None + + speed = data.get("speed") + if isinstance(speed, (int, float)): + speed = float(speed) + if not 0.25 <= speed <= 4.0: + logger.warning( + "Ignoring persona voice speed from {} because it is outside 0.25-4.0", + path, + ) + speed = None + else: + speed = None + + return PersonaVoiceSettings(voice=voice, instructions=instructions, speed=speed) + + +def build_persona_voice_instructions( + workspace: Path, + persona: str | None, + *, + extra_instructions: str | None = None, +) -> str: + """Build voice-style instructions from the active persona prompt files.""" + resolved = resolve_persona_name(workspace, persona) or DEFAULT_PERSONA + persona_dir = None if resolved == DEFAULT_PERSONA else personas_root(workspace) / resolved + guidance_parts: list[str] = [] + + for filename in ("SOUL.md", "USER.md"): + file_path = workspace / filename + if persona_dir: + persona_file = persona_dir / filename + if persona_file.exists(): + file_path = persona_file + if not file_path.exists(): + continue + try: + raw = file_path.read_text(encoding="utf-8") + except OSError as exc: + logger.warning("Failed to read persona voice source {}: {}", file_path, exc) + continue + clean = _VOICE_WHITESPACE_RE.sub(" ", _VOICE_MARKDOWN_RE.sub(" ", raw)).strip() + if clean: + guidance_parts.append(clean) + + guidance = " ".join(guidance_parts).strip() + if len(guidance) > _VOICE_MAX_GUIDANCE_CHARS: + guidance = guidance[:_VOICE_MAX_GUIDANCE_CHARS].rstrip() + + segments = [ + f"Speak as the active persona 
'{resolved}'. Match that persona's tone, attitude, pacing, and emotional style while keeping the reply natural and conversational.", + ] + if extra_instructions: + segments.append(extra_instructions.strip()) + if guidance: + segments.append(f"Persona guidance: {guidance}") + return " ".join(segment for segment in segments if segment) diff --git a/nanobot/channels/qq.py b/nanobot/channels/qq.py index 29c42c7..7f0742c 100644 --- a/nanobot/channels/qq.py +++ b/nanobot/channels/qq.py @@ -5,6 +5,7 @@ import base64 from collections import deque from pathlib import Path from typing import TYPE_CHECKING +from urllib.parse import urlparse from loguru import logger @@ -13,7 +14,7 @@ from nanobot.bus.queue import MessageBus from nanobot.channels.base import BaseChannel from nanobot.config.schema import QQConfig, QQInstanceConfig from nanobot.security.network import validate_url_target -from nanobot.utils.delivery import resolve_delivery_media +from nanobot.utils.delivery import delivery_artifacts_root, is_image_file try: import botpy @@ -97,17 +98,50 @@ class QQChannel(BaseChannel): """Return the active workspace root used by QQ publishing.""" return (self._workspace or Path.cwd()).resolve(strict=False) - async def _publish_local_media( + def _resolve_local_media( self, media_path: str, - ) -> tuple[Path | None, str | None, str | None]: - """Resolve a local delivery artifact and optionally map it to its served URL.""" - local_path, media_url, error = resolve_delivery_media( - media_path, - self._workspace_root(), - self.config.media_base_url, - ) - return local_path, media_url, error + ) -> tuple[Path | None, int | None, str | None]: + """Resolve a local delivery artifact and infer the QQ rich-media file type.""" + source = Path(media_path).expanduser() + try: + resolved = source.resolve(strict=True) + except FileNotFoundError: + return None, None, "local file not found" + except OSError as e: + logger.warning("Failed to resolve local QQ media path {}: {}", media_path, e) + 
return None, None, "local file unavailable" + + if not resolved.is_file(): + return None, None, "local file not found" + + artifacts_root = delivery_artifacts_root(self._workspace_root()) + try: + resolved.relative_to(artifacts_root) + except ValueError: + return None, None, f"local delivery media must stay under {artifacts_root}" + + suffix = resolved.suffix.lower() + if is_image_file(resolved): + return resolved, 1, None + if suffix == ".mp4": + return resolved, 2, None + if suffix == ".silk": + return resolved, 3, None + return None, None, "local delivery media must be an image, .mp4 video, or .silk voice" + + @staticmethod + def _remote_media_file_type(media_url: str) -> int | None: + """Infer a QQ rich-media file type from a remote URL.""" + path = urlparse(media_url).path.lower() + if path.endswith(".mp4"): + return 2 + if path.endswith(".silk"): + return 3 + image_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp") + if path.endswith(image_exts): + return 1 + return None def _next_msg_seq(self) -> int: """Return the next QQ message sequence number.""" @@ -136,15 +170,16 @@ class QQChannel(BaseChannel): self, chat_id: str, msg_type: str, + file_type: int, media_url: str, content: str | None, msg_id: str | None, ) -> None: - """Send one QQ remote image URL as a rich-media message.""" + """Send one QQ remote rich-media URL as a rich-media message.""" if msg_type == "group": media = await self._client.api.post_group_file( group_openid=chat_id, - file_type=1, + file_type=file_type, url=media_url, srv_send_msg=False, ) @@ -159,7 +194,7 @@ class QQChannel(BaseChannel): else: media = await self._client.api.post_c2c_file( openid=chat_id, - file_type=1, + file_type=file_type, url=media_url, srv_send_msg=False, ) @@ -176,22 +211,20 @@ class QQChannel(BaseChannel): self, chat_id: str, msg_type: str, - media_url: str | None, + file_type: int, local_path: Path, content: str | None, msg_id: str | None, ) -> None: - """Upload a local QQ image using file_data and, when 
available, a public URL.""" + """Upload a local QQ rich-media file using file_data.""" if not self._client or Route is None: raise RuntimeError("QQ client not initialized") payload = { - "file_type": 1, + "file_type": file_type, "file_data": self._encode_file_data(local_path), "srv_send_msg": False, } - if media_url: - payload["url"] = media_url if msg_type == "group": route = Route("POST", "/v2/groups/{group_openid}/files", group_openid=chat_id) media = await self._client.api._http.request(route, json=payload) @@ -265,15 +298,13 @@ class QQChannel(BaseChannel): fallback_lines: list[str] = [] for media_path in msg.media: - resolved_media = media_path local_media_path: Path | None = None + local_file_type: int | None = None if not self._is_remote_media(media_path): - local_media_path, resolved_media, publish_error = await self._publish_local_media( - media_path - ) + local_media_path, local_file_type, publish_error = self._resolve_local_media(media_path) if local_media_path is None: logger.warning( - "QQ outbound local media could not be published: {} ({})", + "QQ outbound local media could not be uploaded directly: {} ({})", media_path, publish_error, ) @@ -281,65 +312,51 @@ class QQChannel(BaseChannel): self._failed_media_notice(media_path, publish_error) ) continue - - if resolved_media: - ok, error = validate_url_target(resolved_media) + else: + ok, error = validate_url_target(media_path) if not ok: logger.warning("QQ outbound media blocked by URL validation: {}", error) fallback_lines.append(self._failed_media_notice(media_path, error)) continue + remote_file_type = self._remote_media_file_type(media_path) + if remote_file_type is None: + fallback_lines.append( + self._failed_media_notice( + media_path, + "remote QQ media must be an image URL, .mp4 video, or .silk voice", + ) + ) + continue try: if local_media_path is not None: - try: - await self._post_local_media_message( - msg.chat_id, - msg_type, - resolved_media, - local_media_path.resolve(strict=True), - 
msg.content if msg.content and not content_sent else None, - msg_id, - ) - except Exception as local_upload_error: - if resolved_media: - logger.warning( - "QQ local file_data upload failed for {}: {}, falling back to URL-only upload", - local_media_path, - local_upload_error, - ) - await self._post_remote_media_message( - msg.chat_id, - msg_type, - resolved_media, - msg.content if msg.content and not content_sent else None, - msg_id, - ) - else: - logger.warning( - "QQ local file_data upload failed for {} without mediaBaseUrl fallback: {}", - local_media_path, - local_upload_error, - ) - fallback_lines.append( - self._failed_media_notice( - media_path, - "QQ local file_data upload failed", - ) - ) - continue + await self._post_local_media_message( + msg.chat_id, + msg_type, + local_file_type or 1, + local_media_path.resolve(strict=True), + msg.content if msg.content and not content_sent else None, + msg_id, + ) else: await self._post_remote_media_message( msg.chat_id, msg_type, - resolved_media, + remote_file_type, + media_path, msg.content if msg.content and not content_sent else None, msg_id, ) if msg.content and not content_sent: content_sent = True except Exception as media_error: - logger.error("Error sending QQ media {}: {}", resolved_media, media_error) - fallback_lines.append(self._failed_media_notice(media_path)) + logger.error("Error sending QQ media {}: {}", media_path, media_error) + if local_media_path is not None: + fallback_lines.append( + self._failed_media_notice(media_path, "QQ local file_data upload failed") + ) + else: + fallback_lines.append(self._failed_media_notice(media_path)) text_parts: list[str] = [] if msg.content and not content_sent: diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py index 5f0802e..884f93d 100644 --- a/nanobot/config/schema.py +++ b/nanobot/config/schema.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any, Literal -from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, 
field_validator +from pydantic import AliasChoices, BaseModel, ConfigDict, Field, ValidationInfo, field_validator from pydantic.alias_generators import to_camel from pydantic_settings import BaseSettings @@ -13,6 +13,7 @@ class Base(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + class WhatsAppConfig(Base): """WhatsApp channel configuration.""" @@ -356,6 +357,20 @@ class WecomMultiConfig(Base): instances: list[WecomInstanceConfig] = Field(default_factory=list) +class VoiceReplyConfig(Base): + """Optional text-to-speech replies for supported outbound channels.""" + + enabled: bool = False + channels: list[str] = Field(default_factory=lambda: ["telegram"]) + model: str = "gpt-4o-mini-tts" + voice: str = "alloy" + instructions: str = "" + speed: float | None = None + response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm", "silk"] = "opus" + api_key: str = "" + api_base: str = Field(default="", validation_alias=AliasChoices("apiBase", "url")) + + def _coerce_multi_channel_config( value: Any, single_cls: type[BaseModel], @@ -369,11 +384,14 @@ def _coerce_multi_channel_config( if isinstance(value, dict) and "instances" in value: return multi_cls.model_validate(value) return single_cls.model_validate(value) + + class ChannelsConfig(Base): """Configuration for chat channels.""" send_progress: bool = True # stream agent's text progress to the channel send_tool_hints: bool = False # stream tool-call hints (e.g. 
read_file("…")) + voice_reply: VoiceReplyConfig = Field(default_factory=VoiceReplyConfig) whatsapp: WhatsAppConfig | WhatsAppMultiConfig = Field(default_factory=WhatsAppConfig) telegram: TelegramConfig | TelegramMultiConfig = Field(default_factory=TelegramConfig) discord: DiscordConfig | DiscordMultiConfig = Field(default_factory=DiscordConfig) @@ -515,6 +533,7 @@ class ExecToolConfig(Base): timeout: int = 60 path_append: str = "" + class MCPServerConfig(Base): """MCP server connection configuration (stdio or HTTP).""" diff --git a/nanobot/providers/speech.py b/nanobot/providers/speech.py new file mode 100644 index 0000000..a000d95 --- /dev/null +++ b/nanobot/providers/speech.py @@ -0,0 +1,88 @@ +"""OpenAI-compatible text-to-speech provider.""" + +from __future__ import annotations + +from pathlib import Path + +import httpx + + +class OpenAISpeechProvider: + """Minimal OpenAI-compatible TTS client.""" + + _NO_INSTRUCTIONS_MODELS = {"tts-1", "tts-1-hd"} + + def __init__(self, api_key: str, api_base: str = "https://api.openai.com/v1"): + self.api_key = api_key + self.api_base = api_base.rstrip("/") + + def _speech_url(self) -> str: + """Return the final speech endpoint URL from a base URL or direct endpoint URL.""" + if self.api_base.endswith("/audio/speech"): + return self.api_base + return f"{self.api_base}/audio/speech" + + @classmethod + def _supports_instructions(cls, model: str) -> bool: + """Return True when the target TTS model accepts style instructions.""" + return model not in cls._NO_INSTRUCTIONS_MODELS + + async def synthesize( + self, + text: str, + *, + model: str, + voice: str, + instructions: str | None = None, + speed: float | None = None, + response_format: str, + ) -> bytes: + """Synthesize text into audio bytes.""" + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + payload = { + "model": model, + "voice": voice, + "input": text, + "response_format": response_format, + } + if instructions and 
self._supports_instructions(model): + payload["instructions"] = instructions + if speed is not None: + payload["speed"] = speed + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + self._speech_url(), + headers=headers, + json=payload, + ) + response.raise_for_status() + return response.content + + async def synthesize_to_file( + self, + text: str, + *, + model: str, + voice: str, + instructions: str | None = None, + speed: float | None = None, + response_format: str, + output_path: str | Path, + ) -> Path: + """Synthesize text and write the audio payload to disk.""" + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes( + await self.synthesize( + text, + model=model, + voice=voice, + instructions=instructions, + speed=speed, + response_format=response_format, + ) + ) + return path diff --git a/tests/test_qq_channel.py b/tests/test_qq_channel.py index 8da68c1..8eefdd8 100644 --- a/tests/test_qq_channel.py +++ b/tests/test_qq_channel.py @@ -276,7 +276,6 @@ async def test_send_local_media_under_out_dir_uses_c2c_file_api( "params": {"openid": "user123"}, "json": { "file_type": 1, - "url": "https://files.example.com/out/demo.png", "file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"), "srv_send_msg": False, }, @@ -338,7 +337,6 @@ async def test_send_local_media_in_nested_out_path_uses_relative_url( "params": {"openid": "user123"}, "json": { "file_type": 1, - "url": "https://files.example.com/qq-media/shots/github.png", "file_data": b64encode(b"\x89PNG\r\n\x1a\nfake-png").decode("ascii"), "srv_send_msg": False, }, @@ -408,8 +406,7 @@ async def test_send_local_media_outside_out_falls_back_to_text_notice( @pytest.mark.asyncio -async def test_send_local_media_falls_back_to_url_only_upload_when_file_data_upload_fails( - monkeypatch, +async def test_send_local_media_with_media_base_url_still_falls_back_to_text_notice_when_file_data_upload_fails( tmp_path, ) -> None: workspace = 
@pytest.mark.asyncio
async def test_send_local_silk_voice_uses_file_type_three_direct_upload(tmp_path) -> None:
    """A local .silk voice under workspace/out is uploaded directly with file_type 3."""
    workspace_dir = tmp_path / "workspace"
    delivery_dir = workspace_dir / "out"
    delivery_dir.mkdir(parents=True)
    silk_file = delivery_dir / "reply.silk"
    silk_file.write_bytes(b"fake-silk")

    qq_channel = QQChannel(
        QQConfig(app_id="app", secret="secret", allow_from=["*"]),
        MessageBus(),
        workspace=workspace_dir,
    )
    qq_channel._client = _FakeClient()

    outbound = OutboundMessage(
        channel="qq",
        chat_id="user123",
        content="hello",
        media=[str(silk_file)],
        metadata={"message_id": "msg1"},
    )
    await qq_channel.send(outbound)

    api = qq_channel._client.api
    expected_upload = {
        "method": "POST",
        "path": "/v2/users/{openid}/files",
        "params": {"openid": "user123"},
        "json": {
            "file_type": 3,
            "file_data": b64encode(b"fake-silk").decode("ascii"),
            "srv_send_msg": False,
        },
    }
    assert api.raw_file_upload_calls == [expected_upload]

    expected_send = {
        "openid": "user123",
        "msg_type": 7,
        "content": "hello",
        "media": {"file_info": "c2c-file-info", "file_uuid": "c2c-file", "ttl": 60},
        "msg_id": "msg1",
        "msg_seq": 2,
    }
    assert api.c2c_calls == [expected_send]
@pytest.mark.asyncio
async def test_telegram_voice_reply_attaches_audio_for_multi_instance_route(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Voice replies on telegram/<instance> routes attach persona-styled audio."""
    (tmp_path / "SOUL.md").write_text("default soul voice", encoding="utf-8")
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={
            "voiceReply": {
                "enabled": True,
                "channels": ["telegram"],
                "instructions": "keep the delivery warm",
                "speed": 1.05,
                "responseFormat": "opus",
            }
        },
    )
    provider.api_key = "provider-tts-key"
    provider.api_base = "https://provider.example.com/v1"

    seen: dict[str, str | float | None] = {}

    async def record_synthesis(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        target = Path(output_path)
        target.write_bytes(b"voice-bytes")
        seen.update(
            api_key=self.api_key,
            api_base=self.api_base,
            text=text,
            model=model,
            voice=voice,
            instructions=instructions,
            speed=speed,
            response_format=response_format,
        )
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", record_synthesis)

    inbound = InboundMessage(
        channel="telegram/main",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1

    audio_path = Path(response.media[0])
    assert audio_path.parent == tmp_path / "out" / "voice"
    assert audio_path.suffix == ".ogg"
    assert audio_path.read_bytes() == b"voice-bytes"

    assert seen == {
        "api_key": "provider-tts-key",
        "api_base": "https://provider.example.com/v1",
        "text": "hello",
        "model": "gpt-4o-mini-tts",
        "voice": "alloy",
        "instructions": (
            "Speak as the active persona 'default'. Match that persona's tone, attitude, pacing, "
            "and emotional style while keeping the reply natural and conversational. keep the "
            "delivery warm Persona guidance: default soul voice"
        ),
        "speed": 1.05,
        "response_format": "opus",
    }
@pytest.mark.asyncio
async def test_qq_voice_reply_config_keeps_text_only(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Without a silk-capable TTS format, QQ replies stay plain text."""
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={
            "voiceReply": {
                "enabled": True,
                "channels": ["qq"],
                "apiKey": "tts-key",
            }
        },
    )
    provider.api_key = "provider-tts-key"

    synthesize = AsyncMock()
    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", synthesize)

    inbound = InboundMessage(
        channel="qq",
        sender_id="user-1",
        chat_id="chat-1",
        content="hello",
    )
    response = await loop._process_message(inbound)

    assert response is not None
    assert response.content == "hello"
    assert response.media == []
    synthesize.assert_not_awaited()


@pytest.mark.asyncio
async def test_qq_voice_reply_uses_silk_when_configured(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """QQ voice replies are attached when the TTS endpoint can return silk."""
    loop, provider = _make_loop(
        tmp_path,
        channels_payload={
            "voiceReply": {
                "enabled": True,
                "channels": ["qq"],
                "apiKey": "tts-key",
                "responseFormat": "silk",
            }
        },
    )
    provider.api_key = "provider-tts-key"

    seen: dict[str, str | None] = {}

    async def record_synthesis(
        self,
        text: str,
        *,
        model: str,
        voice: str,
        instructions: str | None,
        speed: float | None,
        response_format: str,
        output_path: str | Path,
    ) -> Path:
        target = Path(output_path)
        target.write_bytes(b"fake-silk")
        seen["response_format"] = response_format
        return target

    monkeypatch.setattr(OpenAISpeechProvider, "synthesize_to_file", record_synthesis)

    response = await loop._process_message(
        InboundMessage(
            channel="qq",
            sender_id="user-1",
            chat_id="chat-1",
            content="hello",
        )
    )

    assert response is not None
    assert response.content == "hello"
    assert len(response.media) == 1
    assert Path(response.media[0]).suffix == ".silk"
    assert seen["response_format"] == "silk"