Merge branch 'main' into feat/discord-support

2026-02-03 21:15:15 +05:30
parent bab464df5f d9d744d536
commit 7d2bebcfa3
15 changed files with 342 additions and 41 deletions
--- a/nanobot/agent/context.py
+++ b/nanobot/agent/context.py
@@ -1,5 +1,7 @@
 """Context builder for assembling agent prompts."""

+import base64
+import mimetypes
 from pathlib import Path
 from typing import Any

@@ -114,32 +116,53 @@ When remembering something, write to {workspace_path}/memory/MEMORY.md"""
        self,
        history: list[dict[str, Any]],
        current_message: str,
-        skill_names: list[str] | None = None
+        skill_names: list[str] | None = None,
+        media: list[str] | None = None,
    ) -> list[dict[str, Any]]:
        """
        Build the complete message list for an LLM call.
-        
+
        Args:
            history: Previous conversation messages.
            current_message: The new user message.
            skill_names: Optional skills to include.
-        
+            media: Optional list of local file paths for images/media.
+
        Returns:
            List of messages including system prompt.
        """
        messages = []
-        
+
        # System prompt
        system_prompt = self.build_system_prompt(skill_names)
        messages.append({"role": "system", "content": system_prompt})
-        
+
        # History
        messages.extend(history)
-        
-        # Current message
-        messages.append({"role": "user", "content": current_message})
-        
+
+        # Current message (with optional image attachments)
+        user_content = self._build_user_content(current_message, media)
+        messages.append({"role": "user", "content": user_content})
+
        return messages
+
+    def _build_user_content(self, text: str, media: list[str] | None) -> str | list[dict[str, Any]]:
+        """Return `text` alone, or base64 image parts followed by a text part when `media` has readable images."""
+        images = []
+        for path in media or []:
+            mime, _ = mimetypes.guess_type(path)
+            if not mime or not mime.startswith("image/"):
+                continue  # Only image attachments are inlined into the prompt.
+            try:
+                raw = Path(path).read_bytes()
+            except OSError:
+                continue  # Missing, directory, or unreadable: skip it instead of aborting the prompt.
+            b64 = base64.b64encode(raw).decode()
+            images.append({"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}})
+
+        if not images:
+            return text
+        return images + [{"type": "text", "text": text}]
    
    def add_tool_result(
        self,
--- a/nanobot/agent/loop.py
+++ b/nanobot/agent/loop.py
@@ -152,7 +152,8 @@ class AgentLoop:
        # Build initial messages (use get_history for LLM-formatted messages)
        messages = self.context.build_messages(
            history=session.get_history(),
-            current_message=msg.content
+            current_message=msg.content,
+            media=msg.media if msg.media else None,
        )
        
        # Agent loop
--- a/nanobot/channels/manager.py
+++ b/nanobot/channels/manager.py
@@ -37,7 +37,9 @@ class ChannelManager:
            try:
                from nanobot.channels.telegram import TelegramChannel
                self.channels["telegram"] = TelegramChannel(
-                    self.config.channels.telegram, self.bus
+                    self.config.channels.telegram,
+                    self.bus,
+                    groq_api_key=self.config.providers.groq.api_key,
                )
                logger.info("Telegram channel enabled")
            except ImportError as e:
--- a/nanobot/channels/telegram.py
+++ b/nanobot/channels/telegram.py
@@ -85,9 +85,10 @@ class TelegramChannel(BaseChannel):
    
    name = "telegram"
    
-    def __init__(self, config: TelegramConfig, bus: MessageBus):
+    def __init__(self, config: TelegramConfig, bus: MessageBus, groq_api_key: str = ""):
        super().__init__(config, bus)
        self.config: TelegramConfig = config
+        self.groq_api_key = groq_api_key
        self._app: Application | None = None
        self._chat_ids: dict[str, int] = {}  # Map sender_id to chat_id for replies
    
@@ -249,7 +250,20 @@ class TelegramChannel(BaseChannel):
                await file.download_to_drive(str(file_path))
                
                media_paths.append(str(file_path))
-                content_parts.append(f"[{media_type}: {file_path}]")
+                
+                # Handle voice transcription
+                if media_type == "voice" or media_type == "audio":
+                    from nanobot.providers.transcription import GroqTranscriptionProvider
+                    transcriber = GroqTranscriptionProvider(api_key=self.groq_api_key)
+                    transcription = await transcriber.transcribe(file_path)
+                    if transcription:
+                        logger.info(f"Transcribed {media_type}: {transcription[:50]}...")
+                        content_parts.append(f"[transcription: {transcription}]")
+                    else:
+                        content_parts.append(f"[{media_type}: {file_path}]")
+                else:
+                    content_parts.append(f"[{media_type}: {file_path}]")
+                    
                logger.debug(f"Downloaded {media_type} to {file_path}")
            except Exception as e:
                logger.error(f"Failed to download media: {e}")
--- a/nanobot/channels/whatsapp.py
+++ b/nanobot/channels/whatsapp.py
@@ -107,6 +107,11 @@ class WhatsAppChannel(BaseChannel):
            # Extract just the phone number as chat_id
            chat_id = sender.split("@")[0] if "@" in sender else sender
            
+            # Handle voice transcription if it's a voice message
+            if content == "[Voice Message]":
+                logger.info(f"Voice message received from {chat_id}, but direct download from bridge is not yet supported.")
+                content = "[Voice Message: Transcription not available for WhatsApp yet]"
+            
            await self._handle_message(
                sender_id=chat_id,
                chat_id=sender,  # Use full JID for replies
--- a/nanobot/cli/commands.py
+++ b/nanobot/cli/commands.py
@@ -178,11 +178,13 @@ def gateway(
    # Create components
    bus = MessageBus()
    
-    # Create provider (supports OpenRouter, Anthropic, OpenAI)
+    # Create provider (supports OpenRouter, Anthropic, OpenAI, Bedrock)
    api_key = config.get_api_key()
    api_base = config.get_api_base()
-    
-    if not api_key:
+    model = config.agents.defaults.model
+    is_bedrock = model.startswith("bedrock/")
+
+    if not api_key and not is_bedrock:
        console.print("[red]Error: No API key configured.[/red]")
        console.print("Set one in ~/.nanobot/config.json under providers.openrouter.apiKey")
        raise typer.Exit(1)
@@ -289,11 +291,13 @@ def agent(
    
    api_key = config.get_api_key()
    api_base = config.get_api_base()
-    
-    if not api_key:
+    model = config.agents.defaults.model
+    is_bedrock = model.startswith("bedrock/")
+
+    if not api_key and not is_bedrock:
        console.print("[red]Error: No API key configured.[/red]")
        raise typer.Exit(1)
-    
+
    bus = MessageBus()
    provider = LiteLLMProvider(
        api_key=api_key,
@@ -348,14 +352,15 @@ app.add_typer(channels_app, name="channels")
 def channels_status():
    """Show channel status."""
    from nanobot.config.loader import load_config
-    
+
    config = load_config()
-    
+
    table = Table(title="Channel Status")
    table.add_column("Channel", style="cyan")
    table.add_column("Enabled", style="green")
-    table.add_column("Bridge URL", style="yellow")
-    
+    table.add_column("Configuration", style="yellow")
+
+    # WhatsApp
    wa = config.channels.whatsapp
    table.add_row(
        "WhatsApp",
@@ -363,13 +368,6 @@ def channels_status():
        wa.bridge_url
    )

-    tg = config.channels.telegram
-    table.add_row(
-        "Telegram",
-        "✓" if tg.enabled else "✗",
-        "polling"
-    )
-
    dc = config.channels.discord
    table.add_row(
        "Discord",
@@ -377,6 +375,15 @@ def channels_status():
        dc.gateway_url
    )
    
+    # Telegram
+    tg = config.channels.telegram
+    tg_config = f"token: {tg.token[:10]}..." if tg.token else "[dim]not configured[/dim]"
+    table.add_row(
+        "Telegram",
+        "✓" if tg.enabled else "✗",
+        tg_config
+    )
+
    console.print(table)


@@ -520,6 +527,7 @@ def cron_add(
    at: str = typer.Option(None, "--at", help="Run once at time (ISO format)"),
    deliver: bool = typer.Option(False, "--deliver", "-d", help="Deliver response to channel"),
    to: str = typer.Option(None, "--to", help="Recipient for delivery"),
+    channel: str = typer.Option(None, "--channel", help="Channel for delivery (e.g. 'telegram', 'whatsapp')"),
 ):
    """Add a scheduled job."""
    from nanobot.config.loader import get_data_dir
@@ -548,6 +556,7 @@ def cron_add(
        message=message,
        deliver=deliver,
        to=to,
+        channel=channel,
    )
    
    console.print(f"[green]✓[/green] Added job '{job.name}' ({job.id})")
--- a/nanobot/config/schema.py
+++ b/nanobot/config/schema.py
@@ -60,6 +60,7 @@ class ProvidersConfig(BaseModel):
    anthropic: ProviderConfig = Field(default_factory=ProviderConfig)
    openai: ProviderConfig = Field(default_factory=ProviderConfig)
    openrouter: ProviderConfig = Field(default_factory=ProviderConfig)
+    groq: ProviderConfig = Field(default_factory=ProviderConfig)
    zhipu: ProviderConfig = Field(default_factory=ProviderConfig)
    vllm: ProviderConfig = Field(default_factory=ProviderConfig)
    gemini: ProviderConfig = Field(default_factory=ProviderConfig)
@@ -101,14 +102,14 @@ class Config(BaseSettings):
        return Path(self.agents.defaults.workspace).expanduser()
    
    def get_api_key(self) -> str | None:
-        """Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > vLLM."""
-        """Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > vLLM."""
+        """Get API key in priority order: OpenRouter > Anthropic > OpenAI > Gemini > Zhipu > Groq > vLLM."""
        return (
            self.providers.openrouter.api_key or
            self.providers.anthropic.api_key or
            self.providers.openai.api_key or
            self.providers.gemini.api_key or
            self.providers.zhipu.api_key or
+            self.providers.groq.api_key or
            self.providers.vllm.api_key or
            None
        )
--- a/nanobot/providers/litellm_provider.py
+++ b/nanobot/providers/litellm_provider.py
@@ -51,6 +51,8 @@ class LiteLLMProvider(LLMProvider):
                os.environ.setdefault("GEMINI_API_KEY", api_key)
            elif "zhipu" in default_model or "glm" in default_model or "zai" in default_model:
                os.environ.setdefault("ZHIPUAI_API_KEY", api_key)
+            elif "groq" in default_model:
+                os.environ.setdefault("GROQ_API_KEY", api_key)
        
        if api_base:
            litellm.api_base = api_base
--- a/nanobot/providers/transcription.py
+++ b/nanobot/providers/transcription.py
@@ -0,0 +1,65 @@
+"""Voice transcription provider using Groq."""
+
+import os
+from pathlib import Path
+from typing import Any
+
+import httpx
+from loguru import logger
+
+
+class GroqTranscriptionProvider:
+    """
+    Voice transcription provider using Groq's Whisper API.
+
+    Groq offers extremely fast transcription with a generous free tier.
+    Failures are logged and reported to the caller as an empty string.
+    """
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str = "whisper-large-v3",
+        timeout: float = 60.0,
+    ):
+        # An explicit key wins; otherwise fall back to the GROQ_API_KEY env var.
+        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
+        self.api_url = "https://api.groq.com/openai/v1/audio/transcriptions"
+        self.model = model
+        self.timeout = timeout
+
+    async def transcribe(self, file_path: str | Path) -> str:
+        """
+        Transcribe an audio file using Groq.
+
+        Args:
+            file_path: Path to the audio file.
+
+        Returns:
+            Transcribed text, or "" if the key/file is missing or the request fails.
+        """
+        if not self.api_key:
+            logger.warning("Groq API key not configured for transcription")
+            return ""
+
+        path = Path(file_path)
+        if not path.is_file():
+            logger.error(f"Audio file not found: {file_path}")
+            return ""
+
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        try:
+            async with httpx.AsyncClient() as client:
+                with open(path, "rb") as f:
+                    # Non-file form fields ride along as (None, value) parts.
+                    files = {"file": (path.name, f), "model": (None, self.model)}
+                    response = await client.post(
+                        self.api_url, headers=headers, files=files, timeout=self.timeout
+                    )
+            response.raise_for_status()
+            # A missing or null "text" field is normalized to "".
+            return response.json().get("text") or ""
+        except Exception as e:
+            # Best-effort boundary: log and signal failure with "" instead of raising.
+            logger.error(f"Groq transcription error: {e}")
+            return ""