Merge branch 'main' into feat-volcengine-tuning

2026-03-12 14:56:05 +08:00
parent b3710165c0 c38579dc22
commit 8865b6848c
77 changed files with 6996 additions and 1260 deletions
--- a/nanobot/providers/__init__.py
+++ b/nanobot/providers/__init__.py
@@ -3,5 +3,6 @@
 from nanobot.providers.base import LLMProvider, LLMResponse
 from nanobot.providers.litellm_provider import LiteLLMProvider
 from nanobot.providers.openai_codex_provider import OpenAICodexProvider
+from nanobot.providers.azure_openai_provider import AzureOpenAIProvider

-__all__ = ["LLMProvider", "LLMResponse", "LiteLLMProvider", "OpenAICodexProvider"]
+__all__ = ["LLMProvider", "LLMResponse", "LiteLLMProvider", "OpenAICodexProvider", "AzureOpenAIProvider"]
--- a/nanobot/providers/azure_openai_provider.py
+++ b/nanobot/providers/azure_openai_provider.py
@@ -0,0 +1,213 @@
+"""Azure OpenAI provider implementation with API version 2024-10-21."""
+
+from __future__ import annotations
+
+import uuid
+from typing import Any
+from urllib.parse import quote, urljoin
+
+import httpx
+import json_repair
+
+from nanobot.providers.base import LLMProvider, LLMResponse, ToolCallRequest
+
+# Message keys forwarded to Azure. _prepare_request_payload passes this set to
+# _sanitize_request_messages, which drops every other key from each message.
+_AZURE_MSG_KEYS = frozenset({"role", "content", "tool_calls", "tool_call_id", "name"})
+
+
+class AzureOpenAIProvider(LLMProvider):
+    """
+    Azure OpenAI provider pinned to REST API version 2024-10-21.
+
+    Features:
+    - Hardcoded API version 2024-10-21
+    - Uses the model field as the Azure deployment name in the URL path
+    - Uses the api-key header instead of Authorization Bearer
+    - Sends max_completion_tokens instead of max_tokens
+    - Direct HTTP calls through httpx, bypasses LiteLLM
+    """
+
+    def __init__(
+        self,
+        api_key: str = "",
+        api_base: str = "",
+        default_model: str = "gpt-5.2-chat",
+    ):
+        """
+        Create a provider bound to one Azure OpenAI endpoint.
+
+        Args:
+            api_key: Key sent in the ``api-key`` request header.
+            api_base: Base URL of the Azure resource; a trailing "/" is added
+                when missing.
+            default_model: Deployment name used when chat() receives no model.
+
+        Raises:
+            ValueError: If ``api_key`` or ``api_base`` is empty.
+        """
+        super().__init__(api_key, api_base)
+        self.default_model = default_model
+        self.api_version = "2024-10-21"
+
+        # Validate required parameters
+        if not api_key:
+            raise ValueError("Azure OpenAI api_key is required")
+        if not api_base:
+            raise ValueError("Azure OpenAI api_base is required")
+
+        # urljoin() replaces the last path segment of a base that lacks a
+        # trailing slash, so normalize once here.
+        if not api_base.endswith("/"):
+            api_base += "/"
+        self.api_base = api_base
+
+    def _build_chat_url(self, deployment_name: str) -> str:
+        """
+        Build the chat completions URL for ``deployment_name``.
+
+        Format:
+        https://{resource}.openai.azure.com/openai/deployments/{deployment}/chat/completions?api-version={version}
+
+        The deployment name is percent-encoded so characters such as "/",
+        "?", "#" or spaces stay inside their own path segment instead of
+        changing the request target.
+        """
+        base_url = self.api_base
+        # Re-checked here because api_base is a plain attribute that may have
+        # been reassigned after construction.
+        if not base_url.endswith("/"):
+            base_url += "/"
+
+        deployment = quote(deployment_name, safe="")
+        url = urljoin(
+            base_url,
+            f"openai/deployments/{deployment}/chat/completions",
+        )
+        return f"{url}?api-version={self.api_version}"
+
+    def _build_headers(self) -> dict[str, str]:
+        """Build request headers; Azure authenticates via ``api-key``."""
+        return {
+            "Content-Type": "application/json",
+            "api-key": self.api_key,  # Azure OpenAI uses api-key header, not Authorization
+            # NOTE(review): a fresh random value is generated on every call,
+            # so consecutive requests never share an affinity id -- confirm
+            # this is the intended behaviour for cache locality.
+            "x-session-affinity": uuid.uuid4().hex,
+        }
+
+    @staticmethod
+    def _supports_temperature(
+        deployment_name: str,
+        reasoning_effort: str | None = None,
+    ) -> bool:
+        """
+        Return True when temperature is likely supported for this deployment.
+
+        This is a name-based heuristic: temperature is withheld whenever a
+        reasoning effort is requested or the lower-cased deployment name
+        contains "gpt-5", "o1", "o3" or "o4".
+        """
+        if reasoning_effort:
+            return False
+        name = deployment_name.lower()
+        return not any(token in name for token in ("gpt-5", "o1", "o3", "o4"))
+
+    def _prepare_request_payload(
+        self,
+        deployment_name: str,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]] | None = None,
+        max_tokens: int = 4096,
+        temperature: float = 0.7,
+        reasoning_effort: str | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """
+        Build the JSON body for a chat completions request.
+
+        Messages are sanitized and reduced to the keys in _AZURE_MSG_KEYS.
+        ``temperature`` is included only when _supports_temperature() allows
+        it, and ``tool_choice`` (default "auto") only when tools are given.
+        """
+        payload: dict[str, Any] = {
+            "messages": self._sanitize_request_messages(
+                self._sanitize_empty_content(messages),
+                _AZURE_MSG_KEYS,
+            ),
+            # Azure API 2024-10-21 uses max_completion_tokens; clamp to >= 1.
+            "max_completion_tokens": max(1, max_tokens),
+        }
+
+        if self._supports_temperature(deployment_name, reasoning_effort):
+            payload["temperature"] = temperature
+
+        if reasoning_effort:
+            payload["reasoning_effort"] = reasoning_effort
+
+        if tools:
+            payload["tools"] = tools
+            payload["tool_choice"] = tool_choice or "auto"
+
+        return payload
+
+    async def chat(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]] | None = None,
+        model: str | None = None,
+        max_tokens: int = 4096,
+        temperature: float = 0.7,
+        reasoning_effort: str | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+    ) -> LLMResponse:
+        """
+        Send a chat completion request to Azure OpenAI.
+
+        Failures never propagate as exceptions: non-200 responses and any
+        exception raised while sending or decoding are returned as an
+        LLMResponse with ``finish_reason="error"``.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'.
+            tools: Optional list of tool definitions in OpenAI format.
+            model: Model identifier (used as deployment name); falls back to
+                the provider's default model.
+            max_tokens: Maximum tokens in response (mapped to max_completion_tokens).
+            temperature: Sampling temperature.
+            reasoning_effort: Optional reasoning effort parameter.
+            tool_choice: Tool selection strategy; "auto" is used when tools
+                are given and this is not set.
+
+        Returns:
+            LLMResponse with content and/or tool calls.
+        """
+        deployment_name = model or self.default_model
+        url = self._build_chat_url(deployment_name)
+        headers = self._build_headers()
+        payload = self._prepare_request_payload(
+            deployment_name, messages, tools, max_tokens, temperature, reasoning_effort,
+            tool_choice=tool_choice,
+        )
+
+        try:
+            async with httpx.AsyncClient(timeout=60.0, verify=True) as client:
+                response = await client.post(url, headers=headers, json=payload)
+                if response.status_code != 200:
+                    # The status code is kept in the text so callers can
+                    # inspect it (e.g. to spot 429 / 5xx responses).
+                    return LLMResponse(
+                        content=f"Azure OpenAI API Error {response.status_code}: {response.text}",
+                        finish_reason="error",
+                    )
+
+                response_data = response.json()
+                return self._parse_response(response_data)
+
+        except Exception as e:
+            return LLMResponse(
+                content=f"Error calling Azure OpenAI: {repr(e)}",
+                finish_reason="error",
+            )
+
+    def _parse_response(self, response: dict[str, Any]) -> LLMResponse:
+        """
+        Parse an Azure OpenAI response body into the standard LLMResponse.
+
+        Only the first choice is read. A body that is missing expected keys
+        or carries null/ill-typed values where objects are expected yields an
+        LLMResponse with ``finish_reason="error"`` instead of raising.
+        """
+        try:
+            choice = response["choices"][0]
+            message = choice["message"]
+
+            tool_calls = []
+            for tc in message.get("tool_calls") or []:
+                # Parse arguments from JSON string if needed
+                args = tc["function"]["arguments"]
+                if isinstance(args, str):
+                    args = json_repair.loads(args)
+
+                tool_calls.append(
+                    ToolCallRequest(
+                        id=tc["id"],
+                        name=tc["function"]["name"],
+                        arguments=args,
+                    )
+                )
+
+            usage = {}
+            usage_data = response.get("usage")
+            if usage_data:
+                usage = {
+                    "prompt_tokens": usage_data.get("prompt_tokens", 0),
+                    "completion_tokens": usage_data.get("completion_tokens", 0),
+                    "total_tokens": usage_data.get("total_tokens", 0),
+                }
+
+            reasoning_content = message.get("reasoning_content") or None
+
+            return LLMResponse(
+                content=message.get("content"),
+                tool_calls=tool_calls,
+                # "or" also covers an explicit null finish_reason.
+                finish_reason=choice.get("finish_reason") or "stop",
+                usage=usage,
+                reasoning_content=reasoning_content,
+            )
+
+        except (KeyError, IndexError, TypeError, AttributeError) as e:
+            # TypeError / AttributeError cover null or non-dict values such as
+            # "choices": null or "message": null.
+            return LLMResponse(
+                content=f"Error parsing Azure OpenAI response: {str(e)}",
+                finish_reason="error",
+            )
+
+    def get_default_model(self) -> str:
+        """Get the default model (also used as default deployment name)."""
+        return self.default_model
--- a/nanobot/providers/base.py
+++ b/nanobot/providers/base.py
@@ -1,9 +1,13 @@
 """Base LLM provider interface."""

+import asyncio
+import json
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from typing import Any

+from loguru import logger
+

@dataclass
 class ToolCallRequest:
@@ -11,6 +15,24 @@ class ToolCallRequest:
    id: str
    name: str
    arguments: dict[str, Any]
+    provider_specific_fields: dict[str, Any] | None = None
+    function_provider_specific_fields: dict[str, Any] | None = None
+
+    def to_openai_tool_call(self) -> dict[str, Any]:
+        """Return this request as an OpenAI-format ``tool_calls`` entry.
+
+        ``arguments`` is JSON-encoded with non-ASCII characters kept verbatim.
+        Each ``provider_specific_fields`` value is attached only when truthy.
+        """
+        function_part: dict[str, Any] = {
+            "name": self.name,
+            "arguments": json.dumps(self.arguments, ensure_ascii=False),
+        }
+        if self.function_provider_specific_fields:
+            function_part["provider_specific_fields"] = self.function_provider_specific_fields
+
+        payload: dict[str, Any] = {
+            "id": self.id,
+            "type": "function",
+            "function": function_part,
+        }
+        if self.provider_specific_fields:
+            payload["provider_specific_fields"] = self.provider_specific_fields
+        return payload


@dataclass
@@ -29,6 +51,21 @@ class LLMResponse:
        return len(self.tool_calls) > 0


+@dataclass(frozen=True)
+class GenerationSettings:
+    """Immutable bundle of default generation parameters for LLM calls.
+
+    A provider keeps one instance so call sites share the same defaults
+    instead of threading temperature / max_tokens / reasoning_effort
+    through every layer.  Explicit keyword arguments passed to chat() /
+    chat_with_retry() still take precedence.
+    """
+
+    # Sampling temperature.
+    temperature: float = 0.7
+    # Maximum number of tokens in the response.
+    max_tokens: int = 4096
+    # Optional reasoning-effort hint; None means none is configured.
+    reasoning_effort: str | None = None
+
+
 class LLMProvider(ABC):
    """
    Abstract base class for LLM providers.
@@ -37,9 +74,28 @@ class LLMProvider(ABC):
    while maintaining a consistent interface.
    """

+    # Seconds slept between attempts in chat_with_retry(); one retry per entry.
+    _CHAT_RETRY_DELAYS = (1, 2, 4)
+    # Lower-case substrings that mark an error message as transient; matched
+    # against the lower-cased response content by _is_transient_error().
+    _TRANSIENT_ERROR_MARKERS = (
+        "429",
+        "rate limit",
+        "500",
+        "502",
+        "503",
+        "504",
+        "overloaded",
+        "timeout",
+        "timed out",
+        "connection",
+        "server error",
+        "temporarily unavailable",
+    )
+
+    # Default for chat_with_retry() parameters, marking "argument not passed"
+    # so that an explicit None stays distinguishable.
+    _SENTINEL = object()
+
    def __init__(self, api_key: str | None = None, api_base: str | None = None):
        self.api_key = api_key
        self.api_base = api_base
+        self.generation: GenerationSettings = GenerationSettings()

    @staticmethod
    def _sanitize_empty_content(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
@@ -87,6 +143,20 @@ class LLMProvider(ABC):
            result.append(msg)
        return result

+    @staticmethod
+    def _sanitize_request_messages(
+        messages: list[dict[str, Any]],
+        allowed_keys: frozenset[str],
+    ) -> list[dict[str, Any]]:
+        """Return shallow copies of ``messages`` restricted to ``allowed_keys``.
+
+        Assistant messages left without a ``content`` key get ``content=None``.
+        The input dicts themselves are not modified.
+        """
+
+        def _filtered(message: dict[str, Any]) -> dict[str, Any]:
+            kept = {key: message[key] for key in message if key in allowed_keys}
+            if kept.get("role") == "assistant":
+                kept.setdefault("content", None)
+            return kept
+
+        return [_filtered(message) for message in messages]
+
    @abstractmethod
    async def chat(
        self,
@@ -96,6 +166,7 @@ class LLMProvider(ABC):
        max_tokens: int = 4096,
        temperature: float = 0.7,
        reasoning_effort: str | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
    ) -> LLMResponse:
        """
        Send a chat completion request.
@@ -106,12 +177,93 @@ class LLMProvider(ABC):
            model: Model identifier (provider-specific).
            max_tokens: Maximum tokens in response.
            temperature: Sampling temperature.
+            tool_choice: Tool selection strategy ("auto", "required", or specific tool dict).
        
        Returns:
            LLMResponse with content and/or tool calls.
        """
        pass

+    @classmethod
+    def _is_transient_error(cls, content: str | None) -> bool:
+        """Report whether ``content`` contains a transient-error marker (case-insensitive)."""
+        lowered = content.lower() if content else ""
+        for marker in cls._TRANSIENT_ERROR_MARKERS:
+            if marker in lowered:
+                return True
+        return False
+
+    async def chat_with_retry(
+        self,
+        messages: list[dict[str, Any]],
+        tools: list[dict[str, Any]] | None = None,
+        model: str | None = None,
+        max_tokens: object = _SENTINEL,
+        temperature: object = _SENTINEL,
+        reasoning_effort: object = _SENTINEL,
+        tool_choice: str | dict[str, Any] | None = None,
+    ) -> LLMResponse:
+        """Call chat(), retrying while the provider reports a transient error.
+
+        ``max_tokens``, ``temperature`` and ``reasoning_effort`` fall back to
+        ``self.generation`` when they are not passed explicitly.
+
+        chat() is attempted once per entry in ``_CHAT_RETRY_DELAYS`` and then
+        one final time.  Exceptions from chat() (other than cancellation) are
+        converted into an error LLMResponse.  The first response that is not
+        a transient error is returned immediately; the final attempt's result
+        is returned unconditionally.
+        """
+        call_kwargs: dict[str, Any] = {
+            "messages": messages,
+            "tools": tools,
+            "model": model,
+            "max_tokens": (
+                self.generation.max_tokens if max_tokens is self._SENTINEL else max_tokens
+            ),
+            "temperature": (
+                self.generation.temperature if temperature is self._SENTINEL else temperature
+            ),
+            "reasoning_effort": (
+                self.generation.reasoning_effort
+                if reasoning_effort is self._SENTINEL
+                else reasoning_effort
+            ),
+            "tool_choice": tool_choice,
+        }
+
+        async def _attempt() -> LLMResponse:
+            # Single guarded call: cancellation propagates, anything else
+            # becomes an error response.
+            try:
+                return await self.chat(**call_kwargs)
+            except asyncio.CancelledError:
+                raise
+            except Exception as exc:
+                return LLMResponse(
+                    content=f"Error calling LLM: {exc}",
+                    finish_reason="error",
+                )
+
+        for attempt, delay in enumerate(self._CHAT_RETRY_DELAYS, start=1):
+            response = await _attempt()
+            failed = response.finish_reason == "error"
+            if not (failed and self._is_transient_error(response.content)):
+                return response
+
+            err = (response.content or "").lower()
+            logger.warning(
+                "LLM transient error (attempt {}/{}), retrying in {}s: {}",
+                attempt,
+                len(self._CHAT_RETRY_DELAYS),
+                delay,
+                err[:120],
+            )
+            await asyncio.sleep(delay)
+
+        # Retries exhausted: the outcome of the last attempt is returned as-is.
+        return await _attempt()
+
    @abstractmethod
    def get_default_model(self) -> str:
        """Get the default model for this provider."""
--- a/nanobot/providers/custom_provider.py
+++ b/nanobot/providers/custom_provider.py
@@ -25,7 +25,8 @@ class CustomProvider(LLMProvider):

    async def chat(self, messages: list[dict[str, Any]], tools: list[dict[str, Any]] | None = None,
                   model: str | None = None, max_tokens: int = 4096, temperature: float = 0.7,
-                   reasoning_effort: str | None = None) -> LLMResponse:
+                   reasoning_effort: str | None = None,
+                   tool_choice: str | dict[str, Any] | None = None) -> LLMResponse:
        kwargs: dict[str, Any] = {
            "model": model or self.default_model,
            "messages": self._sanitize_empty_content(messages),
@@ -35,7 +36,7 @@ class CustomProvider(LLMProvider):
        if reasoning_effort:
            kwargs["reasoning_effort"] = reasoning_effort
        if tools:
-            kwargs.update(tools=tools, tool_choice="auto")
+            kwargs.update(tools=tools, tool_choice=tool_choice or "auto")
        try:
            return self._parse(await self._client.chat.completions.create(**kwargs))
        except Exception as e:
--- a/nanobot/providers/litellm_provider.py
+++ b/nanobot/providers/litellm_provider.py
@@ -1,5 +1,6 @@
 """LiteLLM provider implementation for multi-provider support."""

+import hashlib
 import os
 import secrets
 import string
@@ -8,6 +9,7 @@ from typing import Any
 import json_repair
 import litellm
 from litellm import acompletion
+from loguru import logger

 from nanobot.providers.base import LLMProvider, LLMResponse, ToolCallRequest
 from nanobot.providers.registry import find_by_model, find_gateway
@@ -165,17 +167,43 @@ class LiteLLMProvider(LLMProvider):
            return _ANTHROPIC_EXTRA_KEYS
        return frozenset()

+    @staticmethod
+    def _normalize_tool_call_id(tool_call_id: Any) -> Any:
+        """Normalize tool_call_id to a provider-safe 9-char alphanumeric form.
+
+        Non-string values are returned unchanged.  A string that already is
+        exactly nine ASCII letters/digits is kept; any other string is
+        replaced by the first nine hex characters of its SHA-1 digest, so the
+        same input always maps to the same id.
+        """
+        if not isinstance(tool_call_id, str):
+            return tool_call_id
+        # str.isalnum() alone is also True for non-ASCII letters and digits,
+        # so require ASCII explicitly before accepting the id as-is.
+        if len(tool_call_id) == 9 and tool_call_id.isascii() and tool_call_id.isalnum():
+            return tool_call_id
+        return hashlib.sha1(tool_call_id.encode()).hexdigest()[:9]
+
    @staticmethod
    def _sanitize_messages(messages: list[dict[str, Any]], extra_keys: frozenset[str] = frozenset()) -> list[dict[str, Any]]:
        """Strip non-standard keys and ensure assistant messages have a content key."""
        allowed = _ALLOWED_MSG_KEYS | extra_keys
-        sanitized = []
-        for msg in messages:
-            clean = {k: v for k, v in msg.items() if k in allowed}
-            # Strict providers require "content" even when assistant only has tool_calls
-            if clean.get("role") == "assistant" and "content" not in clean:
-                clean["content"] = None
-            sanitized.append(clean)
+        sanitized = LLMProvider._sanitize_request_messages(messages, allowed)
+        id_map: dict[str, str] = {}
+
+        def map_id(value: Any) -> Any:
+            if not isinstance(value, str):
+                return value
+            return id_map.setdefault(value, LiteLLMProvider._normalize_tool_call_id(value))
+
+        for clean in sanitized:
+            # Keep assistant tool_calls[].id and tool tool_call_id in sync after
+            # shortening, otherwise strict providers reject the broken linkage.
+            if isinstance(clean.get("tool_calls"), list):
+                normalized_tool_calls = []
+                for tc in clean["tool_calls"]:
+                    if not isinstance(tc, dict):
+                        normalized_tool_calls.append(tc)
+                        continue
+                    tc_clean = dict(tc)
+                    tc_clean["id"] = map_id(tc_clean.get("id"))
+                    normalized_tool_calls.append(tc_clean)
+                clean["tool_calls"] = normalized_tool_calls
+
+            if "tool_call_id" in clean and clean["tool_call_id"]:
+                clean["tool_call_id"] = map_id(clean["tool_call_id"])
        return sanitized

    async def chat(
@@ -186,6 +214,7 @@ class LiteLLMProvider(LLMProvider):
        max_tokens: int = 4096,
        temperature: float = 0.7,
        reasoning_effort: str | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
    ) -> LLMResponse:
        """
        Send a chat completion request via LiteLLM.
@@ -239,7 +268,7 @@ class LiteLLMProvider(LLMProvider):
        
        if tools:
            kwargs["tools"] = tools
-            kwargs["tool_choice"] = "auto"
+            kwargs["tool_choice"] = tool_choice or "auto"

        try:
            response = await acompletion(**kwargs)
@@ -255,20 +284,44 @@ class LiteLLMProvider(LLMProvider):
        """Parse LiteLLM response into our standard format."""
        choice = response.choices[0]
        message = choice.message
+        content = message.content
+        finish_reason = choice.finish_reason
+
+        # Some providers (e.g. GitHub Copilot) split content and tool_calls
+        # across multiple choices. Merge them so tool_calls are not lost.
+        raw_tool_calls = []
+        for ch in response.choices:
+            msg = ch.message
+            if hasattr(msg, "tool_calls") and msg.tool_calls:
+                raw_tool_calls.extend(msg.tool_calls)
+                if ch.finish_reason in ("tool_calls", "stop"):
+                    finish_reason = ch.finish_reason
+            if not content and msg.content:
+                content = msg.content
+
+        if len(response.choices) > 1:
+            logger.debug("LiteLLM response has {} choices, merged {} tool_calls",
+                         len(response.choices), len(raw_tool_calls))

        tool_calls = []
-        if hasattr(message, "tool_calls") and message.tool_calls:
-            for tc in message.tool_calls:
-                # Parse arguments from JSON string if needed
-                args = tc.function.arguments
-                if isinstance(args, str):
-                    args = json_repair.loads(args)
+        for tc in raw_tool_calls:
+            # Parse arguments from JSON string if needed
+            args = tc.function.arguments
+            if isinstance(args, str):
+                args = json_repair.loads(args)

-                tool_calls.append(ToolCallRequest(
-                    id=_short_tool_id(),
-                    name=tc.function.name,
-                    arguments=args,
-                ))
+            provider_specific_fields = getattr(tc, "provider_specific_fields", None) or None
+            function_provider_specific_fields = (
+                getattr(tc.function, "provider_specific_fields", None) or None
+            )
+
+            tool_calls.append(ToolCallRequest(
+                id=_short_tool_id(),
+                name=tc.function.name,
+                arguments=args,
+                provider_specific_fields=provider_specific_fields,
+                function_provider_specific_fields=function_provider_specific_fields,
+            ))

        usage = {}
        if hasattr(response, "usage") and response.usage:
@@ -280,11 +333,11 @@ class LiteLLMProvider(LLMProvider):

        reasoning_content = getattr(message, "reasoning_content", None) or None
        thinking_blocks = getattr(message, "thinking_blocks", None) or None
-        
+
        return LLMResponse(
-            content=message.content,
+            content=content,
            tool_calls=tool_calls,
-            finish_reason=choice.finish_reason or "stop",
+            finish_reason=finish_reason or "stop",
            usage=usage,
            reasoning_content=reasoning_content,
            thinking_blocks=thinking_blocks,
--- a/nanobot/providers/openai_codex_provider.py
+++ b/nanobot/providers/openai_codex_provider.py
@@ -32,6 +32,7 @@ class OpenAICodexProvider(LLMProvider):
        max_tokens: int = 4096,
        temperature: float = 0.7,
        reasoning_effort: str | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
    ) -> LLMResponse:
        model = model or self.default_model
        system_prompt, input_items = _convert_messages(messages)
@@ -48,7 +49,7 @@ class OpenAICodexProvider(LLMProvider):
            "text": {"verbosity": "medium"},
            "include": ["reasoning.encrypted_content"],
            "prompt_cache_key": _prompt_cache_key(messages),
-            "tool_choice": "auto",
+            "tool_choice": tool_choice or "auto",
            "parallel_tool_calls": True,
        }

--- a/nanobot/providers/registry.py
+++ b/nanobot/providers/registry.py
@@ -26,33 +26,33 @@ class ProviderSpec:
    """

    # identity
-    name: str                       # config field name, e.g. "dashscope"
-    keywords: tuple[str, ...]       # model-name keywords for matching (lowercase)
-    env_key: str                    # LiteLLM env var, e.g. "DASHSCOPE_API_KEY"
-    display_name: str = ""          # shown in `nanobot status`
+    name: str  # config field name, e.g. "dashscope"
+    keywords: tuple[str, ...]  # model-name keywords for matching (lowercase)
+    env_key: str  # LiteLLM env var, e.g. "DASHSCOPE_API_KEY"
+    display_name: str = ""  # shown in `nanobot status`

    # model prefixing
-    litellm_prefix: str = ""                 # "dashscope" → model becomes "dashscope/{model}"
-    skip_prefixes: tuple[str, ...] = ()      # don't prefix if model already starts with these
+    litellm_prefix: str = ""  # "dashscope" → model becomes "dashscope/{model}"
+    skip_prefixes: tuple[str, ...] = ()  # don't prefix if model already starts with these

    # extra env vars, e.g. (("ZHIPUAI_API_KEY", "{api_key}"),)
    env_extras: tuple[tuple[str, str], ...] = ()

    # gateway / local detection
-    is_gateway: bool = False                 # routes any model (OpenRouter, AiHubMix)
-    is_local: bool = False                   # local deployment (vLLM, Ollama)
-    detect_by_key_prefix: str = ""           # match api_key prefix, e.g. "sk-or-"
-    detect_by_base_keyword: str = ""         # match substring in api_base URL
-    default_api_base: str = ""               # fallback base URL
+    is_gateway: bool = False  # routes any model (OpenRouter, AiHubMix)
+    is_local: bool = False  # local deployment (vLLM, Ollama)
+    detect_by_key_prefix: str = ""  # match api_key prefix, e.g. "sk-or-"
+    detect_by_base_keyword: str = ""  # match substring in api_base URL
+    default_api_base: str = ""  # fallback base URL

    # gateway behavior
-    strip_model_prefix: bool = False         # strip "provider/" before re-prefixing
+    strip_model_prefix: bool = False  # strip "provider/" before re-prefixing

    # per-model param overrides, e.g. (("kimi-k2.5", {"temperature": 1.0}),)
    model_overrides: tuple[tuple[str, dict[str, Any]], ...] = ()

    # OAuth-based providers (e.g., OpenAI Codex) don't use API keys
-    is_oauth: bool = False                   # if True, uses OAuth flow instead of API key
+    is_oauth: bool = False  # if True, uses OAuth flow instead of API key

    # Direct providers bypass LiteLLM entirely (e.g., CustomProvider)
    is_direct: bool = False
@@ -70,7 +70,6 @@ class ProviderSpec:
 # ---------------------------------------------------------------------------

 PROVIDERS: tuple[ProviderSpec, ...] = (
-
    # === Custom (direct OpenAI-compatible endpoint, bypasses LiteLLM) ======
    ProviderSpec(
        name="custom",
@@ -81,16 +80,24 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        is_direct=True,
    ),

+    # === Azure OpenAI (direct API calls with API version 2024-10-21) =====
+    ProviderSpec(
+        name="azure_openai",
+        keywords=("azure", "azure-openai"),
+        env_key="",
+        display_name="Azure OpenAI",
+        litellm_prefix="",
+        is_direct=True,
+    ),
    # === Gateways (detected by api_key / api_base, not model name) =========
    # Gateways can route any model, so they win in fallback.
-
    # OpenRouter: global gateway, keys start with "sk-or-"
    ProviderSpec(
        name="openrouter",
        keywords=("openrouter",),
        env_key="OPENROUTER_API_KEY",
        display_name="OpenRouter",
-        litellm_prefix="openrouter",        # claude-3 → openrouter/claude-3
+        litellm_prefix="openrouter",  # claude-3 → openrouter/claude-3
        skip_prefixes=(),
        env_extras=(),
        is_gateway=True,
@@ -102,16 +109,15 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        model_overrides=(),
        supports_prompt_caching=True,
    ),
-
    # AiHubMix: global gateway, OpenAI-compatible interface.
    # strip_model_prefix=True: it doesn't understand "anthropic/claude-3",
    # so we strip to bare "claude-3" then re-prefix as "openai/claude-3".
    ProviderSpec(
        name="aihubmix",
        keywords=("aihubmix",),
-        env_key="OPENAI_API_KEY",           # OpenAI-compatible
+        env_key="OPENAI_API_KEY",  # OpenAI-compatible
        display_name="AiHubMix",
-        litellm_prefix="openai",            # → openai/{model}
+        litellm_prefix="openai",  # → openai/{model}
        skip_prefixes=(),
        env_extras=(),
        is_gateway=True,
@@ -119,10 +125,9 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        detect_by_key_prefix="",
        detect_by_base_keyword="aihubmix",
        default_api_base="https://aihubmix.com/v1",
-        strip_model_prefix=True,            # anthropic/claude-3 → claude-3 → openai/claude-3
+        strip_model_prefix=True,  # anthropic/claude-3 → claude-3 → openai/claude-3
        model_overrides=(),
    ),
-
    # SiliconFlow (硅基流动): OpenAI-compatible gateway, model names keep org prefix
    ProviderSpec(
        name="siliconflow",
@@ -213,8 +218,8 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        model_overrides=(),
    ),

-    # === Standard providers (matched by model-name keywords) ===============

+    # === Standard providers (matched by model-name keywords) ===============
    # Anthropic: LiteLLM recognizes "claude-*" natively, no prefix needed.
    ProviderSpec(
        name="anthropic",
@@ -233,7 +238,6 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        model_overrides=(),
        supports_prompt_caching=True,
    ),
-
    # OpenAI: LiteLLM recognizes "gpt-*" natively, no prefix needed.
    ProviderSpec(
        name="openai",
@@ -251,14 +255,13 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        strip_model_prefix=False,
        model_overrides=(),
    ),
-
    # OpenAI Codex: uses OAuth, not API key.
    ProviderSpec(
        name="openai_codex",
        keywords=("openai-codex",),
-        env_key="",                         # OAuth-based, no API key
+        env_key="",  # OAuth-based, no API key
        display_name="OpenAI Codex",
-        litellm_prefix="",                  # Not routed through LiteLLM
+        litellm_prefix="",  # Not routed through LiteLLM
        skip_prefixes=(),
        env_extras=(),
        is_gateway=False,
@@ -268,16 +271,15 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        default_api_base="https://chatgpt.com/backend-api",
        strip_model_prefix=False,
        model_overrides=(),
-        is_oauth=True,                      # OAuth-based authentication
+        is_oauth=True,  # OAuth-based authentication
    ),
-
    # Github Copilot: uses OAuth, not API key.
    ProviderSpec(
        name="github_copilot",
        keywords=("github_copilot", "copilot"),
-        env_key="",                         # OAuth-based, no API key
+        env_key="",  # OAuth-based, no API key
        display_name="Github Copilot",
-        litellm_prefix="github_copilot",   # github_copilot/model → github_copilot/model
+        litellm_prefix="github_copilot",  # github_copilot/model → github_copilot/model
        skip_prefixes=("github_copilot/",),
        env_extras=(),
        is_gateway=False,
@@ -287,17 +289,16 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        default_api_base="",
        strip_model_prefix=False,
        model_overrides=(),
-        is_oauth=True,                      # OAuth-based authentication
+        is_oauth=True,  # OAuth-based authentication
    ),
-
    # DeepSeek: needs "deepseek/" prefix for LiteLLM routing.
    ProviderSpec(
        name="deepseek",
        keywords=("deepseek",),
        env_key="DEEPSEEK_API_KEY",
        display_name="DeepSeek",
-        litellm_prefix="deepseek",          # deepseek-chat → deepseek/deepseek-chat
-        skip_prefixes=("deepseek/",),       # avoid double-prefix
+        litellm_prefix="deepseek",  # deepseek-chat → deepseek/deepseek-chat
+        skip_prefixes=("deepseek/",),  # avoid double-prefix
        env_extras=(),
        is_gateway=False,
        is_local=False,
@@ -307,15 +308,14 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        strip_model_prefix=False,
        model_overrides=(),
    ),
-
    # Gemini: needs "gemini/" prefix for LiteLLM.
    ProviderSpec(
        name="gemini",
        keywords=("gemini",),
        env_key="GEMINI_API_KEY",
        display_name="Gemini",
-        litellm_prefix="gemini",            # gemini-pro → gemini/gemini-pro
-        skip_prefixes=("gemini/",),         # avoid double-prefix
+        litellm_prefix="gemini",  # gemini-pro → gemini/gemini-pro
+        skip_prefixes=("gemini/",),  # avoid double-prefix
        env_extras=(),
        is_gateway=False,
        is_local=False,
@@ -325,7 +325,6 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        strip_model_prefix=False,
        model_overrides=(),
    ),
-
    # Zhipu: LiteLLM uses "zai/" prefix.
    # Also mirrors key to ZHIPUAI_API_KEY (some LiteLLM paths check that).
    # skip_prefixes: don't add "zai/" when already routed via gateway.
@@ -334,11 +333,9 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        keywords=("zhipu", "glm", "zai"),
        env_key="ZAI_API_KEY",
        display_name="Zhipu AI",
-        litellm_prefix="zai",              # glm-4 → zai/glm-4
+        litellm_prefix="zai",  # glm-4 → zai/glm-4
        skip_prefixes=("zhipu/", "zai/", "openrouter/", "hosted_vllm/"),
-        env_extras=(
-            ("ZHIPUAI_API_KEY", "{api_key}"),
-        ),
+        env_extras=(("ZHIPUAI_API_KEY", "{api_key}"),),
        is_gateway=False,
        is_local=False,
        detect_by_key_prefix="",
@@ -347,14 +344,13 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        strip_model_prefix=False,
        model_overrides=(),
    ),
-
    # DashScope: Qwen models, needs "dashscope/" prefix.
    ProviderSpec(
        name="dashscope",
        keywords=("qwen", "dashscope"),
        env_key="DASHSCOPE_API_KEY",
        display_name="DashScope",
-        litellm_prefix="dashscope",         # qwen-max → dashscope/qwen-max
+        litellm_prefix="dashscope",  # qwen-max → dashscope/qwen-max
        skip_prefixes=("dashscope/", "openrouter/"),
        env_extras=(),
        is_gateway=False,
@@ -365,7 +361,6 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        strip_model_prefix=False,
        model_overrides=(),
    ),
-
    # Moonshot: Kimi models, needs "moonshot/" prefix.
    # LiteLLM requires MOONSHOT_API_BASE env var to find the endpoint.
    # Kimi K2.5 API enforces temperature >= 1.0.
@@ -374,22 +369,17 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        keywords=("moonshot", "kimi"),
        env_key="MOONSHOT_API_KEY",
        display_name="Moonshot",
-        litellm_prefix="moonshot",          # kimi-k2.5 → moonshot/kimi-k2.5
+        litellm_prefix="moonshot",  # kimi-k2.5 → moonshot/kimi-k2.5
        skip_prefixes=("moonshot/", "openrouter/"),
-        env_extras=(
-            ("MOONSHOT_API_BASE", "{api_base}"),
-        ),
+        env_extras=(("MOONSHOT_API_BASE", "{api_base}"),),
        is_gateway=False,
        is_local=False,
        detect_by_key_prefix="",
        detect_by_base_keyword="",
-        default_api_base="https://api.moonshot.ai/v1",   # intl; use api.moonshot.cn for China
+        default_api_base="https://api.moonshot.ai/v1",  # intl; use api.moonshot.cn for China
        strip_model_prefix=False,
-        model_overrides=(
-            ("kimi-k2.5", {"temperature": 1.0}),
-        ),
+        model_overrides=(("kimi-k2.5", {"temperature": 1.0}),),
    ),
-
    # MiniMax: needs "minimax/" prefix for LiteLLM routing.
    # Uses OpenAI-compatible API at api.minimax.io/v1.
    ProviderSpec(
@@ -397,7 +387,7 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        keywords=("minimax",),
        env_key="MINIMAX_API_KEY",
        display_name="MiniMax",
-        litellm_prefix="minimax",            # MiniMax-M2.1 → minimax/MiniMax-M2.1
+        litellm_prefix="minimax",  # MiniMax-M2.1 → minimax/MiniMax-M2.1
        skip_prefixes=("minimax/", "openrouter/"),
        env_extras=(),
        is_gateway=False,
@@ -408,9 +398,7 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        strip_model_prefix=False,
        model_overrides=(),
    ),
-
    # === Local deployment (matched by config key, NOT by api_base) =========
-
    # vLLM / any OpenAI-compatible local server.
    # Detected when config key is "vllm" (provider_name="vllm").
    ProviderSpec(
@@ -418,20 +406,35 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        keywords=("vllm",),
        env_key="HOSTED_VLLM_API_KEY",
        display_name="vLLM/Local",
-        litellm_prefix="hosted_vllm",      # Llama-3-8B → hosted_vllm/Llama-3-8B
+        litellm_prefix="hosted_vllm",  # Llama-3-8B → hosted_vllm/Llama-3-8B
        skip_prefixes=(),
        env_extras=(),
        is_gateway=False,
        is_local=True,
        detect_by_key_prefix="",
        detect_by_base_keyword="",
-        default_api_base="",                # user must provide in config
+        default_api_base="",  # user must provide in config
+        strip_model_prefix=False,
+        model_overrides=(),
+    ),
+    # === Ollama (local; routed via LiteLLM's "ollama_chat" prefix) ==========
+    ProviderSpec(
+        name="ollama",
+        keywords=("ollama", "nemotron"),
+        env_key="OLLAMA_API_KEY",
+        display_name="Ollama",
+        litellm_prefix="ollama_chat",  # model → ollama_chat/model
+        skip_prefixes=("ollama/", "ollama_chat/"),
+        env_extras=(),
+        is_gateway=False,
+        is_local=True,
+        detect_by_key_prefix="",
+        detect_by_base_keyword="11434",
+        default_api_base="http://localhost:11434",
        strip_model_prefix=False,
        model_overrides=(),
    ),
-
    # === Auxiliary (not a primary LLM provider) ============================
-
    # Groq: mainly used for Whisper voice transcription, also usable for LLM.
    # Needs "groq/" prefix for LiteLLM routing. Placed last — it rarely wins fallback.
    ProviderSpec(
@@ -439,8 +442,8 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
        keywords=("groq",),
        env_key="GROQ_API_KEY",
        display_name="Groq",
-        litellm_prefix="groq",              # llama3-8b-8192 → groq/llama3-8b-8192
-        skip_prefixes=("groq/",),           # avoid double-prefix
+        litellm_prefix="groq",  # llama3-8b-8192 → groq/llama3-8b-8192
+        skip_prefixes=("groq/",),  # avoid double-prefix
        env_extras=(),
        is_gateway=False,
        is_local=False,
@@ -457,6 +460,7 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
 # Lookup helpers
 # ---------------------------------------------------------------------------

+
 def find_by_model(model: str) -> ProviderSpec | None:
    """Match a standard provider by model-name keyword (case-insensitive).
    Skips gateways/local — those are matched by api_key/api_base instead."""
@@ -472,7 +476,9 @@ def find_by_model(model: str) -> ProviderSpec | None:
            return spec

    for spec in std_specs:
-        if any(kw in model_lower or kw.replace("-", "_") in model_normalized for kw in spec.keywords):
+        if any(
+            kw in model_lower or kw.replace("-", "_") in model_normalized for kw in spec.keywords
+        ):
            return spec
    return None