From f64ae3b900df63018a385bb0b0f51453f7a555b6 Mon Sep 17 00:00:00 2001
From: Desmond Sow
Date: Wed, 18 Mar 2026 15:02:47 +0800
Subject: [PATCH] feat(provider): add OpenVINO Model Server provider (#2193)

add OpenVINO Model Server provider
---
 README.md                     | 76 +++++++++++++++++++++++++++++++++++
 nanobot/cli/commands.py       |  8 ++++
 nanobot/config/schema.py      |  1 +
 nanobot/providers/registry.py | 11 +++++
 4 files changed, 96 insertions(+)

diff --git a/README.md b/README.md
index 64ae157..52d4504 100644
--- a/README.md
+++ b/README.md
@@ -803,6 +803,7 @@ Config file: `~/.nanobot/config.json`
 | `moonshot` | LLM (Moonshot/Kimi) | [platform.moonshot.cn](https://platform.moonshot.cn) |
 | `zhipu` | LLM (Zhipu GLM) | [open.bigmodel.cn](https://open.bigmodel.cn) |
 | `ollama` | LLM (local, Ollama) | — |
+| `ovms` | LLM (local, OpenVINO Model Server) | [docs.openvino.ai](https://docs.openvino.ai/2026/model-server/ovms_docs_llm_quickstart.html) |
 | `vllm` | LLM (local, any OpenAI-compatible server) | — |
 | `openai_codex` | LLM (Codex, OAuth) | `nanobot provider login openai-codex` |
 | `github_copilot` | LLM (GitHub Copilot, OAuth) | `nanobot provider login github-copilot` |
@@ -938,6 +939,81 @@ ollama run llama3.2
 
 </details>
 
+<details>
+<summary>OpenVINO Model Server (local / OpenAI-compatible)</summary>
+
+Run LLMs locally on Intel GPUs with [OpenVINO Model Server](https://docs.openvino.ai/2026/model-server/ovms_docs_llm_quickstart.html). OVMS exposes an OpenAI-compatible API under the `/v3` path.
+
+> Requires Docker and an Intel GPU with driver access (`/dev/dri`).
+
+**1. Pull the model** (example):
+
+```bash
+mkdir -p ov/models && cd ov
+
+docker run -d \
+  --rm \
+  --user $(id -u):$(id -g) \
+  -v $(pwd)/models:/models \
+  openvino/model_server:latest-gpu \
+  --pull \
+  --model_name openai/gpt-oss-20b \
+  --model_repository_path /models \
+  --source_model OpenVINO/gpt-oss-20b-int4-ov \
+  --task text_generation \
+  --tool_parser gptoss \
+  --reasoning_parser gptoss \
+  --enable_prefix_caching true \
+  --target_device GPU
+```
+
+> This downloads the model weights into `./models`. The container runs detached and exits once the download completes; `docker ps` shows it while it is still working. Wait for it to finish before proceeding.
+
+**2. Start the server** (example):
+
+```bash
+docker run -d \
+  --rm \
+  --name ovms \
+  --user $(id -u):$(id -g) \
+  -p 8000:8000 \
+  -v $(pwd)/models:/models \
+  --device /dev/dri \
+  --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
+  openvino/model_server:latest-gpu \
+  --rest_port 8000 \
+  --model_name openai/gpt-oss-20b \
+  --model_repository_path /models \
+  --source_model OpenVINO/gpt-oss-20b-int4-ov \
+  --task text_generation \
+  --tool_parser gptoss \
+  --reasoning_parser gptoss \
+  --enable_prefix_caching true \
+  --target_device GPU
+```
+
+**3. Add to config** (partial — merge into `~/.nanobot/config.json`):
+
+```json
+{
+  "providers": {
+    "ovms": {
+      "apiBase": "http://localhost:8000/v3"
+    }
+  },
+  "agents": {
+    "defaults": {
+      "provider": "ovms",
+      "model": "openai/gpt-oss-20b"
+    }
+  }
+}
+```
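+
+**4. Verify** (optional). A minimal smoke test, assuming the server from step 2 is listening on `localhost:8000` and serving the model name set via `--model_name`:
+
+```bash
+curl -s http://localhost:8000/v3/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "openai/gpt-oss-20b",
+    "messages": [{"role": "user", "content": "Say hello"}]
+  }'
+```
+
+> The reply should be an OpenAI-style chat completion (a JSON body with a `choices` array), confirming the model is loaded before you point nanobot at it.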
+
+> OVMS is a local server — no API key required. Supports tool calling (`--tool_parser gptoss`), reasoning (`--reasoning_parser gptoss`), and streaming.
+> See the [official OVMS docs](https://docs.openvino.ai/2026/model-server/ovms_docs_llm_quickstart.html) for more details.
+</details>
+
 <details>
 <summary>vLLM (local / OpenAI-compatible)</summary>
diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py
index b915ce9..db348ed 100644
--- a/nanobot/cli/commands.py
+++ b/nanobot/cli/commands.py
@@ -409,6 +409,14 @@ def _make_provider(config: Config):
             api_base=p.api_base,
             default_model=model,
         )
+    # OpenVINO Model Server: direct OpenAI-compatible endpoint at /v3.
+    elif provider_name == "ovms":
+        from nanobot.providers.custom_provider import CustomProvider
+        provider = CustomProvider(
+            api_key=(p.api_key if p else None) or "no-key",  # OVMS needs no key; use a placeholder
+            api_base=config.get_api_base(model) or "http://localhost:8000/v3",
+            default_model=model,
+        )
     else:
         from nanobot.providers.litellm_provider import LiteLLMProvider
         from nanobot.providers.registry import find_by_name
diff --git a/nanobot/config/schema.py b/nanobot/config/schema.py
index 9c841ca..58ead15 100644
--- a/nanobot/config/schema.py
+++ b/nanobot/config/schema.py
@@ -70,6 +70,7 @@ class ProvidersConfig(Base):
     dashscope: ProviderConfig = Field(default_factory=ProviderConfig)
     vllm: ProviderConfig = Field(default_factory=ProviderConfig)
     ollama: ProviderConfig = Field(default_factory=ProviderConfig)  # Ollama local models
+    ovms: ProviderConfig = Field(default_factory=ProviderConfig)  # OpenVINO Model Server (OVMS)
     gemini: ProviderConfig = Field(default_factory=ProviderConfig)
     moonshot: ProviderConfig = Field(default_factory=ProviderConfig)
     minimax: ProviderConfig = Field(default_factory=ProviderConfig)
diff --git a/nanobot/providers/registry.py b/nanobot/providers/registry.py
index 825653f..9cc430b 100644
--- a/nanobot/providers/registry.py
+++ b/nanobot/providers/registry.py
@@ -452,6 +452,17 @@ PROVIDERS: tuple[ProviderSpec, ...] = (
         strip_model_prefix=False,
         model_overrides=(),
     ),
+    # === OpenVINO Model Server (direct, local, OpenAI-compatible at /v3) ===
+    ProviderSpec(
+        name="ovms",
+        keywords=("openvino", "ovms"),
+        env_key="",
+        display_name="OpenVINO Model Server",
+        litellm_prefix="",
+        is_direct=True,
+        is_local=True,
+        default_api_base="http://localhost:8000/v3",
+    ),
     # === Auxiliary (not a primary LLM provider) ============================
     # Groq: mainly used for Whisper voice transcription, also usable for LLM.
     # Needs "groq/" prefix for LiteLLM routing. Placed last — it rarely wins fallback.