Merge PR #398: multi-provider web search
This commit is contained in:
100
README.md
100
README.md
@@ -169,7 +169,9 @@ nanobot channels login
|
|||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> Set your API key in `~/.nanobot/config.json`.
|
> Set your API key in `~/.nanobot/config.json`.
|
||||||
> Get API keys: [OpenRouter](https://openrouter.ai/keys) (Global) · [Brave Search](https://brave.com/search/api/) (optional, for web search)
|
> Get API keys: [OpenRouter](https://openrouter.ai/keys) (Global)
|
||||||
|
>
|
||||||
|
> For web search capability setup, please see [Web Search](#web-search).
|
||||||
|
|
||||||
**1. Initialize**
|
**1. Initialize**
|
||||||
|
|
||||||
@@ -960,6 +962,102 @@ That's it! Environment variables, model prefixing, config matching, and `nanobot
|
|||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
### Web Search
|
||||||
|
|
||||||
|
nanobot supports multiple web search providers. Configure in `~/.nanobot/config.json` under `tools.web.search`.
|
||||||
|
|
||||||
|
| Provider | Config fields | Env var fallback | Free |
|
||||||
|
|----------|--------------|------------------|------|
|
||||||
|
| `brave` (default) | `apiKey` | `BRAVE_API_KEY` | No |
|
||||||
|
| `tavily` | `apiKey` | `TAVILY_API_KEY` | No |
|
||||||
|
| `jina` | `apiKey` | `JINA_API_KEY` | Free tier (10M tokens) |
|
||||||
|
| `searxng` | `baseUrl` | `SEARXNG_BASE_URL` | Yes (self-hosted) |
|
||||||
|
| `duckduckgo` | — | — | Yes |
|
||||||
|
|
||||||
|
When credentials are missing, nanobot automatically falls back to DuckDuckGo.
|
||||||
|
|
||||||
|
**Brave** (default):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tools": {
|
||||||
|
"web": {
|
||||||
|
"search": {
|
||||||
|
"provider": "brave",
|
||||||
|
"apiKey": "BSA..."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tavily:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tools": {
|
||||||
|
"web": {
|
||||||
|
"search": {
|
||||||
|
"provider": "tavily",
|
||||||
|
"apiKey": "tvly-..."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Jina** (free tier with 10M tokens):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tools": {
|
||||||
|
"web": {
|
||||||
|
"search": {
|
||||||
|
"provider": "jina",
|
||||||
|
"apiKey": "jina_..."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**SearXNG** (self-hosted, no API key needed):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tools": {
|
||||||
|
"web": {
|
||||||
|
"search": {
|
||||||
|
"provider": "searxng",
|
||||||
|
"baseUrl": "https://searx.example"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**DuckDuckGo** (zero config):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tools": {
|
||||||
|
"web": {
|
||||||
|
"search": {
|
||||||
|
"provider": "duckduckgo"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
| Option | Type | Default | Description |
|
||||||
|
|--------|------|---------|-------------|
|
||||||
|
| `provider` | string | `"brave"` | Search backend: `brave`, `tavily`, `jina`, `searxng`, `duckduckgo` |
|
||||||
|
| `apiKey` | string | `""` | API key for Brave, Tavily, or Jina |
|
||||||
|
| `baseUrl` | string | `""` | Base URL for SearXNG |
|
||||||
|
| `maxResults` | integer | `5` | Results per search (1–10) |
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
> Use `proxy` in `tools.web` to route all web requests (search + fetch) through a proxy:
|
||||||
|
> ```json
|
||||||
|
> { "tools": { "web": { "proxy": "http://127.0.0.1:7890" } } }
|
||||||
|
> ```
|
||||||
|
|
||||||
### MCP (Model Context Protocol)
|
### MCP (Model Context Protocol)
|
||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ from nanobot.providers.base import LLMProvider
|
|||||||
from nanobot.session.manager import Session, SessionManager
|
from nanobot.session.manager import Session, SessionManager
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from nanobot.config.schema import ChannelsConfig, ExecToolConfig
|
from nanobot.config.schema import ChannelsConfig, ExecToolConfig, WebSearchConfig
|
||||||
from nanobot.cron.service import CronService
|
from nanobot.cron.service import CronService
|
||||||
|
|
||||||
|
|
||||||
@@ -55,7 +55,7 @@ class AgentLoop:
|
|||||||
model: str | None = None,
|
model: str | None = None,
|
||||||
max_iterations: int = 40,
|
max_iterations: int = 40,
|
||||||
context_window_tokens: int = 65_536,
|
context_window_tokens: int = 65_536,
|
||||||
brave_api_key: str | None = None,
|
web_search_config: WebSearchConfig | None = None,
|
||||||
web_proxy: str | None = None,
|
web_proxy: str | None = None,
|
||||||
exec_config: ExecToolConfig | None = None,
|
exec_config: ExecToolConfig | None = None,
|
||||||
cron_service: CronService | None = None,
|
cron_service: CronService | None = None,
|
||||||
@@ -64,7 +64,8 @@ class AgentLoop:
|
|||||||
mcp_servers: dict | None = None,
|
mcp_servers: dict | None = None,
|
||||||
channels_config: ChannelsConfig | None = None,
|
channels_config: ChannelsConfig | None = None,
|
||||||
):
|
):
|
||||||
from nanobot.config.schema import ExecToolConfig
|
from nanobot.config.schema import ExecToolConfig, WebSearchConfig
|
||||||
|
|
||||||
self.bus = bus
|
self.bus = bus
|
||||||
self.channels_config = channels_config
|
self.channels_config = channels_config
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
@@ -72,7 +73,7 @@ class AgentLoop:
|
|||||||
self.model = model or provider.get_default_model()
|
self.model = model or provider.get_default_model()
|
||||||
self.max_iterations = max_iterations
|
self.max_iterations = max_iterations
|
||||||
self.context_window_tokens = context_window_tokens
|
self.context_window_tokens = context_window_tokens
|
||||||
self.brave_api_key = brave_api_key
|
self.web_search_config = web_search_config or WebSearchConfig()
|
||||||
self.web_proxy = web_proxy
|
self.web_proxy = web_proxy
|
||||||
self.exec_config = exec_config or ExecToolConfig()
|
self.exec_config = exec_config or ExecToolConfig()
|
||||||
self.cron_service = cron_service
|
self.cron_service = cron_service
|
||||||
@@ -86,7 +87,7 @@ class AgentLoop:
|
|||||||
workspace=workspace,
|
workspace=workspace,
|
||||||
bus=bus,
|
bus=bus,
|
||||||
model=self.model,
|
model=self.model,
|
||||||
brave_api_key=brave_api_key,
|
web_search_config=self.web_search_config,
|
||||||
web_proxy=web_proxy,
|
web_proxy=web_proxy,
|
||||||
exec_config=self.exec_config,
|
exec_config=self.exec_config,
|
||||||
restrict_to_workspace=restrict_to_workspace,
|
restrict_to_workspace=restrict_to_workspace,
|
||||||
@@ -121,7 +122,7 @@ class AgentLoop:
|
|||||||
restrict_to_workspace=self.restrict_to_workspace,
|
restrict_to_workspace=self.restrict_to_workspace,
|
||||||
path_append=self.exec_config.path_append,
|
path_append=self.exec_config.path_append,
|
||||||
))
|
))
|
||||||
self.tools.register(WebSearchTool(api_key=self.brave_api_key, proxy=self.web_proxy))
|
self.tools.register(WebSearchTool(config=self.web_search_config, proxy=self.web_proxy))
|
||||||
self.tools.register(WebFetchTool(proxy=self.web_proxy))
|
self.tools.register(WebFetchTool(proxy=self.web_proxy))
|
||||||
self.tools.register(MessageTool(send_callback=self.bus.publish_outbound))
|
self.tools.register(MessageTool(send_callback=self.bus.publish_outbound))
|
||||||
self.tools.register(SpawnTool(manager=self.subagents))
|
self.tools.register(SpawnTool(manager=self.subagents))
|
||||||
|
|||||||
@@ -28,17 +28,18 @@ class SubagentManager:
|
|||||||
workspace: Path,
|
workspace: Path,
|
||||||
bus: MessageBus,
|
bus: MessageBus,
|
||||||
model: str | None = None,
|
model: str | None = None,
|
||||||
brave_api_key: str | None = None,
|
web_search_config: "WebSearchConfig | None" = None,
|
||||||
web_proxy: str | None = None,
|
web_proxy: str | None = None,
|
||||||
exec_config: "ExecToolConfig | None" = None,
|
exec_config: "ExecToolConfig | None" = None,
|
||||||
restrict_to_workspace: bool = False,
|
restrict_to_workspace: bool = False,
|
||||||
):
|
):
|
||||||
from nanobot.config.schema import ExecToolConfig
|
from nanobot.config.schema import ExecToolConfig, WebSearchConfig
|
||||||
|
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
self.workspace = workspace
|
self.workspace = workspace
|
||||||
self.bus = bus
|
self.bus = bus
|
||||||
self.model = model or provider.get_default_model()
|
self.model = model or provider.get_default_model()
|
||||||
self.brave_api_key = brave_api_key
|
self.web_search_config = web_search_config or WebSearchConfig()
|
||||||
self.web_proxy = web_proxy
|
self.web_proxy = web_proxy
|
||||||
self.exec_config = exec_config or ExecToolConfig()
|
self.exec_config = exec_config or ExecToolConfig()
|
||||||
self.restrict_to_workspace = restrict_to_workspace
|
self.restrict_to_workspace = restrict_to_workspace
|
||||||
@@ -101,7 +102,7 @@ class SubagentManager:
|
|||||||
restrict_to_workspace=self.restrict_to_workspace,
|
restrict_to_workspace=self.restrict_to_workspace,
|
||||||
path_append=self.exec_config.path_append,
|
path_append=self.exec_config.path_append,
|
||||||
))
|
))
|
||||||
tools.register(WebSearchTool(api_key=self.brave_api_key, proxy=self.web_proxy))
|
tools.register(WebSearchTool(config=self.web_search_config, proxy=self.web_proxy))
|
||||||
tools.register(WebFetchTool(proxy=self.web_proxy))
|
tools.register(WebFetchTool(proxy=self.web_proxy))
|
||||||
|
|
||||||
system_prompt = self._build_subagent_prompt()
|
system_prompt = self._build_subagent_prompt()
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
"""Web tools: web_search and web_fetch."""
|
"""Web tools: web_search and web_fetch."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import html
|
import html
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import TYPE_CHECKING, Any
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
@@ -12,6 +15,9 @@ from loguru import logger
|
|||||||
|
|
||||||
from nanobot.agent.tools.base import Tool
|
from nanobot.agent.tools.base import Tool
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from nanobot.config.schema import WebSearchConfig
|
||||||
|
|
||||||
# Shared constants
|
# Shared constants
|
||||||
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36"
|
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36"
|
||||||
MAX_REDIRECTS = 5 # Limit redirects to prevent DoS attacks
|
MAX_REDIRECTS = 5 # Limit redirects to prevent DoS attacks
|
||||||
@@ -44,8 +50,22 @@ def _validate_url(url: str) -> tuple[bool, str]:
|
|||||||
return False, str(e)
|
return False, str(e)
|
||||||
|
|
||||||
|
|
||||||
|
def _format_results(query: str, items: list[dict[str, Any]], n: int) -> str:
|
||||||
|
"""Format provider results into shared plaintext output."""
|
||||||
|
if not items:
|
||||||
|
return f"No results for: {query}"
|
||||||
|
lines = [f"Results for: {query}\n"]
|
||||||
|
for i, item in enumerate(items[:n], 1):
|
||||||
|
title = _normalize(_strip_tags(item.get("title", "")))
|
||||||
|
snippet = _normalize(_strip_tags(item.get("content", "")))
|
||||||
|
lines.append(f"{i}. {title}\n {item.get('url', '')}")
|
||||||
|
if snippet:
|
||||||
|
lines.append(f" {snippet}")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
class WebSearchTool(Tool):
|
class WebSearchTool(Tool):
|
||||||
"""Search the web using Brave Search API."""
|
"""Search the web using configured provider."""
|
||||||
|
|
||||||
name = "web_search"
|
name = "web_search"
|
||||||
description = "Search the web. Returns titles, URLs, and snippets."
|
description = "Search the web. Returns titles, URLs, and snippets."
|
||||||
@@ -53,61 +73,140 @@ class WebSearchTool(Tool):
|
|||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"query": {"type": "string", "description": "Search query"},
|
"query": {"type": "string", "description": "Search query"},
|
||||||
"count": {"type": "integer", "description": "Results (1-10)", "minimum": 1, "maximum": 10}
|
"count": {"type": "integer", "description": "Results (1-10)", "minimum": 1, "maximum": 10},
|
||||||
},
|
},
|
||||||
"required": ["query"]
|
"required": ["query"],
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, api_key: str | None = None, max_results: int = 5, proxy: str | None = None):
|
def __init__(self, config: WebSearchConfig | None = None, proxy: str | None = None):
|
||||||
self._init_api_key = api_key
|
from nanobot.config.schema import WebSearchConfig
|
||||||
self.max_results = max_results
|
|
||||||
|
self.config = config if config is not None else WebSearchConfig()
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
|
|
||||||
@property
|
|
||||||
def api_key(self) -> str:
|
|
||||||
"""Resolve API key at call time so env/config changes are picked up."""
|
|
||||||
return self._init_api_key or os.environ.get("BRAVE_API_KEY", "")
|
|
||||||
|
|
||||||
async def execute(self, query: str, count: int | None = None, **kwargs: Any) -> str:
|
async def execute(self, query: str, count: int | None = None, **kwargs: Any) -> str:
|
||||||
if not self.api_key:
|
provider = self.config.provider.strip().lower() or "brave"
|
||||||
return (
|
n = min(max(count or self.config.max_results, 1), 10)
|
||||||
"Error: Brave Search API key not configured. Set it in "
|
|
||||||
"~/.nanobot/config.json under tools.web.search.apiKey "
|
|
||||||
"(or export BRAVE_API_KEY), then restart the gateway."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
if provider == "duckduckgo":
|
||||||
|
return await self._search_duckduckgo(query, n)
|
||||||
|
elif provider == "tavily":
|
||||||
|
return await self._search_tavily(query, n)
|
||||||
|
elif provider == "searxng":
|
||||||
|
return await self._search_searxng(query, n)
|
||||||
|
elif provider == "jina":
|
||||||
|
return await self._search_jina(query, n)
|
||||||
|
elif provider == "brave":
|
||||||
|
return await self._search_brave(query, n)
|
||||||
|
else:
|
||||||
|
return f"Error: unknown search provider '{provider}'"
|
||||||
|
|
||||||
|
async def _search_brave(self, query: str, n: int) -> str:
|
||||||
|
api_key = self.config.api_key or os.environ.get("BRAVE_API_KEY", "")
|
||||||
|
if not api_key:
|
||||||
|
logger.warning("BRAVE_API_KEY not set, falling back to DuckDuckGo")
|
||||||
|
return await self._search_duckduckgo(query, n)
|
||||||
try:
|
try:
|
||||||
n = min(max(count or self.max_results, 1), 10)
|
|
||||||
logger.debug("WebSearch: {}", "proxy enabled" if self.proxy else "direct connection")
|
|
||||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||||
r = await client.get(
|
r = await client.get(
|
||||||
"https://api.search.brave.com/res/v1/web/search",
|
"https://api.search.brave.com/res/v1/web/search",
|
||||||
params={"q": query, "count": n},
|
params={"q": query, "count": n},
|
||||||
headers={"Accept": "application/json", "X-Subscription-Token": self.api_key},
|
headers={"Accept": "application/json", "X-Subscription-Token": api_key},
|
||||||
timeout=10.0
|
timeout=10.0,
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
items = [
|
||||||
results = r.json().get("web", {}).get("results", [])[:n]
|
{"title": x.get("title", ""), "url": x.get("url", ""), "content": x.get("description", "")}
|
||||||
if not results:
|
for x in r.json().get("web", {}).get("results", [])
|
||||||
return f"No results for: {query}"
|
]
|
||||||
|
return _format_results(query, items, n)
|
||||||
lines = [f"Results for: {query}\n"]
|
|
||||||
for i, item in enumerate(results, 1):
|
|
||||||
lines.append(f"{i}. {item.get('title', '')}\n {item.get('url', '')}")
|
|
||||||
if desc := item.get("description"):
|
|
||||||
lines.append(f" {desc}")
|
|
||||||
return "\n".join(lines)
|
|
||||||
except httpx.ProxyError as e:
|
|
||||||
logger.error("WebSearch proxy error: {}", e)
|
|
||||||
return f"Proxy error: {e}"
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("WebSearch error: {}", e)
|
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
async def _search_tavily(self, query: str, n: int) -> str:
|
||||||
|
api_key = self.config.api_key or os.environ.get("TAVILY_API_KEY", "")
|
||||||
|
if not api_key:
|
||||||
|
logger.warning("TAVILY_API_KEY not set, falling back to DuckDuckGo")
|
||||||
|
return await self._search_duckduckgo(query, n)
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||||
|
r = await client.post(
|
||||||
|
"https://api.tavily.com/search",
|
||||||
|
headers={"Authorization": f"Bearer {api_key}"},
|
||||||
|
json={"query": query, "max_results": n},
|
||||||
|
timeout=15.0,
|
||||||
|
)
|
||||||
|
r.raise_for_status()
|
||||||
|
return _format_results(query, r.json().get("results", []), n)
|
||||||
|
except Exception as e:
|
||||||
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
async def _search_searxng(self, query: str, n: int) -> str:
|
||||||
|
base_url = (self.config.base_url or os.environ.get("SEARXNG_BASE_URL", "")).strip()
|
||||||
|
if not base_url:
|
||||||
|
logger.warning("SEARXNG_BASE_URL not set, falling back to DuckDuckGo")
|
||||||
|
return await self._search_duckduckgo(query, n)
|
||||||
|
endpoint = f"{base_url.rstrip('/')}/search"
|
||||||
|
is_valid, error_msg = _validate_url(endpoint)
|
||||||
|
if not is_valid:
|
||||||
|
return f"Error: invalid SearXNG URL: {error_msg}"
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||||
|
r = await client.get(
|
||||||
|
endpoint,
|
||||||
|
params={"q": query, "format": "json"},
|
||||||
|
headers={"User-Agent": USER_AGENT},
|
||||||
|
timeout=10.0,
|
||||||
|
)
|
||||||
|
r.raise_for_status()
|
||||||
|
return _format_results(query, r.json().get("results", []), n)
|
||||||
|
except Exception as e:
|
||||||
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
async def _search_jina(self, query: str, n: int) -> str:
|
||||||
|
api_key = self.config.api_key or os.environ.get("JINA_API_KEY", "")
|
||||||
|
if not api_key:
|
||||||
|
logger.warning("JINA_API_KEY not set, falling back to DuckDuckGo")
|
||||||
|
return await self._search_duckduckgo(query, n)
|
||||||
|
try:
|
||||||
|
headers = {"Accept": "application/json", "Authorization": f"Bearer {api_key}"}
|
||||||
|
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||||
|
r = await client.get(
|
||||||
|
f"https://s.jina.ai/",
|
||||||
|
params={"q": query},
|
||||||
|
headers=headers,
|
||||||
|
timeout=15.0,
|
||||||
|
)
|
||||||
|
r.raise_for_status()
|
||||||
|
data = r.json().get("data", [])[:n]
|
||||||
|
items = [
|
||||||
|
{"title": d.get("title", ""), "url": d.get("url", ""), "content": d.get("content", "")[:500]}
|
||||||
|
for d in data
|
||||||
|
]
|
||||||
|
return _format_results(query, items, n)
|
||||||
|
except Exception as e:
|
||||||
|
return f"Error: {e}"
|
||||||
|
|
||||||
|
async def _search_duckduckgo(self, query: str, n: int) -> str:
|
||||||
|
try:
|
||||||
|
from ddgs import DDGS
|
||||||
|
|
||||||
|
ddgs = DDGS(timeout=10)
|
||||||
|
raw = await asyncio.to_thread(ddgs.text, query, max_results=n)
|
||||||
|
if not raw:
|
||||||
|
return f"No results for: {query}"
|
||||||
|
items = [
|
||||||
|
{"title": r.get("title", ""), "url": r.get("href", ""), "content": r.get("body", "")}
|
||||||
|
for r in raw
|
||||||
|
]
|
||||||
|
return _format_results(query, items, n)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("DuckDuckGo search failed: {}", e)
|
||||||
|
return f"Error: DuckDuckGo search failed ({e})"
|
||||||
|
|
||||||
|
|
||||||
class WebFetchTool(Tool):
|
class WebFetchTool(Tool):
|
||||||
"""Fetch and extract content from a URL using Readability."""
|
"""Fetch and extract content from a URL."""
|
||||||
|
|
||||||
name = "web_fetch"
|
name = "web_fetch"
|
||||||
description = "Fetch URL and extract readable content (HTML → markdown/text)."
|
description = "Fetch URL and extract readable content (HTML → markdown/text)."
|
||||||
@@ -116,9 +215,9 @@ class WebFetchTool(Tool):
|
|||||||
"properties": {
|
"properties": {
|
||||||
"url": {"type": "string", "description": "URL to fetch"},
|
"url": {"type": "string", "description": "URL to fetch"},
|
||||||
"extractMode": {"type": "string", "enum": ["markdown", "text"], "default": "markdown"},
|
"extractMode": {"type": "string", "enum": ["markdown", "text"], "default": "markdown"},
|
||||||
"maxChars": {"type": "integer", "minimum": 100}
|
"maxChars": {"type": "integer", "minimum": 100},
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["url"],
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, max_chars: int = 50000, proxy: str | None = None):
|
def __init__(self, max_chars: int = 50000, proxy: str | None = None):
|
||||||
@@ -126,15 +225,55 @@ class WebFetchTool(Tool):
|
|||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
|
|
||||||
async def execute(self, url: str, extractMode: str = "markdown", maxChars: int | None = None, **kwargs: Any) -> str:
|
async def execute(self, url: str, extractMode: str = "markdown", maxChars: int | None = None, **kwargs: Any) -> str:
|
||||||
from readability import Document
|
|
||||||
|
|
||||||
max_chars = maxChars or self.max_chars
|
max_chars = maxChars or self.max_chars
|
||||||
is_valid, error_msg = _validate_url(url)
|
is_valid, error_msg = _validate_url(url)
|
||||||
if not is_valid:
|
if not is_valid:
|
||||||
return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url}, ensure_ascii=False)
|
return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url}, ensure_ascii=False)
|
||||||
|
|
||||||
|
result = await self._fetch_jina(url, max_chars)
|
||||||
|
if result is None:
|
||||||
|
result = await self._fetch_readability(url, extractMode, max_chars)
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def _fetch_jina(self, url: str, max_chars: int) -> str | None:
|
||||||
|
"""Try fetching via Jina Reader API. Returns None on failure."""
|
||||||
|
try:
|
||||||
|
headers = {"Accept": "application/json", "User-Agent": USER_AGENT}
|
||||||
|
jina_key = os.environ.get("JINA_API_KEY", "")
|
||||||
|
if jina_key:
|
||||||
|
headers["Authorization"] = f"Bearer {jina_key}"
|
||||||
|
async with httpx.AsyncClient(proxy=self.proxy, timeout=20.0) as client:
|
||||||
|
r = await client.get(f"https://r.jina.ai/{url}", headers=headers)
|
||||||
|
if r.status_code == 429:
|
||||||
|
logger.debug("Jina Reader rate limited, falling back to readability")
|
||||||
|
return None
|
||||||
|
r.raise_for_status()
|
||||||
|
|
||||||
|
data = r.json().get("data", {})
|
||||||
|
title = data.get("title", "")
|
||||||
|
text = data.get("content", "")
|
||||||
|
if not text:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if title:
|
||||||
|
text = f"# {title}\n\n{text}"
|
||||||
|
truncated = len(text) > max_chars
|
||||||
|
if truncated:
|
||||||
|
text = text[:max_chars]
|
||||||
|
|
||||||
|
return json.dumps({
|
||||||
|
"url": url, "finalUrl": data.get("url", url), "status": r.status_code,
|
||||||
|
"extractor": "jina", "truncated": truncated, "length": len(text), "text": text,
|
||||||
|
}, ensure_ascii=False)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Jina Reader failed for {}, falling back to readability: {}", url, e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _fetch_readability(self, url: str, extract_mode: str, max_chars: int) -> str:
|
||||||
|
"""Local fallback using readability-lxml."""
|
||||||
|
from readability import Document
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.debug("WebFetch: {}", "proxy enabled" if self.proxy else "direct connection")
|
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
follow_redirects=True,
|
follow_redirects=True,
|
||||||
max_redirects=MAX_REDIRECTS,
|
max_redirects=MAX_REDIRECTS,
|
||||||
@@ -150,17 +289,20 @@ class WebFetchTool(Tool):
|
|||||||
text, extractor = json.dumps(r.json(), indent=2, ensure_ascii=False), "json"
|
text, extractor = json.dumps(r.json(), indent=2, ensure_ascii=False), "json"
|
||||||
elif "text/html" in ctype or r.text[:256].lower().startswith(("<!doctype", "<html")):
|
elif "text/html" in ctype or r.text[:256].lower().startswith(("<!doctype", "<html")):
|
||||||
doc = Document(r.text)
|
doc = Document(r.text)
|
||||||
content = self._to_markdown(doc.summary()) if extractMode == "markdown" else _strip_tags(doc.summary())
|
content = self._to_markdown(doc.summary()) if extract_mode == "markdown" else _strip_tags(doc.summary())
|
||||||
text = f"# {doc.title()}\n\n{content}" if doc.title() else content
|
text = f"# {doc.title()}\n\n{content}" if doc.title() else content
|
||||||
extractor = "readability"
|
extractor = "readability"
|
||||||
else:
|
else:
|
||||||
text, extractor = r.text, "raw"
|
text, extractor = r.text, "raw"
|
||||||
|
|
||||||
truncated = len(text) > max_chars
|
truncated = len(text) > max_chars
|
||||||
if truncated: text = text[:max_chars]
|
if truncated:
|
||||||
|
text = text[:max_chars]
|
||||||
|
|
||||||
return json.dumps({"url": url, "finalUrl": str(r.url), "status": r.status_code,
|
return json.dumps({
|
||||||
"extractor": extractor, "truncated": truncated, "length": len(text), "text": text}, ensure_ascii=False)
|
"url": url, "finalUrl": str(r.url), "status": r.status_code,
|
||||||
|
"extractor": extractor, "truncated": truncated, "length": len(text), "text": text,
|
||||||
|
}, ensure_ascii=False)
|
||||||
except httpx.ProxyError as e:
|
except httpx.ProxyError as e:
|
||||||
logger.error("WebFetch proxy error for {}: {}", url, e)
|
logger.error("WebFetch proxy error for {}: {}", url, e)
|
||||||
return json.dumps({"error": f"Proxy error: {e}", "url": url}, ensure_ascii=False)
|
return json.dumps({"error": f"Proxy error: {e}", "url": url}, ensure_ascii=False)
|
||||||
@@ -168,11 +310,10 @@ class WebFetchTool(Tool):
|
|||||||
logger.error("WebFetch error for {}: {}", url, e)
|
logger.error("WebFetch error for {}: {}", url, e)
|
||||||
return json.dumps({"error": str(e), "url": url}, ensure_ascii=False)
|
return json.dumps({"error": str(e), "url": url}, ensure_ascii=False)
|
||||||
|
|
||||||
def _to_markdown(self, html: str) -> str:
|
def _to_markdown(self, html_content: str) -> str:
|
||||||
"""Convert HTML to markdown."""
|
"""Convert HTML to markdown."""
|
||||||
# Convert links, headings, lists before stripping tags
|
|
||||||
text = re.sub(r'<a\s+[^>]*href=["\']([^"\']+)["\'][^>]*>([\s\S]*?)</a>',
|
text = re.sub(r'<a\s+[^>]*href=["\']([^"\']+)["\'][^>]*>([\s\S]*?)</a>',
|
||||||
lambda m: f'[{_strip_tags(m[2])}]({m[1]})', html, flags=re.I)
|
lambda m: f'[{_strip_tags(m[2])}]({m[1]})', html_content, flags=re.I)
|
||||||
text = re.sub(r'<h([1-6])[^>]*>([\s\S]*?)</h\1>',
|
text = re.sub(r'<h([1-6])[^>]*>([\s\S]*?)</h\1>',
|
||||||
lambda m: f'\n{"#" * int(m[1])} {_strip_tags(m[2])}\n', text, flags=re.I)
|
lambda m: f'\n{"#" * int(m[1])} {_strip_tags(m[2])}\n', text, flags=re.I)
|
||||||
text = re.sub(r'<li[^>]*>([\s\S]*?)</li>', lambda m: f'\n- {_strip_tags(m[1])}', text, flags=re.I)
|
text = re.sub(r'<li[^>]*>([\s\S]*?)</li>', lambda m: f'\n- {_strip_tags(m[1])}', text, flags=re.I)
|
||||||
|
|||||||
@@ -395,7 +395,7 @@ def gateway(
|
|||||||
model=config.agents.defaults.model,
|
model=config.agents.defaults.model,
|
||||||
max_iterations=config.agents.defaults.max_tool_iterations,
|
max_iterations=config.agents.defaults.max_tool_iterations,
|
||||||
context_window_tokens=config.agents.defaults.context_window_tokens,
|
context_window_tokens=config.agents.defaults.context_window_tokens,
|
||||||
brave_api_key=config.tools.web.search.api_key or None,
|
web_search_config=config.tools.web.search,
|
||||||
web_proxy=config.tools.web.proxy or None,
|
web_proxy=config.tools.web.proxy or None,
|
||||||
exec_config=config.tools.exec,
|
exec_config=config.tools.exec,
|
||||||
cron_service=cron,
|
cron_service=cron,
|
||||||
@@ -578,7 +578,7 @@ def agent(
|
|||||||
model=config.agents.defaults.model,
|
model=config.agents.defaults.model,
|
||||||
max_iterations=config.agents.defaults.max_tool_iterations,
|
max_iterations=config.agents.defaults.max_tool_iterations,
|
||||||
context_window_tokens=config.agents.defaults.context_window_tokens,
|
context_window_tokens=config.agents.defaults.context_window_tokens,
|
||||||
brave_api_key=config.tools.web.search.api_key or None,
|
web_search_config=config.tools.web.search,
|
||||||
web_proxy=config.tools.web.proxy or None,
|
web_proxy=config.tools.web.proxy or None,
|
||||||
exec_config=config.tools.exec,
|
exec_config=config.tools.exec,
|
||||||
cron_service=cron,
|
cron_service=cron,
|
||||||
|
|||||||
@@ -310,7 +310,9 @@ class GatewayConfig(Base):
|
|||||||
class WebSearchConfig(Base):
|
class WebSearchConfig(Base):
|
||||||
"""Web search tool configuration."""
|
"""Web search tool configuration."""
|
||||||
|
|
||||||
api_key: str = "" # Brave Search API key
|
provider: str = "brave" # brave, tavily, duckduckgo, searxng, jina
|
||||||
|
api_key: str = ""
|
||||||
|
base_url: str = "" # SearXNG base URL
|
||||||
max_results: int = 5
|
max_results: int = 5
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ dependencies = [
|
|||||||
"websockets>=16.0,<17.0",
|
"websockets>=16.0,<17.0",
|
||||||
"websocket-client>=1.9.0,<2.0.0",
|
"websocket-client>=1.9.0,<2.0.0",
|
||||||
"httpx>=0.28.0,<1.0.0",
|
"httpx>=0.28.0,<1.0.0",
|
||||||
|
"ddgs>=9.5.5,<10.0.0",
|
||||||
"oauth-cli-kit>=0.1.3,<1.0.0",
|
"oauth-cli-kit>=0.1.3,<1.0.0",
|
||||||
"loguru>=0.7.3,<1.0.0",
|
"loguru>=0.7.3,<1.0.0",
|
||||||
"readability-lxml>=0.8.4,<1.0.0",
|
"readability-lxml>=0.8.4,<1.0.0",
|
||||||
|
|||||||
162
tests/test_web_search_tool.py
Normal file
162
tests/test_web_search_tool.py
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
"""Tests for multi-provider web search."""
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from nanobot.agent.tools.web import WebSearchTool
|
||||||
|
from nanobot.config.schema import WebSearchConfig
|
||||||
|
|
||||||
|
|
||||||
|
def _tool(provider: str = "brave", api_key: str = "", base_url: str = "") -> WebSearchTool:
|
||||||
|
return WebSearchTool(config=WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url))
|
||||||
|
|
||||||
|
|
||||||
|
def _response(status: int = 200, json: dict | None = None) -> httpx.Response:
|
||||||
|
"""Build a mock httpx.Response with a dummy request attached."""
|
||||||
|
r = httpx.Response(status, json=json)
|
||||||
|
r._request = httpx.Request("GET", "https://mock")
|
||||||
|
return r
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_brave_search(monkeypatch):
|
||||||
|
async def mock_get(self, url, **kw):
|
||||||
|
assert "brave" in url
|
||||||
|
assert kw["headers"]["X-Subscription-Token"] == "brave-key"
|
||||||
|
return _response(json={
|
||||||
|
"web": {"results": [{"title": "NanoBot", "url": "https://example.com", "description": "AI assistant"}]}
|
||||||
|
})
|
||||||
|
|
||||||
|
monkeypatch.setattr(httpx.AsyncClient, "get", mock_get)
|
||||||
|
tool = _tool(provider="brave", api_key="brave-key")
|
||||||
|
result = await tool.execute(query="nanobot", count=1)
|
||||||
|
assert "NanoBot" in result
|
||||||
|
assert "https://example.com" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_tavily_search(monkeypatch):
    """Tavily provider: authenticates via Bearer token over POST and renders results."""

    async def fake_post(self, url, **kwargs):
        assert "tavily" in url
        assert kwargs["headers"]["Authorization"] == "Bearer tavily-key"
        body = {"results": [{"title": "OpenClaw", "url": "https://openclaw.io", "content": "Framework"}]}
        return _response(json=body)

    monkeypatch.setattr(httpx.AsyncClient, "post", fake_post)

    result = await _tool(provider="tavily", api_key="tavily-key").execute(query="openclaw")

    assert "OpenClaw" in result
    assert "https://openclaw.io" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_searxng_search(monkeypatch):
    """SearXNG provider: queries the configured self-hosted base URL."""

    async def fake_get(self, url, **kwargs):
        assert "searx.example" in url
        body = {"results": [{"title": "Result", "url": "https://example.com", "content": "SearXNG result"}]}
        return _response(json=body)

    monkeypatch.setattr(httpx.AsyncClient, "get", fake_get)

    result = await _tool(provider="searxng", base_url="https://searx.example").execute(query="test")

    assert "Result" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_duckduckgo_search(monkeypatch):
    """DuckDuckGo provider: results come from the ddgs client, no API key needed.

    The original test patched the mock in three redundant ways (string target,
    an identical module-attribute target via a mid-function import, and the
    ddgs package) and left a dead ``from ddgs import DDGS`` import behind.
    Two patch targets are sufficient to cover both import styles the tool
    module may use.
    """

    class MockDDGS:
        def __init__(self, **kw):
            pass

        def text(self, query, max_results=5):
            return [{"title": "DDG Result", "href": "https://ddg.example", "body": "From DuckDuckGo"}]

    # Patch the name as re-exported by the tool module (if present) and at its
    # source in the ddgs package, so the mock is used regardless of whether the
    # tool does `from ddgs import DDGS` at import time or looks it up lazily.
    monkeypatch.setattr("nanobot.agent.tools.web.DDGS", MockDDGS, raising=False)
    monkeypatch.setattr("ddgs.DDGS", MockDDGS)

    tool = _tool(provider="duckduckgo")
    result = await tool.execute(query="hello")
    assert "DDG Result" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_brave_fallback_to_duckduckgo_when_no_key(monkeypatch):
    """Without a Brave API key (config or env), the tool falls back to DuckDuckGo."""

    class FakeDDGS:
        def __init__(self, **kwargs):
            pass

        def text(self, query, max_results=5):
            return [{"title": "Fallback", "href": "https://ddg.example", "body": "DuckDuckGo fallback"}]

    monkeypatch.delenv("BRAVE_API_KEY", raising=False)
    monkeypatch.setattr("ddgs.DDGS", FakeDDGS)

    result = await _tool(provider="brave", api_key="").execute(query="test")

    assert "Fallback" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_jina_search(monkeypatch):
    """Jina provider: hits s.jina.ai with a Bearer token and renders results."""

    async def fake_get(self, url, **kwargs):
        assert "s.jina.ai" in str(url)
        assert kwargs["headers"]["Authorization"] == "Bearer jina-key"
        return _response(json={"data": [{"title": "Jina Result", "url": "https://jina.ai", "content": "AI search"}]})

    monkeypatch.setattr(httpx.AsyncClient, "get", fake_get)

    result = await _tool(provider="jina", api_key="jina-key").execute(query="test")

    assert "Jina Result" in result
    assert "https://jina.ai" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_unknown_provider():
    """An unrecognized provider name surfaces as an error string, not an exception."""
    result = await _tool(provider="unknown").execute(query="test")
    assert "unknown" in result
    assert "Error" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_default_provider_is_brave(monkeypatch):
    """An empty provider string defaults to Brave; an empty result set reports 'No results'."""

    async def fake_get(self, url, **kwargs):
        assert "brave" in url
        return _response(json={"web": {"results": []}})

    monkeypatch.setattr(httpx.AsyncClient, "get", fake_get)

    result = await _tool(provider="", api_key="test-key").execute(query="test")

    assert "No results" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_searxng_no_base_url_falls_back(monkeypatch):
    """SearXNG with no base URL (config or env) falls back to DuckDuckGo."""

    class FakeDDGS:
        def __init__(self, **kwargs):
            pass

        def text(self, query, max_results=5):
            return [{"title": "Fallback", "href": "https://ddg.example", "body": "fallback"}]

    monkeypatch.delenv("SEARXNG_BASE_URL", raising=False)
    monkeypatch.setattr("ddgs.DDGS", FakeDDGS)

    result = await _tool(provider="searxng", base_url="").execute(query="test")

    assert "Fallback" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_searxng_invalid_url():
    """A malformed SearXNG base URL surfaces as an error string."""
    result = await _tool(provider="searxng", base_url="not-a-url").execute(query="test")
    assert "Error" in result
|
||||||
Reference in New Issue
Block a user