feat(web): multi-provider web search + Jina Reader fetch
This commit is contained in:
100
README.md
100
README.md
@@ -169,7 +169,9 @@ nanobot channels login
|
||||
|
||||
> [!TIP]
|
||||
> Set your API key in `~/.nanobot/config.json`.
|
||||
> Get API keys: [OpenRouter](https://openrouter.ai/keys) (Global) · [Brave Search](https://brave.com/search/api/) (optional, for web search)
|
||||
> Get API keys: [OpenRouter](https://openrouter.ai/keys) (Global)
|
||||
>
|
||||
> For web search capability setup, please see [Web Search](#web-search).
|
||||
|
||||
**1. Initialize**
|
||||
|
||||
@@ -960,6 +962,102 @@ That's it! Environment variables, model prefixing, config matching, and `nanobot
|
||||
</details>
|
||||
|
||||
|
||||
### Web Search
|
||||
|
||||
nanobot supports multiple web search providers. Configure in `~/.nanobot/config.json` under `tools.web.search`.
|
||||
|
||||
| Provider | Config fields | Env var fallback | Free |
|
||||
|----------|--------------|------------------|------|
|
||||
| `brave` (default) | `apiKey` | `BRAVE_API_KEY` | No |
|
||||
| `tavily` | `apiKey` | `TAVILY_API_KEY` | No |
|
||||
| `jina` | `apiKey` | `JINA_API_KEY` | Free tier (10M tokens) |
|
||||
| `searxng` | `baseUrl` | `SEARXNG_BASE_URL` | Yes (self-hosted) |
|
||||
| `duckduckgo` | — | — | Yes |
|
||||
|
||||
When credentials are missing, nanobot automatically falls back to DuckDuckGo.
|
||||
|
||||
**Brave** (default):
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"web": {
|
||||
"search": {
|
||||
"provider": "brave",
|
||||
"apiKey": "BSA..."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Tavily:**
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"web": {
|
||||
"search": {
|
||||
"provider": "tavily",
|
||||
"apiKey": "tvly-..."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Jina** (free tier with 10M tokens):
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"web": {
|
||||
"search": {
|
||||
"provider": "jina",
|
||||
"apiKey": "jina_..."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**SearXNG** (self-hosted, no API key needed):
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"web": {
|
||||
"search": {
|
||||
"provider": "searxng",
|
||||
"baseUrl": "https://searx.example"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**DuckDuckGo** (zero config):
|
||||
```json
|
||||
{
|
||||
"tools": {
|
||||
"web": {
|
||||
"search": {
|
||||
"provider": "duckduckgo"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `provider` | string | `"brave"` | Search backend: `brave`, `tavily`, `jina`, `searxng`, `duckduckgo` |
|
||||
| `apiKey` | string | `""` | API key for Brave, Tavily, or Jina |
|
||||
| `baseUrl` | string | `""` | Base URL for SearXNG |
|
||||
| `maxResults` | integer | `5` | Results per search (1–10) |
|
||||
|
||||
> [!TIP]
|
||||
> Use `proxy` in `tools.web` to route all web requests (search + fetch) through a proxy:
|
||||
> ```json
|
||||
> { "tools": { "web": { "proxy": "http://127.0.0.1:7890" } } }
|
||||
> ```
|
||||
|
||||
### MCP (Model Context Protocol)
|
||||
|
||||
> [!TIP]
|
||||
|
||||
@@ -29,7 +29,7 @@ from nanobot.providers.base import LLMProvider
|
||||
from nanobot.session.manager import Session, SessionManager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from nanobot.config.schema import ChannelsConfig, ExecToolConfig
|
||||
from nanobot.config.schema import ChannelsConfig, ExecToolConfig, WebSearchConfig
|
||||
from nanobot.cron.service import CronService
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ class AgentLoop:
|
||||
model: str | None = None,
|
||||
max_iterations: int = 40,
|
||||
context_window_tokens: int = 65_536,
|
||||
brave_api_key: str | None = None,
|
||||
web_search_config: WebSearchConfig | None = None,
|
||||
web_proxy: str | None = None,
|
||||
exec_config: ExecToolConfig | None = None,
|
||||
cron_service: CronService | None = None,
|
||||
@@ -64,7 +64,8 @@ class AgentLoop:
|
||||
mcp_servers: dict | None = None,
|
||||
channels_config: ChannelsConfig | None = None,
|
||||
):
|
||||
from nanobot.config.schema import ExecToolConfig
|
||||
from nanobot.config.schema import ExecToolConfig, WebSearchConfig
|
||||
|
||||
self.bus = bus
|
||||
self.channels_config = channels_config
|
||||
self.provider = provider
|
||||
@@ -72,7 +73,7 @@ class AgentLoop:
|
||||
self.model = model or provider.get_default_model()
|
||||
self.max_iterations = max_iterations
|
||||
self.context_window_tokens = context_window_tokens
|
||||
self.brave_api_key = brave_api_key
|
||||
self.web_search_config = web_search_config or WebSearchConfig()
|
||||
self.web_proxy = web_proxy
|
||||
self.exec_config = exec_config or ExecToolConfig()
|
||||
self.cron_service = cron_service
|
||||
@@ -86,7 +87,7 @@ class AgentLoop:
|
||||
workspace=workspace,
|
||||
bus=bus,
|
||||
model=self.model,
|
||||
brave_api_key=brave_api_key,
|
||||
web_search_config=self.web_search_config,
|
||||
web_proxy=web_proxy,
|
||||
exec_config=self.exec_config,
|
||||
restrict_to_workspace=restrict_to_workspace,
|
||||
@@ -121,7 +122,7 @@ class AgentLoop:
|
||||
restrict_to_workspace=self.restrict_to_workspace,
|
||||
path_append=self.exec_config.path_append,
|
||||
))
|
||||
self.tools.register(WebSearchTool(api_key=self.brave_api_key, proxy=self.web_proxy))
|
||||
self.tools.register(WebSearchTool(config=self.web_search_config, proxy=self.web_proxy))
|
||||
self.tools.register(WebFetchTool(proxy=self.web_proxy))
|
||||
self.tools.register(MessageTool(send_callback=self.bus.publish_outbound))
|
||||
self.tools.register(SpawnTool(manager=self.subagents))
|
||||
|
||||
@@ -28,17 +28,18 @@ class SubagentManager:
|
||||
workspace: Path,
|
||||
bus: MessageBus,
|
||||
model: str | None = None,
|
||||
brave_api_key: str | None = None,
|
||||
web_search_config: "WebSearchConfig | None" = None,
|
||||
web_proxy: str | None = None,
|
||||
exec_config: "ExecToolConfig | None" = None,
|
||||
restrict_to_workspace: bool = False,
|
||||
):
|
||||
from nanobot.config.schema import ExecToolConfig
|
||||
from nanobot.config.schema import ExecToolConfig, WebSearchConfig
|
||||
|
||||
self.provider = provider
|
||||
self.workspace = workspace
|
||||
self.bus = bus
|
||||
self.model = model or provider.get_default_model()
|
||||
self.brave_api_key = brave_api_key
|
||||
self.web_search_config = web_search_config or WebSearchConfig()
|
||||
self.web_proxy = web_proxy
|
||||
self.exec_config = exec_config or ExecToolConfig()
|
||||
self.restrict_to_workspace = restrict_to_workspace
|
||||
@@ -101,7 +102,7 @@ class SubagentManager:
|
||||
restrict_to_workspace=self.restrict_to_workspace,
|
||||
path_append=self.exec_config.path_append,
|
||||
))
|
||||
tools.register(WebSearchTool(api_key=self.brave_api_key, proxy=self.web_proxy))
|
||||
tools.register(WebSearchTool(config=self.web_search_config, proxy=self.web_proxy))
|
||||
tools.register(WebFetchTool(proxy=self.web_proxy))
|
||||
|
||||
system_prompt = self._build_subagent_prompt()
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
"""Web tools: web_search and web_fetch."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Any
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
@@ -12,6 +15,9 @@ from loguru import logger
|
||||
|
||||
from nanobot.agent.tools.base import Tool
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from nanobot.config.schema import WebSearchConfig
|
||||
|
||||
# Shared constants
|
||||
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36"
|
||||
MAX_REDIRECTS = 5 # Limit redirects to prevent DoS attacks
|
||||
@@ -44,8 +50,22 @@ def _validate_url(url: str) -> tuple[bool, str]:
|
||||
return False, str(e)
|
||||
|
||||
|
||||
def _format_results(query: str, items: list[dict[str, Any]], n: int) -> str:
    """Render provider search hits as the shared plaintext listing.

    Each item is a dict with ``title``/``url``/``content`` keys; at most *n*
    hits are emitted. HTML is stripped and whitespace normalized so every
    provider produces the same output shape.
    """
    if not items:
        return f"No results for: {query}"
    out = [f"Results for: {query}\n"]
    for idx, hit in enumerate(items[:n], start=1):
        heading = _normalize(_strip_tags(hit.get("title", "")))
        summary = _normalize(_strip_tags(hit.get("content", "")))
        out.append(f"{idx}. {heading}\n {hit.get('url', '')}")
        if summary:
            out.append(f" {summary}")
    return "\n".join(out)
||||
|
||||
|
||||
class WebSearchTool(Tool):
|
||||
"""Search the web using Brave Search API."""
|
||||
"""Search the web using configured provider."""
|
||||
|
||||
name = "web_search"
|
||||
description = "Search the web. Returns titles, URLs, and snippets."
|
||||
@@ -53,61 +73,140 @@ class WebSearchTool(Tool):
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"type": "string", "description": "Search query"},
|
||||
"count": {"type": "integer", "description": "Results (1-10)", "minimum": 1, "maximum": 10}
|
||||
"count": {"type": "integer", "description": "Results (1-10)", "minimum": 1, "maximum": 10},
|
||||
},
|
||||
"required": ["query"]
|
||||
"required": ["query"],
|
||||
}
|
||||
|
||||
def __init__(self, api_key: str | None = None, max_results: int = 5, proxy: str | None = None):
|
||||
self._init_api_key = api_key
|
||||
self.max_results = max_results
|
||||
def __init__(self, config: WebSearchConfig | None = None, proxy: str | None = None):
|
||||
from nanobot.config.schema import WebSearchConfig
|
||||
|
||||
self.config = config if config is not None else WebSearchConfig()
|
||||
self.proxy = proxy
|
||||
|
||||
@property
|
||||
def api_key(self) -> str:
|
||||
"""Resolve API key at call time so env/config changes are picked up."""
|
||||
return self._init_api_key or os.environ.get("BRAVE_API_KEY", "")
|
||||
|
||||
async def execute(self, query: str, count: int | None = None, **kwargs: Any) -> str:
|
||||
if not self.api_key:
|
||||
return (
|
||||
"Error: Brave Search API key not configured. Set it in "
|
||||
"~/.nanobot/config.json under tools.web.search.apiKey "
|
||||
"(or export BRAVE_API_KEY), then restart the gateway."
|
||||
)
|
||||
provider = self.config.provider.strip().lower() or "brave"
|
||||
n = min(max(count or self.config.max_results, 1), 10)
|
||||
|
||||
if provider == "duckduckgo":
|
||||
return await self._search_duckduckgo(query, n)
|
||||
elif provider == "tavily":
|
||||
return await self._search_tavily(query, n)
|
||||
elif provider == "searxng":
|
||||
return await self._search_searxng(query, n)
|
||||
elif provider == "jina":
|
||||
return await self._search_jina(query, n)
|
||||
elif provider == "brave":
|
||||
return await self._search_brave(query, n)
|
||||
else:
|
||||
return f"Error: unknown search provider '{provider}'"
|
||||
|
||||
async def _search_brave(self, query: str, n: int) -> str:
|
||||
api_key = self.config.api_key or os.environ.get("BRAVE_API_KEY", "")
|
||||
if not api_key:
|
||||
logger.warning("BRAVE_API_KEY not set, falling back to DuckDuckGo")
|
||||
return await self._search_duckduckgo(query, n)
|
||||
try:
|
||||
n = min(max(count or self.max_results, 1), 10)
|
||||
logger.debug("WebSearch: {}", "proxy enabled" if self.proxy else "direct connection")
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
r = await client.get(
|
||||
"https://api.search.brave.com/res/v1/web/search",
|
||||
params={"q": query, "count": n},
|
||||
headers={"Accept": "application/json", "X-Subscription-Token": self.api_key},
|
||||
timeout=10.0
|
||||
headers={"Accept": "application/json", "X-Subscription-Token": api_key},
|
||||
timeout=10.0,
|
||||
)
|
||||
r.raise_for_status()
|
||||
|
||||
results = r.json().get("web", {}).get("results", [])[:n]
|
||||
if not results:
|
||||
return f"No results for: {query}"
|
||||
|
||||
lines = [f"Results for: {query}\n"]
|
||||
for i, item in enumerate(results, 1):
|
||||
lines.append(f"{i}. {item.get('title', '')}\n {item.get('url', '')}")
|
||||
if desc := item.get("description"):
|
||||
lines.append(f" {desc}")
|
||||
return "\n".join(lines)
|
||||
except httpx.ProxyError as e:
|
||||
logger.error("WebSearch proxy error: {}", e)
|
||||
return f"Proxy error: {e}"
|
||||
items = [
|
||||
{"title": x.get("title", ""), "url": x.get("url", ""), "content": x.get("description", "")}
|
||||
for x in r.json().get("web", {}).get("results", [])
|
||||
]
|
||||
return _format_results(query, items, n)
|
||||
except Exception as e:
|
||||
logger.error("WebSearch error: {}", e)
|
||||
return f"Error: {e}"
|
||||
|
||||
async def _search_tavily(self, query: str, n: int) -> str:
    """Search via the Tavily API.

    Resolves the key from config first, then the TAVILY_API_KEY env var;
    without a key it falls back to DuckDuckGo instead of erroring out.
    """
    api_key = self.config.api_key or os.environ.get("TAVILY_API_KEY", "")
    if not api_key:
        logger.warning("TAVILY_API_KEY not set, falling back to DuckDuckGo")
        return await self._search_duckduckgo(query, n)
    try:
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            r = await client.post(
                "https://api.tavily.com/search",
                headers={"Authorization": f"Bearer {api_key}"},
                json={"query": query, "max_results": n},
                timeout=15.0,
            )
            r.raise_for_status()
            return _format_results(query, r.json().get("results", []), n)
    except Exception as e:
        # Log before returning, matching _search_brave, so provider
        # failures are visible in the gateway logs and not just to the LLM.
        logger.error("WebSearch (tavily) error: {}", e)
        return f"Error: {e}"
|
||||
|
||||
async def _search_searxng(self, query: str, n: int) -> str:
    """Search via a self-hosted SearXNG instance.

    Resolves the base URL from config first, then SEARXNG_BASE_URL; without
    one it falls back to DuckDuckGo. The derived endpoint is validated
    before any request is made.
    """
    base_url = (self.config.base_url or os.environ.get("SEARXNG_BASE_URL", "")).strip()
    if not base_url:
        logger.warning("SEARXNG_BASE_URL not set, falling back to DuckDuckGo")
        return await self._search_duckduckgo(query, n)
    endpoint = f"{base_url.rstrip('/')}/search"
    is_valid, error_msg = _validate_url(endpoint)
    if not is_valid:
        return f"Error: invalid SearXNG URL: {error_msg}"
    try:
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            r = await client.get(
                endpoint,
                params={"q": query, "format": "json"},
                headers={"User-Agent": USER_AGENT},
                timeout=10.0,
            )
            r.raise_for_status()
            return _format_results(query, r.json().get("results", []), n)
    except Exception as e:
        # Log before returning, matching _search_brave, so provider
        # failures are visible in the gateway logs and not just to the LLM.
        logger.error("WebSearch (searxng) error: {}", e)
        return f"Error: {e}"
|
||||
|
||||
async def _search_jina(self, query: str, n: int) -> str:
    """Search via the Jina s.jina.ai search API.

    Resolves the key from config first, then JINA_API_KEY; without a key it
    falls back to DuckDuckGo. Result snippets are capped at 500 chars since
    Jina returns full page content.
    """
    api_key = self.config.api_key or os.environ.get("JINA_API_KEY", "")
    if not api_key:
        logger.warning("JINA_API_KEY not set, falling back to DuckDuckGo")
        return await self._search_duckduckgo(query, n)
    try:
        headers = {"Accept": "application/json", "Authorization": f"Bearer {api_key}"}
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            r = await client.get(
                # Plain literal: the previous f-string had no placeholders.
                "https://s.jina.ai/",
                params={"q": query},
                headers=headers,
                timeout=15.0,
            )
            r.raise_for_status()
            data = r.json().get("data", [])[:n]
            items = [
                {"title": d.get("title", ""), "url": d.get("url", ""), "content": d.get("content", "")[:500]}
                for d in data
            ]
            return _format_results(query, items, n)
    except Exception as e:
        # Log before returning, matching _search_brave, so provider
        # failures are visible in the gateway logs and not just to the LLM.
        logger.error("WebSearch (jina) error: {}", e)
        return f"Error: {e}"
|
||||
|
||||
async def _search_duckduckgo(self, query: str, n: int) -> str:
    """Query DuckDuckGo through the ddgs package (no credentials required).

    The blocking ddgs call runs in a worker thread so the event loop stays
    responsive; results are normalized into the shared item shape.
    """
    try:
        from ddgs import DDGS

        ddg_client = DDGS(timeout=10)
        hits = await asyncio.to_thread(ddg_client.text, query, max_results=n)
        if not hits:
            return f"No results for: {query}"
        normalized = [
            {"title": h.get("title", ""), "url": h.get("href", ""), "content": h.get("body", "")}
            for h in hits
        ]
        return _format_results(query, normalized, n)
    except Exception as e:
        logger.warning("DuckDuckGo search failed: {}", e)
        return f"Error: DuckDuckGo search failed ({e})"
|
||||
|
||||
|
||||
class WebFetchTool(Tool):
|
||||
"""Fetch and extract content from a URL using Readability."""
|
||||
"""Fetch and extract content from a URL."""
|
||||
|
||||
name = "web_fetch"
|
||||
description = "Fetch URL and extract readable content (HTML → markdown/text)."
|
||||
@@ -116,9 +215,9 @@ class WebFetchTool(Tool):
|
||||
"properties": {
|
||||
"url": {"type": "string", "description": "URL to fetch"},
|
||||
"extractMode": {"type": "string", "enum": ["markdown", "text"], "default": "markdown"},
|
||||
"maxChars": {"type": "integer", "minimum": 100}
|
||||
"maxChars": {"type": "integer", "minimum": 100},
|
||||
},
|
||||
"required": ["url"]
|
||||
"required": ["url"],
|
||||
}
|
||||
|
||||
def __init__(self, max_chars: int = 50000, proxy: str | None = None):
|
||||
@@ -126,15 +225,55 @@ class WebFetchTool(Tool):
|
||||
self.proxy = proxy
|
||||
|
||||
async def execute(self, url: str, extractMode: str = "markdown", maxChars: int | None = None, **kwargs: Any) -> str:
|
||||
from readability import Document
|
||||
|
||||
max_chars = maxChars or self.max_chars
|
||||
is_valid, error_msg = _validate_url(url)
|
||||
if not is_valid:
|
||||
return json.dumps({"error": f"URL validation failed: {error_msg}", "url": url}, ensure_ascii=False)
|
||||
|
||||
result = await self._fetch_jina(url, max_chars)
|
||||
if result is None:
|
||||
result = await self._fetch_readability(url, extractMode, max_chars)
|
||||
return result
|
||||
|
||||
async def _fetch_jina(self, url: str, max_chars: int) -> str | None:
    """Try fetching via Jina Reader API. Returns None on failure.

    Uses JINA_API_KEY from the environment when present (higher rate
    limits); a 429 or any exception yields None so the caller can fall
    back to local readability extraction.
    """
    try:
        request_headers = {"Accept": "application/json", "User-Agent": USER_AGENT}
        token = os.environ.get("JINA_API_KEY", "")
        if token:
            request_headers["Authorization"] = f"Bearer {token}"
        async with httpx.AsyncClient(proxy=self.proxy, timeout=20.0) as client:
            resp = await client.get(f"https://r.jina.ai/{url}", headers=request_headers)
        if resp.status_code == 429:
            logger.debug("Jina Reader rate limited, falling back to readability")
            return None
        resp.raise_for_status()

        payload = resp.json().get("data", {})
        page_title = payload.get("title", "")
        body = payload.get("content", "")
        if not body:
            return None

        if page_title:
            body = f"# {page_title}\n\n{body}"
        was_cut = len(body) > max_chars
        if was_cut:
            body = body[:max_chars]

        return json.dumps({
            "url": url, "finalUrl": payload.get("url", url), "status": resp.status_code,
            "extractor": "jina", "truncated": was_cut, "length": len(body), "text": body,
        }, ensure_ascii=False)
    except Exception as e:
        logger.debug("Jina Reader failed for {}, falling back to readability: {}", url, e)
        return None
|
||||
|
||||
async def _fetch_readability(self, url: str, extract_mode: str, max_chars: int) -> str:
|
||||
"""Local fallback using readability-lxml."""
|
||||
from readability import Document
|
||||
|
||||
try:
|
||||
logger.debug("WebFetch: {}", "proxy enabled" if self.proxy else "direct connection")
|
||||
async with httpx.AsyncClient(
|
||||
follow_redirects=True,
|
||||
max_redirects=MAX_REDIRECTS,
|
||||
@@ -150,17 +289,20 @@ class WebFetchTool(Tool):
|
||||
text, extractor = json.dumps(r.json(), indent=2, ensure_ascii=False), "json"
|
||||
elif "text/html" in ctype or r.text[:256].lower().startswith(("<!doctype", "<html")):
|
||||
doc = Document(r.text)
|
||||
content = self._to_markdown(doc.summary()) if extractMode == "markdown" else _strip_tags(doc.summary())
|
||||
content = self._to_markdown(doc.summary()) if extract_mode == "markdown" else _strip_tags(doc.summary())
|
||||
text = f"# {doc.title()}\n\n{content}" if doc.title() else content
|
||||
extractor = "readability"
|
||||
else:
|
||||
text, extractor = r.text, "raw"
|
||||
|
||||
truncated = len(text) > max_chars
|
||||
if truncated: text = text[:max_chars]
|
||||
if truncated:
|
||||
text = text[:max_chars]
|
||||
|
||||
return json.dumps({"url": url, "finalUrl": str(r.url), "status": r.status_code,
|
||||
"extractor": extractor, "truncated": truncated, "length": len(text), "text": text}, ensure_ascii=False)
|
||||
return json.dumps({
|
||||
"url": url, "finalUrl": str(r.url), "status": r.status_code,
|
||||
"extractor": extractor, "truncated": truncated, "length": len(text), "text": text,
|
||||
}, ensure_ascii=False)
|
||||
except httpx.ProxyError as e:
|
||||
logger.error("WebFetch proxy error for {}: {}", url, e)
|
||||
return json.dumps({"error": f"Proxy error: {e}", "url": url}, ensure_ascii=False)
|
||||
@@ -168,11 +310,10 @@ class WebFetchTool(Tool):
|
||||
logger.error("WebFetch error for {}: {}", url, e)
|
||||
return json.dumps({"error": str(e), "url": url}, ensure_ascii=False)
|
||||
|
||||
def _to_markdown(self, html: str) -> str:
|
||||
def _to_markdown(self, html_content: str) -> str:
|
||||
"""Convert HTML to markdown."""
|
||||
# Convert links, headings, lists before stripping tags
|
||||
text = re.sub(r'<a\s+[^>]*href=["\']([^"\']+)["\'][^>]*>([\s\S]*?)</a>',
|
||||
lambda m: f'[{_strip_tags(m[2])}]({m[1]})', html, flags=re.I)
|
||||
lambda m: f'[{_strip_tags(m[2])}]({m[1]})', html_content, flags=re.I)
|
||||
text = re.sub(r'<h([1-6])[^>]*>([\s\S]*?)</h\1>',
|
||||
lambda m: f'\n{"#" * int(m[1])} {_strip_tags(m[2])}\n', text, flags=re.I)
|
||||
text = re.sub(r'<li[^>]*>([\s\S]*?)</li>', lambda m: f'\n- {_strip_tags(m[1])}', text, flags=re.I)
|
||||
|
||||
@@ -395,7 +395,7 @@ def gateway(
|
||||
model=config.agents.defaults.model,
|
||||
max_iterations=config.agents.defaults.max_tool_iterations,
|
||||
context_window_tokens=config.agents.defaults.context_window_tokens,
|
||||
brave_api_key=config.tools.web.search.api_key or None,
|
||||
web_search_config=config.tools.web.search,
|
||||
web_proxy=config.tools.web.proxy or None,
|
||||
exec_config=config.tools.exec,
|
||||
cron_service=cron,
|
||||
@@ -578,7 +578,7 @@ def agent(
|
||||
model=config.agents.defaults.model,
|
||||
max_iterations=config.agents.defaults.max_tool_iterations,
|
||||
context_window_tokens=config.agents.defaults.context_window_tokens,
|
||||
brave_api_key=config.tools.web.search.api_key or None,
|
||||
web_search_config=config.tools.web.search,
|
||||
web_proxy=config.tools.web.proxy or None,
|
||||
exec_config=config.tools.exec,
|
||||
cron_service=cron,
|
||||
|
||||
@@ -310,7 +310,9 @@ class GatewayConfig(Base):
|
||||
class WebSearchConfig(Base):
|
||||
"""Web search tool configuration."""
|
||||
|
||||
api_key: str = "" # Brave Search API key
|
||||
provider: str = "brave" # brave, tavily, duckduckgo, searxng, jina
|
||||
api_key: str = ""
|
||||
base_url: str = "" # SearXNG base URL
|
||||
max_results: int = 5
|
||||
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ dependencies = [
|
||||
"websockets>=16.0,<17.0",
|
||||
"websocket-client>=1.9.0,<2.0.0",
|
||||
"httpx>=0.28.0,<1.0.0",
|
||||
"ddgs>=9.5.5,<10.0.0",
|
||||
"oauth-cli-kit>=0.1.3,<1.0.0",
|
||||
"loguru>=0.7.3,<1.0.0",
|
||||
"readability-lxml>=0.8.4,<1.0.0",
|
||||
|
||||
162
tests/test_web_search_tool.py
Normal file
162
tests/test_web_search_tool.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Tests for multi-provider web search."""
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from nanobot.agent.tools.web import WebSearchTool
|
||||
from nanobot.config.schema import WebSearchConfig
|
||||
|
||||
|
||||
def _tool(provider: str = "brave", api_key: str = "", base_url: str = "") -> WebSearchTool:
    """Construct a WebSearchTool wired to an in-memory WebSearchConfig."""
    cfg = WebSearchConfig(provider=provider, api_key=api_key, base_url=base_url)
    return WebSearchTool(config=cfg)
|
||||
|
||||
|
||||
def _response(status: int = 200, json: dict | None = None) -> httpx.Response:
    """Create an httpx.Response carrying a stub request.

    raise_for_status() needs a request object attached, which a bare
    Response constructed outside a client does not have.
    """
    resp = httpx.Response(status, json=json)
    resp._request = httpx.Request("GET", "https://mock")
    return resp
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_brave_search(monkeypatch):
    """Brave provider sends the subscription token and formats results."""

    async def fake_get(self, url, **kw):
        assert "brave" in url
        assert kw["headers"]["X-Subscription-Token"] == "brave-key"
        payload = {
            "web": {"results": [{"title": "NanoBot", "url": "https://example.com", "description": "AI assistant"}]}
        }
        return _response(json=payload)

    monkeypatch.setattr(httpx.AsyncClient, "get", fake_get)
    out = await _tool(provider="brave", api_key="brave-key").execute(query="nanobot", count=1)
    assert "NanoBot" in out
    assert "https://example.com" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tavily_search(monkeypatch):
    """Tavily provider posts with a bearer token and formats results."""

    async def fake_post(self, url, **kw):
        assert "tavily" in url
        assert kw["headers"]["Authorization"] == "Bearer tavily-key"
        payload = {
            "results": [{"title": "OpenClaw", "url": "https://openclaw.io", "content": "Framework"}]
        }
        return _response(json=payload)

    monkeypatch.setattr(httpx.AsyncClient, "post", fake_post)
    out = await _tool(provider="tavily", api_key="tavily-key").execute(query="openclaw")
    assert "OpenClaw" in out
    assert "https://openclaw.io" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_searxng_search(monkeypatch):
    """SearXNG provider hits the configured base URL and formats results."""

    async def fake_get(self, url, **kw):
        assert "searx.example" in url
        payload = {
            "results": [{"title": "Result", "url": "https://example.com", "content": "SearXNG result"}]
        }
        return _response(json=payload)

    monkeypatch.setattr(httpx.AsyncClient, "get", fake_get)
    out = await _tool(provider="searxng", base_url="https://searx.example").execute(query="test")
    assert "Result" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_duckduckgo_search(monkeypatch):
    """DuckDuckGo provider returns formatted results from ddgs."""

    class MockDDGS:
        def __init__(self, **kw):
            pass

        def text(self, query, max_results=5):
            return [{"title": "DDG Result", "href": "https://ddg.example", "body": "From DuckDuckGo"}]

    # One patch point is enough: the tool imports DDGS from the ddgs module
    # at call time, so patching "ddgs.DDGS" (as every sibling test does)
    # intercepts it. The extra setattr variants were redundant.
    monkeypatch.setattr("ddgs.DDGS", MockDDGS)

    tool = _tool(provider="duckduckgo")
    result = await tool.execute(query="hello")
    assert "DDG Result" in result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_brave_fallback_to_duckduckgo_when_no_key(monkeypatch):
    """Without a Brave key the tool silently falls back to DuckDuckGo."""

    class FakeDDGS:
        def __init__(self, **kw):
            pass

        def text(self, query, max_results=5):
            return [{"title": "Fallback", "href": "https://ddg.example", "body": "DuckDuckGo fallback"}]

    monkeypatch.setattr("ddgs.DDGS", FakeDDGS)
    monkeypatch.delenv("BRAVE_API_KEY", raising=False)

    out = await _tool(provider="brave", api_key="").execute(query="test")
    assert "Fallback" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_jina_search(monkeypatch):
    """Jina provider sends a bearer token to s.jina.ai and formats results."""

    async def fake_get(self, url, **kw):
        assert "s.jina.ai" in str(url)
        assert kw["headers"]["Authorization"] == "Bearer jina-key"
        payload = {
            "data": [{"title": "Jina Result", "url": "https://jina.ai", "content": "AI search"}]
        }
        return _response(json=payload)

    monkeypatch.setattr(httpx.AsyncClient, "get", fake_get)
    out = await _tool(provider="jina", api_key="jina-key").execute(query="test")
    assert "Jina Result" in out
    assert "https://jina.ai" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_unknown_provider():
    """An unrecognized provider name yields an error mentioning it."""
    out = await _tool(provider="unknown").execute(query="test")
    assert "unknown" in out
    assert "Error" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_default_provider_is_brave(monkeypatch):
    """An empty provider string resolves to the Brave backend."""

    async def fake_get(self, url, **kw):
        assert "brave" in url
        return _response(json={"web": {"results": []}})

    monkeypatch.setattr(httpx.AsyncClient, "get", fake_get)
    out = await _tool(provider="", api_key="test-key").execute(query="test")
    assert "No results" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_searxng_no_base_url_falls_back(monkeypatch):
    """SearXNG without a base URL falls back to DuckDuckGo."""

    class FakeDDGS:
        def __init__(self, **kw):
            pass

        def text(self, query, max_results=5):
            return [{"title": "Fallback", "href": "https://ddg.example", "body": "fallback"}]

    monkeypatch.setattr("ddgs.DDGS", FakeDDGS)
    monkeypatch.delenv("SEARXNG_BASE_URL", raising=False)

    out = await _tool(provider="searxng", base_url="").execute(query="test")
    assert "Fallback" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_searxng_invalid_url():
    """A malformed SearXNG base URL is rejected with an error, no request made."""
    out = await _tool(provider="searxng", base_url="not-a-url").execute(query="test")
    assert "Error" in out
|
||||
Reference in New Issue
Block a user