refactor: replace <SILENT_OK> with structured post-run evaluation

- Add nanobot/utils/evaluator.py: lightweight LLM tool-call to decide notify/silent after background task execution
- Remove magic token injection from heartbeat and cron prompts
- Clean session history (no more <SILENT_OK> pollution)
- Add tests for evaluator and updated heartbeat three-phase flow
This commit is contained in:
Xubin Ren
2026-03-14 09:29:56 +00:00
committed by Xubin Ren
parent e6c1f520ac
commit 411b059dd2
5 changed files with 272 additions and 18 deletions

View File

@@ -448,14 +448,14 @@ def gateway(
"""Execute a cron job through the agent."""
from nanobot.agent.tools.cron import CronTool
from nanobot.agent.tools.message import MessageTool
from nanobot.utils.evaluator import evaluate_response
reminder_note = (
"[Scheduled Task] Timer finished.\n\n"
f"Task '{job.name}' has been triggered.\n"
f"Scheduled instruction: {job.payload.message}"
"**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
)
# Prevent the agent from scheduling new cron jobs during execution
cron_tool = agent.tools.get("cron")
cron_token = None
if isinstance(cron_tool, CronTool):
@@ -475,13 +475,17 @@ def gateway(
if isinstance(message_tool, MessageTool) and message_tool._sent_in_turn:
return response
if job.payload.deliver and job.payload.to and response and "<SILENT_OK>" not in response:
from nanobot.bus.events import OutboundMessage
await bus.publish_outbound(OutboundMessage(
channel=job.payload.channel or "cli",
chat_id=job.payload.to,
content=response
))
if job.payload.deliver and job.payload.to and response:
should_notify = await evaluate_response(
response, job.payload.message, provider, agent.model,
)
if should_notify:
from nanobot.bus.events import OutboundMessage
await bus.publish_outbound(OutboundMessage(
channel=job.payload.channel or "cli",
chat_id=job.payload.to,
content=response,
))
return response
cron.on_job = on_cron_job

View File

@@ -139,6 +139,8 @@ class HeartbeatService:
async def _tick(self) -> None:
"""Execute a single heartbeat tick."""
from nanobot.utils.evaluator import evaluate_response
content = self._read_heartbeat_file()
if not content:
logger.debug("Heartbeat: HEARTBEAT.md missing or empty")
@@ -153,18 +155,19 @@ class HeartbeatService:
logger.info("Heartbeat: OK (nothing to report)")
return
taskmessage = tasks + "\n\n**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
logger.info("Heartbeat: tasks found, executing...")
if self.on_execute:
response = await self.on_execute(taskmessage)
response = await self.on_execute(tasks)
if response and "<SILENT_OK>" in response:
logger.info("Heartbeat: OK (silenced by agent)")
return
if response and self.on_notify:
logger.info("Heartbeat: completed, delivering response")
await self.on_notify(response)
if response:
should_notify = await evaluate_response(
response, tasks, self.provider, self.model,
)
if should_notify and self.on_notify:
logger.info("Heartbeat: completed, delivering response")
await self.on_notify(response)
else:
logger.info("Heartbeat: silenced by post-run evaluation")
except Exception:
logger.exception("Heartbeat execution failed")

View File

@@ -0,0 +1,92 @@
"""Post-run evaluation for background tasks (heartbeat & cron).
After the agent executes a background task, this module makes a lightweight
LLM call to decide whether the result warrants notifying the user.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from loguru import logger
if TYPE_CHECKING:
from nanobot.providers.base import LLMProvider
_EVALUATE_TOOL = [
{
"type": "function",
"function": {
"name": "evaluate_notification",
"description": "Decide whether the user should be notified about this background task result.",
"parameters": {
"type": "object",
"properties": {
"should_notify": {
"type": "boolean",
"description": "true = result contains actionable/important info the user should see; false = routine or empty, safe to suppress",
},
"reason": {
"type": "string",
"description": "One-sentence reason for the decision",
},
},
"required": ["should_notify"],
},
},
}
]
_SYSTEM_PROMPT = (
"You are a notification gate for a background agent. "
"You will be given the original task and the agent's response. "
"Call the evaluate_notification tool to decide whether the user "
"should be notified.\n\n"
"Notify when the response contains actionable information, errors, "
"completed deliverables, or anything the user explicitly asked to "
"be reminded about.\n\n"
"Suppress when the response is a routine status check with nothing "
"new, a confirmation that everything is normal, or essentially empty."
)
async def evaluate_response(
response: str,
task_context: str,
provider: LLMProvider,
model: str,
) -> bool:
"""Decide whether a background-task result should be delivered to the user.
Uses a lightweight tool-call LLM request (same pattern as heartbeat
``_decide()``). Falls back to ``True`` (notify) on any failure so
that important messages are never silently dropped.
"""
try:
llm_response = await provider.chat_with_retry(
messages=[
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": (
f"## Original task\n{task_context}\n\n"
f"## Agent response\n{response}"
)},
],
tools=_EVALUATE_TOOL,
model=model,
max_tokens=256,
temperature=0.0,
)
if not llm_response.has_tool_calls:
logger.warning("evaluate_response: no tool call returned, defaulting to notify")
return True
args = llm_response.tool_calls[0].arguments
should_notify = args.get("should_notify", True)
reason = args.get("reason", "")
logger.info("evaluate_response: should_notify={}, reason={}", should_notify, reason)
return bool(should_notify)
except Exception:
logger.exception("evaluate_response failed, defaulting to notify")
return True

63
tests/test_evaluator.py Normal file
View File

@@ -0,0 +1,63 @@
import pytest
from nanobot.utils.evaluator import evaluate_response
from nanobot.providers.base import LLMProvider, LLMResponse, ToolCallRequest
class DummyProvider(LLMProvider):
def __init__(self, responses: list[LLMResponse]):
super().__init__()
self._responses = list(responses)
async def chat(self, *args, **kwargs) -> LLMResponse:
if self._responses:
return self._responses.pop(0)
return LLMResponse(content="", tool_calls=[])
def get_default_model(self) -> str:
return "test-model"
def _eval_tool_call(should_notify: bool, reason: str = "") -> LLMResponse:
return LLMResponse(
content="",
tool_calls=[
ToolCallRequest(
id="eval_1",
name="evaluate_notification",
arguments={"should_notify": should_notify, "reason": reason},
)
],
)
@pytest.mark.asyncio
async def test_should_notify_true() -> None:
provider = DummyProvider([_eval_tool_call(True, "user asked to be reminded")])
result = await evaluate_response("Task completed with results", "check emails", provider, "m")
assert result is True
@pytest.mark.asyncio
async def test_should_notify_false() -> None:
provider = DummyProvider([_eval_tool_call(False, "routine check, nothing new")])
result = await evaluate_response("All clear, no updates", "check status", provider, "m")
assert result is False
@pytest.mark.asyncio
async def test_fallback_on_error() -> None:
class FailingProvider(DummyProvider):
async def chat(self, *args, **kwargs) -> LLMResponse:
raise RuntimeError("provider down")
provider = FailingProvider([])
result = await evaluate_response("some response", "some task", provider, "m")
assert result is True
@pytest.mark.asyncio
async def test_no_tool_call_fallback() -> None:
provider = DummyProvider([LLMResponse(content="I think you should notify", tool_calls=[])])
result = await evaluate_response("some response", "some task", provider, "m")
assert result is True

View File

@@ -123,6 +123,98 @@ async def test_trigger_now_returns_none_when_decision_is_skip(tmp_path) -> None:
assert await service.trigger_now() is None
@pytest.mark.asyncio
async def test_tick_notifies_when_evaluator_says_yes(tmp_path, monkeypatch) -> None:
"""Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=notify -> on_notify called."""
(tmp_path / "HEARTBEAT.md").write_text("- [ ] check deployments", encoding="utf-8")
provider = DummyProvider([
LLMResponse(
content="",
tool_calls=[
ToolCallRequest(
id="hb_1",
name="heartbeat",
arguments={"action": "run", "tasks": "check deployments"},
)
],
),
])
executed: list[str] = []
notified: list[str] = []
async def _on_execute(tasks: str) -> str:
executed.append(tasks)
return "deployment failed on staging"
async def _on_notify(response: str) -> None:
notified.append(response)
service = HeartbeatService(
workspace=tmp_path,
provider=provider,
model="openai/gpt-4o-mini",
on_execute=_on_execute,
on_notify=_on_notify,
)
async def _eval_notify(*a, **kw):
return True
monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_notify)
await service._tick()
assert executed == ["check deployments"]
assert notified == ["deployment failed on staging"]
@pytest.mark.asyncio
async def test_tick_suppresses_when_evaluator_says_no(tmp_path, monkeypatch) -> None:
"""Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=silent -> on_notify NOT called."""
(tmp_path / "HEARTBEAT.md").write_text("- [ ] check status", encoding="utf-8")
provider = DummyProvider([
LLMResponse(
content="",
tool_calls=[
ToolCallRequest(
id="hb_1",
name="heartbeat",
arguments={"action": "run", "tasks": "check status"},
)
],
),
])
executed: list[str] = []
notified: list[str] = []
async def _on_execute(tasks: str) -> str:
executed.append(tasks)
return "everything is fine, no issues"
async def _on_notify(response: str) -> None:
notified.append(response)
service = HeartbeatService(
workspace=tmp_path,
provider=provider,
model="openai/gpt-4o-mini",
on_execute=_on_execute,
on_notify=_on_notify,
)
async def _eval_silent(*a, **kw):
return False
monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_silent)
await service._tick()
assert executed == ["check status"]
assert notified == []
@pytest.mark.asyncio
async def test_decide_retries_transient_error_then_succeeds(tmp_path, monkeypatch) -> None:
provider = DummyProvider([