refactor: replace <SILENT_OK> with structured post-run evaluation
- Add nanobot/utils/evaluator.py: a lightweight LLM tool call that decides notify/silent after a background task runs
- Remove magic-token injection from heartbeat and cron prompts
- Keep session history clean (no more <SILENT_OK> pollution)
- Add tests for the evaluator and the updated three-phase heartbeat flow
This commit is contained in:
@@ -448,14 +448,14 @@ def gateway(
|
|||||||
"""Execute a cron job through the agent."""
|
"""Execute a cron job through the agent."""
|
||||||
from nanobot.agent.tools.cron import CronTool
|
from nanobot.agent.tools.cron import CronTool
|
||||||
from nanobot.agent.tools.message import MessageTool
|
from nanobot.agent.tools.message import MessageTool
|
||||||
|
from nanobot.utils.evaluator import evaluate_response
|
||||||
|
|
||||||
reminder_note = (
|
reminder_note = (
|
||||||
"[Scheduled Task] Timer finished.\n\n"
|
"[Scheduled Task] Timer finished.\n\n"
|
||||||
f"Task '{job.name}' has been triggered.\n"
|
f"Task '{job.name}' has been triggered.\n"
|
||||||
f"Scheduled instruction: {job.payload.message}"
|
f"Scheduled instruction: {job.payload.message}"
|
||||||
"**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Prevent the agent from scheduling new cron jobs during execution
|
|
||||||
cron_tool = agent.tools.get("cron")
|
cron_tool = agent.tools.get("cron")
|
||||||
cron_token = None
|
cron_token = None
|
||||||
if isinstance(cron_tool, CronTool):
|
if isinstance(cron_tool, CronTool):
|
||||||
@@ -475,13 +475,17 @@ def gateway(
|
|||||||
if isinstance(message_tool, MessageTool) and message_tool._sent_in_turn:
|
if isinstance(message_tool, MessageTool) and message_tool._sent_in_turn:
|
||||||
return response
|
return response
|
||||||
|
|
||||||
if job.payload.deliver and job.payload.to and response and "<SILENT_OK>" not in response:
|
if job.payload.deliver and job.payload.to and response:
|
||||||
from nanobot.bus.events import OutboundMessage
|
should_notify = await evaluate_response(
|
||||||
await bus.publish_outbound(OutboundMessage(
|
response, job.payload.message, provider, agent.model,
|
||||||
channel=job.payload.channel or "cli",
|
)
|
||||||
chat_id=job.payload.to,
|
if should_notify:
|
||||||
content=response
|
from nanobot.bus.events import OutboundMessage
|
||||||
))
|
await bus.publish_outbound(OutboundMessage(
|
||||||
|
channel=job.payload.channel or "cli",
|
||||||
|
chat_id=job.payload.to,
|
||||||
|
content=response,
|
||||||
|
))
|
||||||
return response
|
return response
|
||||||
cron.on_job = on_cron_job
|
cron.on_job = on_cron_job
|
||||||
|
|
||||||
|
|||||||
@@ -139,6 +139,8 @@ class HeartbeatService:
|
|||||||
|
|
||||||
async def _tick(self) -> None:
|
async def _tick(self) -> None:
|
||||||
"""Execute a single heartbeat tick."""
|
"""Execute a single heartbeat tick."""
|
||||||
|
from nanobot.utils.evaluator import evaluate_response
|
||||||
|
|
||||||
content = self._read_heartbeat_file()
|
content = self._read_heartbeat_file()
|
||||||
if not content:
|
if not content:
|
||||||
logger.debug("Heartbeat: HEARTBEAT.md missing or empty")
|
logger.debug("Heartbeat: HEARTBEAT.md missing or empty")
|
||||||
@@ -153,18 +155,19 @@ class HeartbeatService:
|
|||||||
logger.info("Heartbeat: OK (nothing to report)")
|
logger.info("Heartbeat: OK (nothing to report)")
|
||||||
return
|
return
|
||||||
|
|
||||||
taskmessage = tasks + "\n\n**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
|
|
||||||
|
|
||||||
logger.info("Heartbeat: tasks found, executing...")
|
logger.info("Heartbeat: tasks found, executing...")
|
||||||
if self.on_execute:
|
if self.on_execute:
|
||||||
response = await self.on_execute(taskmessage)
|
response = await self.on_execute(tasks)
|
||||||
|
|
||||||
if response and "<SILENT_OK>" in response:
|
if response:
|
||||||
logger.info("Heartbeat: OK (silenced by agent)")
|
should_notify = await evaluate_response(
|
||||||
return
|
response, tasks, self.provider, self.model,
|
||||||
if response and self.on_notify:
|
)
|
||||||
logger.info("Heartbeat: completed, delivering response")
|
if should_notify and self.on_notify:
|
||||||
await self.on_notify(response)
|
logger.info("Heartbeat: completed, delivering response")
|
||||||
|
await self.on_notify(response)
|
||||||
|
else:
|
||||||
|
logger.info("Heartbeat: silenced by post-run evaluation")
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Heartbeat execution failed")
|
logger.exception("Heartbeat execution failed")
|
||||||
|
|
||||||
|
|||||||
92
nanobot/utils/evaluator.py
Normal file
92
nanobot/utils/evaluator.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""Post-run evaluation for background tasks (heartbeat & cron).
|
||||||
|
|
||||||
|
After the agent executes a background task, this module makes a lightweight
|
||||||
|
LLM call to decide whether the result warrants notifying the user.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from nanobot.providers.base import LLMProvider
|
||||||
|
|
||||||
|
# OpenAI-style tool schema offered to the gate model so the notify/suppress
# decision comes back as structured arguments (not free text that would need
# parsing). Only `should_notify` is required; `reason` is for logging.
_EVALUATE_TOOL = [
    {
        "type": "function",
        "function": {
            "name": "evaluate_notification",
            "description": "Decide whether the user should be notified about this background task result.",
            "parameters": {
                "type": "object",
                "properties": {
                    "should_notify": {
                        "type": "boolean",
                        "description": "true = result contains actionable/important info the user should see; false = routine or empty, safe to suppress",
                    },
                    "reason": {
                        "type": "string",
                        "description": "One-sentence reason for the decision",
                    },
                },
                "required": ["should_notify"],
            },
        },
    }
]
|
||||||
|
|
||||||
|
# System prompt for the gate model: given the original task and the agent's
# response, it must call evaluate_notification to choose notify vs. suppress.
_SYSTEM_PROMPT = (
    "You are a notification gate for a background agent. "
    "You will be given the original task and the agent's response. "
    "Call the evaluate_notification tool to decide whether the user "
    "should be notified.\n\n"
    "Notify when the response contains actionable information, errors, "
    "completed deliverables, or anything the user explicitly asked to "
    "be reminded about.\n\n"
    "Suppress when the response is a routine status check with nothing "
    "new, a confirmation that everything is normal, or essentially empty."
)
|
||||||
|
|
||||||
|
|
||||||
|
async def evaluate_response(
    response: str,
    task_context: str,
    provider: LLMProvider,
    model: str,
) -> bool:
    """Decide whether a background-task result should be delivered to the user.

    Uses a lightweight tool-call LLM request (same pattern as heartbeat
    ``_decide()``). Falls back to ``True`` (notify) on any failure so
    that important messages are never silently dropped.

    Args:
        response: The agent's output from the background task run.
        task_context: The original task/instruction given to the agent.
        provider: LLM provider used for the evaluation call.
        model: Model identifier to evaluate with.

    Returns:
        True when the user should be notified, False when the result is
        routine and safe to suppress.
    """
    try:
        llm_response = await provider.chat_with_retry(
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"## Original task\n{task_context}\n\n"
                    f"## Agent response\n{response}"
                )},
            ],
            tools=_EVALUATE_TOOL,
            model=model,
            max_tokens=256,
            temperature=0.0,
        )

        if not llm_response.has_tool_calls:
            logger.warning("evaluate_response: no tool call returned, defaulting to notify")
            return True

        call = llm_response.tool_calls[0]
        # Guard against the model invoking some other tool: treating a
        # stranger's arguments as the gate decision would be wrong, so fail
        # open (notify) just like the no-tool-call case.
        if call.name != "evaluate_notification":
            logger.warning(
                "evaluate_response: unexpected tool call {!r}, defaulting to notify",
                call.name,
            )
            return True

        args = call.arguments
        # Missing should_notify defaults to True: never drop a message on a
        # malformed decision.
        should_notify = args.get("should_notify", True)
        reason = args.get("reason", "")
        logger.info("evaluate_response: should_notify={}, reason={}", should_notify, reason)
        return bool(should_notify)

    except Exception:
        # Fail open on any provider/parsing error: notifying too often is
        # recoverable, silently dropping an important result is not.
        logger.exception("evaluate_response failed, defaulting to notify")
        return True
|
||||||
63
tests/test_evaluator.py
Normal file
63
tests/test_evaluator.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from nanobot.utils.evaluator import evaluate_response
|
||||||
|
from nanobot.providers.base import LLMProvider, LLMResponse, ToolCallRequest
|
||||||
|
|
||||||
|
|
||||||
|
class DummyProvider(LLMProvider):
    """Scripted LLMProvider stub: replays queued responses in FIFO order."""

    def __init__(self, responses: list[LLMResponse]):
        super().__init__()
        # Copy so the caller's list is not mutated as responses are consumed.
        self._responses = list(responses)

    async def chat(self, *args, **kwargs) -> LLMResponse:
        # Pop the next scripted response; once the queue is exhausted, keep
        # returning an empty reply instead of raising.
        if self._responses:
            return self._responses.pop(0)
        return LLMResponse(content="", tool_calls=[])

    def get_default_model(self) -> str:
        return "test-model"
|
||||||
|
|
||||||
|
|
||||||
|
def _eval_tool_call(should_notify: bool, reason: str = "") -> LLMResponse:
    """Build an LLMResponse carrying one evaluate_notification tool call."""
    decision = {"should_notify": should_notify, "reason": reason}
    call = ToolCallRequest(
        id="eval_1",
        name="evaluate_notification",
        arguments=decision,
    )
    return LLMResponse(content="", tool_calls=[call])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_should_notify_true() -> None:
    """Evaluator returns True when the model's tool call says should_notify."""
    provider = DummyProvider([_eval_tool_call(True, "user asked to be reminded")])
    result = await evaluate_response("Task completed with results", "check emails", provider, "m")
    assert result is True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_should_notify_false() -> None:
    """Evaluator returns False when the model's tool call suppresses."""
    provider = DummyProvider([_eval_tool_call(False, "routine check, nothing new")])
    result = await evaluate_response("All clear, no updates", "check status", provider, "m")
    assert result is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_fallback_on_error() -> None:
    """Evaluator fails open (notify) when the provider raises."""
    class FailingProvider(DummyProvider):
        async def chat(self, *args, **kwargs) -> LLMResponse:
            raise RuntimeError("provider down")

    provider = FailingProvider([])
    result = await evaluate_response("some response", "some task", provider, "m")
    assert result is True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_no_tool_call_fallback() -> None:
    """Evaluator fails open (notify) when the model replies in prose, not a tool call."""
    provider = DummyProvider([LLMResponse(content="I think you should notify", tool_calls=[])])
    result = await evaluate_response("some response", "some task", provider, "m")
    assert result is True
|
||||||
@@ -123,6 +123,98 @@ async def test_trigger_now_returns_none_when_decision_is_skip(tmp_path) -> None:
|
|||||||
assert await service.trigger_now() is None
|
assert await service.trigger_now() is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_tick_notifies_when_evaluator_says_yes(tmp_path, monkeypatch) -> None:
    """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=notify -> on_notify called."""
    (tmp_path / "HEARTBEAT.md").write_text("- [ ] check deployments", encoding="utf-8")

    # Scripted provider reply: one `heartbeat run` tool call — presumably
    # consumed by the service's decide phase; confirm against HeartbeatService.
    provider = DummyProvider([
        LLMResponse(
            content="",
            tool_calls=[
                ToolCallRequest(
                    id="hb_1",
                    name="heartbeat",
                    arguments={"action": "run", "tasks": "check deployments"},
                )
            ],
        ),
    ])

    # Capture what the service executes and what it delivers.
    executed: list[str] = []
    notified: list[str] = []

    async def _on_execute(tasks: str) -> str:
        executed.append(tasks)
        return "deployment failed on staging"

    async def _on_notify(response: str) -> None:
        notified.append(response)

    service = HeartbeatService(
        workspace=tmp_path,
        provider=provider,
        model="openai/gpt-4o-mini",
        on_execute=_on_execute,
        on_notify=_on_notify,
    )

    # Stubbed Phase-3 evaluator: always notify.
    async def _eval_notify(*a, **kw):
        return True

    # NOTE(review): patching the module attribute only takes effect because
    # _tick() imports evaluate_response lazily at call time — confirm the
    # lazy import is kept; a top-level import in heartbeat would bypass this.
    monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_notify)

    await service._tick()
    assert executed == ["check deployments"]
    assert notified == ["deployment failed on staging"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_tick_suppresses_when_evaluator_says_no(tmp_path, monkeypatch) -> None:
    """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=silent -> on_notify NOT called."""
    (tmp_path / "HEARTBEAT.md").write_text("- [ ] check status", encoding="utf-8")

    # Scripted provider reply: one `heartbeat run` tool call — presumably
    # consumed by the service's decide phase; confirm against HeartbeatService.
    provider = DummyProvider([
        LLMResponse(
            content="",
            tool_calls=[
                ToolCallRequest(
                    id="hb_1",
                    name="heartbeat",
                    arguments={"action": "run", "tasks": "check status"},
                )
            ],
        ),
    ])

    # Capture what the service executes and what it delivers.
    executed: list[str] = []
    notified: list[str] = []

    async def _on_execute(tasks: str) -> str:
        executed.append(tasks)
        return "everything is fine, no issues"

    async def _on_notify(response: str) -> None:
        notified.append(response)

    service = HeartbeatService(
        workspace=tmp_path,
        provider=provider,
        model="openai/gpt-4o-mini",
        on_execute=_on_execute,
        on_notify=_on_notify,
    )

    # Stubbed Phase-3 evaluator: always suppress.
    async def _eval_silent(*a, **kw):
        return False

    # NOTE(review): relies on _tick()'s lazy import of evaluate_response —
    # see the companion notify test for details.
    monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_silent)

    await service._tick()
    assert executed == ["check status"]
    assert notified == []
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_decide_retries_transient_error_then_succeeds(tmp_path, monkeypatch) -> None:
|
async def test_decide_retries_transient_error_then_succeeds(tmp_path, monkeypatch) -> None:
|
||||||
provider = DummyProvider([
|
provider = DummyProvider([
|
||||||
|
|||||||
Reference in New Issue
Block a user