From 411b059dd22884ba7b54d6c8c00bcc4add95bfb0 Mon Sep 17 00:00:00 2001 From: Xubin Ren Date: Sat, 14 Mar 2026 09:29:56 +0000 Subject: [PATCH] refactor: replace with structured post-run evaluation - Add nanobot/utils/evaluator.py: lightweight LLM tool-call to decide notify/silent after background task execution - Remove magic token injection from heartbeat and cron prompts - Clean session history (no more pollution) - Add tests for evaluator and updated heartbeat three-phase flow --- nanobot/cli/commands.py | 22 ++++---- nanobot/heartbeat/service.py | 21 ++++---- nanobot/utils/evaluator.py | 92 +++++++++++++++++++++++++++++++++ tests/test_evaluator.py | 63 ++++++++++++++++++++++ tests/test_heartbeat_service.py | 92 +++++++++++++++++++++++++++++++++ 5 files changed, 272 insertions(+), 18 deletions(-) create mode 100644 nanobot/utils/evaluator.py create mode 100644 tests/test_evaluator.py diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py index c4aa868..d8aa411 100644 --- a/nanobot/cli/commands.py +++ b/nanobot/cli/commands.py @@ -448,14 +448,14 @@ def gateway( """Execute a cron job through the agent.""" from nanobot.agent.tools.cron import CronTool from nanobot.agent.tools.message import MessageTool + from nanobot.utils.evaluator import evaluate_response + reminder_note = ( "[Scheduled Task] Timer finished.\n\n" f"Task '{job.name}' has been triggered.\n" f"Scheduled instruction: {job.payload.message}" - "**IMPORTANT NOTICE:** If there is nothing material to report, reply only with ." ) - # Prevent the agent from scheduling new cron jobs during execution cron_tool = agent.tools.get("cron") cron_token = None if isinstance(cron_tool, CronTool): @@ -475,13 +475,17 @@ def gateway( if isinstance(message_tool, MessageTool) and message_tool._sent_in_turn: return response - if job.payload.deliver and job.payload.to and response and "" not in response: - from nanobot.bus.events import OutboundMessage - await bus.publish_outbound(OutboundMessage( - channel=job.payload.channel or "cli", - chat_id=job.payload.to, - content=response - )) + if job.payload.deliver and job.payload.to and response: + should_notify = await evaluate_response( + response, job.payload.message, provider, agent.model, + ) + if should_notify: + from nanobot.bus.events import OutboundMessage + await bus.publish_outbound(OutboundMessage( + channel=job.payload.channel or "cli", + chat_id=job.payload.to, + content=response, + )) return response cron.on_job = on_cron_job diff --git a/nanobot/heartbeat/service.py b/nanobot/heartbeat/service.py index 916c813..2242802 100644 --- a/nanobot/heartbeat/service.py +++ b/nanobot/heartbeat/service.py @@ -139,6 +139,8 @@ class HeartbeatService: async def _tick(self) -> None: """Execute a single heartbeat tick.""" + from nanobot.utils.evaluator import evaluate_response + content = self._read_heartbeat_file() if not content: logger.debug("Heartbeat: HEARTBEAT.md missing or empty") @@ -153,18 +155,19 @@ class HeartbeatService: logger.info("Heartbeat: OK (nothing to report)") return - taskmessage = tasks + "\n\n**IMPORTANT NOTICE:** If there is nothing material to report, reply only with ." - logger.info("Heartbeat: tasks found, executing...") if self.on_execute: - response = await self.on_execute(taskmessage) + response = await self.on_execute(tasks) - if response and "" in response: - logger.info("Heartbeat: OK (silenced by agent)") - return - if response and self.on_notify: - logger.info("Heartbeat: completed, delivering response") - await self.on_notify(response) + if response: + should_notify = await evaluate_response( + response, tasks, self.provider, self.model, + ) + if should_notify and self.on_notify: + logger.info("Heartbeat: completed, delivering response") + await self.on_notify(response) + else: + logger.info("Heartbeat: silenced by post-run evaluation") except Exception: logger.exception("Heartbeat execution failed") diff --git a/nanobot/utils/evaluator.py b/nanobot/utils/evaluator.py new file mode 100644 index 0000000..6110471 --- /dev/null +++ b/nanobot/utils/evaluator.py @@ -0,0 +1,92 @@ +"""Post-run evaluation for background tasks (heartbeat & cron). + +After the agent executes a background task, this module makes a lightweight +LLM call to decide whether the result warrants notifying the user. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from loguru import logger + +if TYPE_CHECKING: + from nanobot.providers.base import LLMProvider + +_EVALUATE_TOOL = [ + { + "type": "function", + "function": { + "name": "evaluate_notification", + "description": "Decide whether the user should be notified about this background task result.", + "parameters": { + "type": "object", + "properties": { + "should_notify": { + "type": "boolean", + "description": "true = result contains actionable/important info the user should see; false = routine or empty, safe to suppress", + }, + "reason": { + "type": "string", + "description": "One-sentence reason for the decision", + }, + }, + "required": ["should_notify"], + }, + }, + } +] + +_SYSTEM_PROMPT = ( + "You are a notification gate for a background agent. " + "You will be given the original task and the agent's response. " + "Call the evaluate_notification tool to decide whether the user " + "should be notified.\n\n" + "Notify when the response contains actionable information, errors, " + "completed deliverables, or anything the user explicitly asked to " + "be reminded about.\n\n" + "Suppress when the response is a routine status check with nothing " + "new, a confirmation that everything is normal, or essentially empty." +) + + +async def evaluate_response( + response: str, + task_context: str, + provider: LLMProvider, + model: str, +) -> bool: + """Decide whether a background-task result should be delivered to the user. + + Uses a lightweight tool-call LLM request (same pattern as heartbeat + ``_decide()``). Falls back to ``True`` (notify) on any failure so + that important messages are never silently dropped. + """ + try: + llm_response = await provider.chat_with_retry( + messages=[ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": ( + f"## Original task\n{task_context}\n\n" + f"## Agent response\n{response}" + )}, + ], + tools=_EVALUATE_TOOL, + model=model, + max_tokens=256, + temperature=0.0, + ) + + if not llm_response.has_tool_calls: + logger.warning("evaluate_response: no tool call returned, defaulting to notify") + return True + + args = llm_response.tool_calls[0].arguments + should_notify = args.get("should_notify", True) + reason = args.get("reason", "") + logger.info("evaluate_response: should_notify={}, reason={}", should_notify, reason) + return bool(should_notify) + + except Exception: + logger.exception("evaluate_response failed, defaulting to notify") + return True diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py new file mode 100644 index 0000000..08d068b --- /dev/null +++ b/tests/test_evaluator.py @@ -0,0 +1,63 @@ +import pytest + +from nanobot.utils.evaluator import evaluate_response +from nanobot.providers.base import LLMProvider, LLMResponse, ToolCallRequest + + +class DummyProvider(LLMProvider): + def __init__(self, responses: list[LLMResponse]): + super().__init__() + self._responses = list(responses) + + async def chat(self, *args, **kwargs) -> LLMResponse: + if self._responses: + return self._responses.pop(0) + return LLMResponse(content="", tool_calls=[]) + + def get_default_model(self) -> str: + return "test-model" + + +def _eval_tool_call(should_notify: bool, reason: str = "") -> LLMResponse: + return LLMResponse( + content="", + tool_calls=[ + ToolCallRequest( + id="eval_1", + name="evaluate_notification", + arguments={"should_notify": should_notify, "reason": reason}, + ) + ], + ) + + +@pytest.mark.asyncio +async def test_should_notify_true() -> None: + provider = DummyProvider([_eval_tool_call(True, "user asked to be reminded")]) + result = await evaluate_response("Task completed with results", "check emails", provider, "m") + assert result is True + + +@pytest.mark.asyncio +async def test_should_notify_false() -> None: + provider = DummyProvider([_eval_tool_call(False, "routine check, nothing new")]) + result = await evaluate_response("All clear, no updates", "check status", provider, "m") + assert result is False + + +@pytest.mark.asyncio +async def test_fallback_on_error() -> None: + class FailingProvider(DummyProvider): + async def chat(self, *args, **kwargs) -> LLMResponse: + raise RuntimeError("provider down") + + provider = FailingProvider([]) + result = await evaluate_response("some response", "some task", provider, "m") + assert result is True + + +@pytest.mark.asyncio +async def test_no_tool_call_fallback() -> None: + provider = DummyProvider([LLMResponse(content="I think you should notify", tool_calls=[])]) + result = await evaluate_response("some response", "some task", provider, "m") + assert result is True diff --git a/tests/test_heartbeat_service.py b/tests/test_heartbeat_service.py index 9ce8912..2a6b20e 100644 --- a/tests/test_heartbeat_service.py +++ b/tests/test_heartbeat_service.py @@ -123,6 +123,98 @@ async def test_trigger_now_returns_none_when_decision_is_skip(tmp_path) -> None: assert await service.trigger_now() is None +@pytest.mark.asyncio +async def test_tick_notifies_when_evaluator_says_yes(tmp_path, monkeypatch) -> None: + """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=notify -> on_notify called.""" + (tmp_path / "HEARTBEAT.md").write_text("- [ ] check deployments", encoding="utf-8") + + provider = DummyProvider([ + LLMResponse( + content="", + tool_calls=[ + ToolCallRequest( + id="hb_1", + name="heartbeat", + arguments={"action": "run", "tasks": "check deployments"}, + ) + ], + ), + ]) + + executed: list[str] = [] + notified: list[str] = [] + + async def _on_execute(tasks: str) -> str: + executed.append(tasks) + return "deployment failed on staging" + + async def _on_notify(response: str) -> None: + notified.append(response) + + service = HeartbeatService( + workspace=tmp_path, + provider=provider, + model="openai/gpt-4o-mini", + on_execute=_on_execute, + on_notify=_on_notify, + ) + + async def _eval_notify(*a, **kw): + return True + + monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_notify) + + await service._tick() + assert executed == ["check deployments"] + assert notified == ["deployment failed on staging"] + + +@pytest.mark.asyncio +async def test_tick_suppresses_when_evaluator_says_no(tmp_path, monkeypatch) -> None: + """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=silent -> on_notify NOT called.""" + (tmp_path / "HEARTBEAT.md").write_text("- [ ] check status", encoding="utf-8") + + provider = DummyProvider([ + LLMResponse( + content="", + tool_calls=[ + ToolCallRequest( + id="hb_1", + name="heartbeat", + arguments={"action": "run", "tasks": "check status"}, + ) + ], + ), + ]) + + executed: list[str] = [] + notified: list[str] = [] + + async def _on_execute(tasks: str) -> str: + executed.append(tasks) + return "everything is fine, no issues" + + async def _on_notify(response: str) -> None: + notified.append(response) + + service = HeartbeatService( + workspace=tmp_path, + provider=provider, + model="openai/gpt-4o-mini", + on_execute=_on_execute, + on_notify=_on_notify, + ) + + async def _eval_silent(*a, **kw): + return False + + monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_silent) + + await service._tick() + assert executed == ["check status"] + assert notified == [] + + @pytest.mark.asyncio async def test_decide_retries_transient_error_then_succeeds(tmp_path, monkeypatch) -> None: provider = DummyProvider([