From 411b059dd22884ba7b54d6c8c00bcc4add95bfb0 Mon Sep 17 00:00:00 2001
From: Xubin Ren <xubinrencs@gmail.com>
Date: Sat, 14 Mar 2026 09:29:56 +0000
Subject: [PATCH] refactor: replace <SILENT_OK> with structured post-run
 evaluation

- Add nanobot/utils/evaluator.py: lightweight LLM tool-call to decide notify/silent after background task execution
- Remove magic token injection from heartbeat and cron prompts
- Clean session history (no more <SILENT_OK> pollution)
- Add tests for evaluator and updated heartbeat three-phase flow
---
 nanobot/cli/commands.py         | 22 ++++----
 nanobot/heartbeat/service.py    | 21 ++++----
 nanobot/utils/evaluator.py      | 92 +++++++++++++++++++++++++++++++++
 tests/test_evaluator.py         | 63 ++++++++++++++++++++++
 tests/test_heartbeat_service.py | 92 +++++++++++++++++++++++++++++++++
 5 files changed, 272 insertions(+), 18 deletions(-)
 create mode 100644 nanobot/utils/evaluator.py
 create mode 100644 tests/test_evaluator.py

diff --git a/nanobot/cli/commands.py b/nanobot/cli/commands.py
index c4aa868..d8aa411 100644
--- a/nanobot/cli/commands.py
+++ b/nanobot/cli/commands.py
@@ -448,14 +448,14 @@ def gateway(
         """Execute a cron job through the agent."""
         from nanobot.agent.tools.cron import CronTool
         from nanobot.agent.tools.message import MessageTool
+        from nanobot.utils.evaluator import evaluate_response
+
         reminder_note = (
             "[Scheduled Task] Timer finished.\n\n"
             f"Task '{job.name}' has been triggered.\n"
             f"Scheduled instruction: {job.payload.message}"
-            "**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
         )
 
-        # Prevent the agent from scheduling new cron jobs during execution
         cron_tool = agent.tools.get("cron")
         cron_token = None
         if isinstance(cron_tool, CronTool):
@@ -475,13 +475,17 @@ def gateway(
         if isinstance(message_tool, MessageTool) and message_tool._sent_in_turn:
             return response
 
-        if job.payload.deliver and job.payload.to and response and "<SILENT_OK>" not in response:
-            from nanobot.bus.events import OutboundMessage
-            await bus.publish_outbound(OutboundMessage(
-                channel=job.payload.channel or "cli",
-                chat_id=job.payload.to,
-                content=response
-            ))
+        if job.payload.deliver and job.payload.to and response:
+            should_notify = await evaluate_response(
+                response, job.payload.message, provider, agent.model,
+            )
+            if should_notify:
+                from nanobot.bus.events import OutboundMessage
+                await bus.publish_outbound(OutboundMessage(
+                    channel=job.payload.channel or "cli",
+                    chat_id=job.payload.to,
+                    content=response,
+                ))
         return response
     cron.on_job = on_cron_job
 
diff --git a/nanobot/heartbeat/service.py b/nanobot/heartbeat/service.py
index 916c813..2242802 100644
--- a/nanobot/heartbeat/service.py
+++ b/nanobot/heartbeat/service.py
@@ -139,6 +139,8 @@ class HeartbeatService:
 
     async def _tick(self) -> None:
         """Execute a single heartbeat tick."""
+        from nanobot.utils.evaluator import evaluate_response
+
         content = self._read_heartbeat_file()
         if not content:
             logger.debug("Heartbeat: HEARTBEAT.md missing or empty")
@@ -153,18 +155,19 @@ class HeartbeatService:
                 logger.info("Heartbeat: OK (nothing to report)")
                 return
 
-            taskmessage = tasks + "\n\n**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
-
             logger.info("Heartbeat: tasks found, executing...")
             if self.on_execute:
-                response = await self.on_execute(taskmessage)
+                response = await self.on_execute(tasks)
 
-                if response and "<SILENT_OK>" in response:
-                    logger.info("Heartbeat: OK (silenced by agent)")
-                    return
-                if response and self.on_notify:
-                    logger.info("Heartbeat: completed, delivering response")
-                    await self.on_notify(response)
+                if response:
+                    should_notify = await evaluate_response(
+                        response, tasks, self.provider, self.model,
+                    )
+                    if should_notify and self.on_notify:
+                        logger.info("Heartbeat: completed, delivering response")
+                        await self.on_notify(response)
+                    else:
+                        logger.info("Heartbeat: silenced by post-run evaluation")
         except Exception:
             logger.exception("Heartbeat execution failed")
 
diff --git a/nanobot/utils/evaluator.py b/nanobot/utils/evaluator.py
new file mode 100644
index 0000000..6110471
--- /dev/null
+++ b/nanobot/utils/evaluator.py
@@ -0,0 +1,92 @@
+"""Post-run evaluation for background tasks (heartbeat & cron).
+
+After the agent executes a background task, this module makes a lightweight
+LLM call to decide whether the result warrants notifying the user.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from loguru import logger
+
+if TYPE_CHECKING:
+    from nanobot.providers.base import LLMProvider
+
+_EVALUATE_TOOL = [
+    {
+        "type": "function",
+        "function": {
+            "name": "evaluate_notification",
+            "description": "Decide whether the user should be notified about this background task result.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "should_notify": {
+                        "type": "boolean",
+                        "description": "true = result contains actionable/important info the user should see; false = routine or empty, safe to suppress",
+                    },
+                    "reason": {
+                        "type": "string",
+                        "description": "One-sentence reason for the decision",
+                    },
+                },
+                "required": ["should_notify"],
+            },
+        },
+    }
+]
+
+_SYSTEM_PROMPT = (
+    "You are a notification gate for a background agent. "
+    "You will be given the original task and the agent's response. "
+    "Call the evaluate_notification tool to decide whether the user "
+    "should be notified.\n\n"
+    "Notify when the response contains actionable information, errors, "
+    "completed deliverables, or anything the user explicitly asked to "
+    "be reminded about.\n\n"
+    "Suppress when the response is a routine status check with nothing "
+    "new, a confirmation that everything is normal, or essentially empty."
+)
+
+
+async def evaluate_response(
+    response: str,
+    task_context: str,
+    provider: LLMProvider,
+    model: str,
+) -> bool:
+    """Decide whether a background-task result should be delivered to the user.
+
+    Uses a lightweight tool-call LLM request (same pattern as heartbeat
+    ``_decide()``).  Falls back to ``True`` (notify) on any failure so
+    that important messages are never silently dropped.
+    """
+    try:
+        llm_response = await provider.chat_with_retry(
+            messages=[
+                {"role": "system", "content": _SYSTEM_PROMPT},
+                {"role": "user", "content": (
+                    f"## Original task\n{task_context}\n\n"
+                    f"## Agent response\n{response}"
+                )},
+            ],
+            tools=_EVALUATE_TOOL,
+            model=model,
+            max_tokens=256,
+            temperature=0.0,
+        )
+
+        if not llm_response.has_tool_calls:
+            logger.warning("evaluate_response: no tool call returned, defaulting to notify")
+            return True
+
+        args = llm_response.tool_calls[0].arguments
+        should_notify = args.get("should_notify", True)
+        reason = args.get("reason", "")
+        logger.info("evaluate_response: should_notify={}, reason={}", should_notify, reason)
+        return bool(should_notify)
+
+    except Exception:
+        logger.exception("evaluate_response failed, defaulting to notify")
+        return True
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
new file mode 100644
index 0000000..08d068b
--- /dev/null
+++ b/tests/test_evaluator.py
@@ -0,0 +1,63 @@
+import pytest
+
+from nanobot.utils.evaluator import evaluate_response
+from nanobot.providers.base import LLMProvider, LLMResponse, ToolCallRequest
+
+
+class DummyProvider(LLMProvider):
+    def __init__(self, responses: list[LLMResponse]):
+        super().__init__()
+        self._responses = list(responses)
+
+    async def chat(self, *args, **kwargs) -> LLMResponse:
+        if self._responses:
+            return self._responses.pop(0)
+        return LLMResponse(content="", tool_calls=[])
+
+    def get_default_model(self) -> str:
+        return "test-model"
+
+
+def _eval_tool_call(should_notify: bool, reason: str = "") -> LLMResponse:
+    return LLMResponse(
+        content="",
+        tool_calls=[
+            ToolCallRequest(
+                id="eval_1",
+                name="evaluate_notification",
+                arguments={"should_notify": should_notify, "reason": reason},
+            )
+        ],
+    )
+
+
+@pytest.mark.asyncio
+async def test_should_notify_true() -> None:
+    provider = DummyProvider([_eval_tool_call(True, "user asked to be reminded")])
+    result = await evaluate_response("Task completed with results", "check emails", provider, "m")
+    assert result is True
+
+
+@pytest.mark.asyncio
+async def test_should_notify_false() -> None:
+    provider = DummyProvider([_eval_tool_call(False, "routine check, nothing new")])
+    result = await evaluate_response("All clear, no updates", "check status", provider, "m")
+    assert result is False
+
+
+@pytest.mark.asyncio
+async def test_fallback_on_error() -> None:
+    class FailingProvider(DummyProvider):
+        async def chat(self, *args, **kwargs) -> LLMResponse:
+            raise RuntimeError("provider down")
+
+    provider = FailingProvider([])
+    result = await evaluate_response("some response", "some task", provider, "m")
+    assert result is True
+
+
+@pytest.mark.asyncio
+async def test_no_tool_call_fallback() -> None:
+    provider = DummyProvider([LLMResponse(content="I think you should notify", tool_calls=[])])
+    result = await evaluate_response("some response", "some task", provider, "m")
+    assert result is True
diff --git a/tests/test_heartbeat_service.py b/tests/test_heartbeat_service.py
index 9ce8912..2a6b20e 100644
--- a/tests/test_heartbeat_service.py
+++ b/tests/test_heartbeat_service.py
@@ -123,6 +123,98 @@ async def test_trigger_now_returns_none_when_decision_is_skip(tmp_path) -> None:
     assert await service.trigger_now() is None
 
 
+@pytest.mark.asyncio
+async def test_tick_notifies_when_evaluator_says_yes(tmp_path, monkeypatch) -> None:
+    """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=notify -> on_notify called."""
+    (tmp_path / "HEARTBEAT.md").write_text("- [ ] check deployments", encoding="utf-8")
+
+    provider = DummyProvider([
+        LLMResponse(
+            content="",
+            tool_calls=[
+                ToolCallRequest(
+                    id="hb_1",
+                    name="heartbeat",
+                    arguments={"action": "run", "tasks": "check deployments"},
+                )
+            ],
+        ),
+    ])
+
+    executed: list[str] = []
+    notified: list[str] = []
+
+    async def _on_execute(tasks: str) -> str:
+        executed.append(tasks)
+        return "deployment failed on staging"
+
+    async def _on_notify(response: str) -> None:
+        notified.append(response)
+
+    service = HeartbeatService(
+        workspace=tmp_path,
+        provider=provider,
+        model="openai/gpt-4o-mini",
+        on_execute=_on_execute,
+        on_notify=_on_notify,
+    )
+
+    async def _eval_notify(*a, **kw):
+        return True
+
+    monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_notify)
+
+    await service._tick()
+    assert executed == ["check deployments"]
+    assert notified == ["deployment failed on staging"]
+
+
+@pytest.mark.asyncio
+async def test_tick_suppresses_when_evaluator_says_no(tmp_path, monkeypatch) -> None:
+    """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=silent -> on_notify NOT called."""
+    (tmp_path / "HEARTBEAT.md").write_text("- [ ] check status", encoding="utf-8")
+
+    provider = DummyProvider([
+        LLMResponse(
+            content="",
+            tool_calls=[
+                ToolCallRequest(
+                    id="hb_1",
+                    name="heartbeat",
+                    arguments={"action": "run", "tasks": "check status"},
+                )
+            ],
+        ),
+    ])
+
+    executed: list[str] = []
+    notified: list[str] = []
+
+    async def _on_execute(tasks: str) -> str:
+        executed.append(tasks)
+        return "everything is fine, no issues"
+
+    async def _on_notify(response: str) -> None:
+        notified.append(response)
+
+    service = HeartbeatService(
+        workspace=tmp_path,
+        provider=provider,
+        model="openai/gpt-4o-mini",
+        on_execute=_on_execute,
+        on_notify=_on_notify,
+    )
+
+    async def _eval_silent(*a, **kw):
+        return False
+
+    monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_silent)
+
+    await service._tick()
+    assert executed == ["check status"]
+    assert notified == []
+
+
 @pytest.mark.asyncio
 async def test_decide_retries_transient_error_then_succeeds(tmp_path, monkeypatch) -> None:
     provider = DummyProvider([