refactor: replace <SILENT_OK> with structured post-run evaluation
- Add nanobot/utils/evaluator.py: a lightweight LLM tool call that decides notify/silent after a background task runs
- Remove magic-token injection from heartbeat and cron prompts
- Keep session history clean (no more <SILENT_OK> pollution)
- Add tests for the evaluator and the updated three-phase heartbeat flow
This commit is contained in:
@@ -448,14 +448,14 @@ def gateway(
|
|||||||
"""Execute a cron job through the agent."""
|
"""Execute a cron job through the agent."""
|
||||||
from nanobot.agent.tools.cron import CronTool
|
from nanobot.agent.tools.cron import CronTool
|
||||||
from nanobot.agent.tools.message import MessageTool
|
from nanobot.agent.tools.message import MessageTool
|
||||||
|
from nanobot.utils.evaluator import evaluate_response
|
||||||
|
|
||||||
reminder_note = (
|
reminder_note = (
|
||||||
"[Scheduled Task] Timer finished.\n\n"
|
"[Scheduled Task] Timer finished.\n\n"
|
||||||
f"Task '{job.name}' has been triggered.\n"
|
f"Task '{job.name}' has been triggered.\n"
|
||||||
f"Scheduled instruction: {job.payload.message}"
|
f"Scheduled instruction: {job.payload.message}"
|
||||||
"**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Prevent the agent from scheduling new cron jobs during execution
|
|
||||||
cron_tool = agent.tools.get("cron")
|
cron_tool = agent.tools.get("cron")
|
||||||
cron_token = None
|
cron_token = None
|
||||||
if isinstance(cron_tool, CronTool):
|
if isinstance(cron_tool, CronTool):
|
||||||
@@ -475,13 +475,17 @@ def gateway(
|
|||||||
if isinstance(message_tool, MessageTool) and message_tool._sent_in_turn:
|
if isinstance(message_tool, MessageTool) and message_tool._sent_in_turn:
|
||||||
return response
|
return response
|
||||||
|
|
||||||
if job.payload.deliver and job.payload.to and response and "<SILENT_OK>" not in response:
|
if job.payload.deliver and job.payload.to and response:
|
||||||
from nanobot.bus.events import OutboundMessage
|
should_notify = await evaluate_response(
|
||||||
await bus.publish_outbound(OutboundMessage(
|
response, job.payload.message, provider, agent.model,
|
||||||
channel=job.payload.channel or "cli",
|
)
|
||||||
chat_id=job.payload.to,
|
if should_notify:
|
||||||
content=response
|
from nanobot.bus.events import OutboundMessage
|
||||||
))
|
await bus.publish_outbound(OutboundMessage(
|
||||||
|
channel=job.payload.channel or "cli",
|
||||||
|
chat_id=job.payload.to,
|
||||||
|
content=response,
|
||||||
|
))
|
||||||
return response
|
return response
|
||||||
cron.on_job = on_cron_job
|
cron.on_job = on_cron_job
|
||||||
|
|
||||||
|
|||||||
@@ -139,6 +139,8 @@ class HeartbeatService:
|
|||||||
|
|
||||||
async def _tick(self) -> None:
|
async def _tick(self) -> None:
|
||||||
"""Execute a single heartbeat tick."""
|
"""Execute a single heartbeat tick."""
|
||||||
|
from nanobot.utils.evaluator import evaluate_response
|
||||||
|
|
||||||
content = self._read_heartbeat_file()
|
content = self._read_heartbeat_file()
|
||||||
if not content:
|
if not content:
|
||||||
logger.debug("Heartbeat: HEARTBEAT.md missing or empty")
|
logger.debug("Heartbeat: HEARTBEAT.md missing or empty")
|
||||||
@@ -153,18 +155,19 @@ class HeartbeatService:
|
|||||||
logger.info("Heartbeat: OK (nothing to report)")
|
logger.info("Heartbeat: OK (nothing to report)")
|
||||||
return
|
return
|
||||||
|
|
||||||
taskmessage = tasks + "\n\n**IMPORTANT NOTICE:** If there is nothing material to report, reply only with <SILENT_OK>."
|
|
||||||
|
|
||||||
logger.info("Heartbeat: tasks found, executing...")
|
logger.info("Heartbeat: tasks found, executing...")
|
||||||
if self.on_execute:
|
if self.on_execute:
|
||||||
response = await self.on_execute(taskmessage)
|
response = await self.on_execute(tasks)
|
||||||
|
|
||||||
if response and "<SILENT_OK>" in response:
|
if response:
|
||||||
logger.info("Heartbeat: OK (silenced by agent)")
|
should_notify = await evaluate_response(
|
||||||
return
|
response, tasks, self.provider, self.model,
|
||||||
if response and self.on_notify:
|
)
|
||||||
logger.info("Heartbeat: completed, delivering response")
|
if should_notify and self.on_notify:
|
||||||
await self.on_notify(response)
|
logger.info("Heartbeat: completed, delivering response")
|
||||||
|
await self.on_notify(response)
|
||||||
|
else:
|
||||||
|
logger.info("Heartbeat: silenced by post-run evaluation")
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Heartbeat execution failed")
|
logger.exception("Heartbeat execution failed")
|
||||||
|
|
||||||
|
|||||||
92
nanobot/utils/evaluator.py
Normal file
92
nanobot/utils/evaluator.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""Post-run evaluation for background tasks (heartbeat & cron).
|
||||||
|
|
||||||
|
After the agent executes a background task, this module makes a lightweight
|
||||||
|
LLM call to decide whether the result warrants notifying the user.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from nanobot.providers.base import LLMProvider
|
||||||
|
|
||||||
|
# OpenAI-style tool schema offered to the gate model so the notify/suppress
# decision comes back as structured arguments (not free text that would need
# parsing). Only `should_notify` is required; `reason` is for logging.
_EVALUATE_TOOL = [
    {
        "type": "function",
        "function": {
            "name": "evaluate_notification",
            "description": "Decide whether the user should be notified about this background task result.",
            "parameters": {
                "type": "object",
                "properties": {
                    "should_notify": {
                        "type": "boolean",
                        "description": "true = result contains actionable/important info the user should see; false = routine or empty, safe to suppress",
                    },
                    "reason": {
                        "type": "string",
                        "description": "One-sentence reason for the decision",
                    },
                },
                "required": ["should_notify"],
            },
        },
    }
]
|
||||||
|
|
||||||
|
# System prompt for the gate model: given the original task and the agent's
# response, it must call evaluate_notification to choose notify vs. suppress.
_SYSTEM_PROMPT = (
    "You are a notification gate for a background agent. "
    "You will be given the original task and the agent's response. "
    "Call the evaluate_notification tool to decide whether the user "
    "should be notified.\n\n"
    "Notify when the response contains actionable information, errors, "
    "completed deliverables, or anything the user explicitly asked to "
    "be reminded about.\n\n"
    "Suppress when the response is a routine status check with nothing "
    "new, a confirmation that everything is normal, or essentially empty."
)
|
||||||
|
|
||||||
|
|
||||||
|
async def evaluate_response(
    response: str,
    task_context: str,
    provider: LLMProvider,
    model: str,
) -> bool:
    """Decide whether a background-task result should be delivered to the user.

    Uses a lightweight tool-call LLM request (same pattern as heartbeat
    ``_decide()``). Falls back to ``True`` (notify) on any failure so
    that important messages are never silently dropped.

    Args:
        response: The agent's output from the background task run.
        task_context: The original task/instruction given to the agent.
        provider: LLM provider used for the evaluation call.
        model: Model identifier to evaluate with.

    Returns:
        True when the user should be notified, False when the result is
        routine and safe to suppress.
    """
    try:
        llm_response = await provider.chat_with_retry(
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": (
                    f"## Original task\n{task_context}\n\n"
                    f"## Agent response\n{response}"
                )},
            ],
            tools=_EVALUATE_TOOL,
            model=model,
            max_tokens=256,
            temperature=0.0,
        )

        if not llm_response.has_tool_calls:
            logger.warning("evaluate_response: no tool call returned, defaulting to notify")
            return True

        call = llm_response.tool_calls[0]
        # Guard against the model invoking some other tool: treating a
        # stranger's arguments as the gate decision would be wrong, so fail
        # open (notify) just like the no-tool-call case.
        if call.name != "evaluate_notification":
            logger.warning(
                "evaluate_response: unexpected tool call {!r}, defaulting to notify",
                call.name,
            )
            return True

        args = call.arguments
        # Missing should_notify defaults to True: never drop a message on a
        # malformed decision.
        should_notify = args.get("should_notify", True)
        reason = args.get("reason", "")
        logger.info("evaluate_response: should_notify={}, reason={}", should_notify, reason)
        return bool(should_notify)

    except Exception:
        # Fail open on any provider/parsing error: notifying too often is
        # recoverable, silently dropping an important result is not.
        logger.exception("evaluate_response failed, defaulting to notify")
        return True
|
||||||
63
tests/test_evaluator.py
Normal file
63
tests/test_evaluator.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from nanobot.utils.evaluator import evaluate_response
|
||||||
|
from nanobot.providers.base import LLMProvider, LLMResponse, ToolCallRequest
|
||||||
|
|
||||||
|
|
||||||
|
class DummyProvider(LLMProvider):
    """Scripted LLMProvider stub: replays queued responses in FIFO order."""

    def __init__(self, responses: list[LLMResponse]):
        super().__init__()
        # Copy so the caller's list is not mutated as responses are consumed.
        self._responses = list(responses)

    async def chat(self, *args, **kwargs) -> LLMResponse:
        # Pop the next scripted response; once the queue is exhausted, keep
        # returning an empty reply instead of raising.
        if self._responses:
            return self._responses.pop(0)
        return LLMResponse(content="", tool_calls=[])

    def get_default_model(self) -> str:
        return "test-model"
|
||||||
|
|
||||||
|
|
||||||
|
def _eval_tool_call(should_notify: bool, reason: str = "") -> LLMResponse:
    """Build an LLMResponse carrying one evaluate_notification tool call."""
    decision = {"should_notify": should_notify, "reason": reason}
    call = ToolCallRequest(
        id="eval_1",
        name="evaluate_notification",
        arguments=decision,
    )
    return LLMResponse(content="", tool_calls=[call])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_should_notify_true() -> None:
    """Evaluator returns True when the model's tool call says should_notify."""
    provider = DummyProvider([_eval_tool_call(True, "user asked to be reminded")])
    result = await evaluate_response("Task completed with results", "check emails", provider, "m")
    assert result is True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_should_notify_false() -> None:
    """Evaluator returns False when the model's tool call suppresses."""
    provider = DummyProvider([_eval_tool_call(False, "routine check, nothing new")])
    result = await evaluate_response("All clear, no updates", "check status", provider, "m")
    assert result is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_fallback_on_error() -> None:
    """Evaluator fails open (notify) when the provider raises."""
    class FailingProvider(DummyProvider):
        async def chat(self, *args, **kwargs) -> LLMResponse:
            raise RuntimeError("provider down")

    provider = FailingProvider([])
    result = await evaluate_response("some response", "some task", provider, "m")
    assert result is True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_no_tool_call_fallback() -> None:
    """Evaluator fails open (notify) when the model replies in prose, not a tool call."""
    provider = DummyProvider([LLMResponse(content="I think you should notify", tool_calls=[])])
    result = await evaluate_response("some response", "some task", provider, "m")
    assert result is True
|
||||||
@@ -123,6 +123,98 @@ async def test_trigger_now_returns_none_when_decision_is_skip(tmp_path) -> None:
|
|||||||
assert await service.trigger_now() is None
|
assert await service.trigger_now() is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_tick_notifies_when_evaluator_says_yes(tmp_path, monkeypatch) -> None:
    """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=notify -> on_notify called."""
    (tmp_path / "HEARTBEAT.md").write_text("- [ ] check deployments", encoding="utf-8")

    # Scripted provider reply: one `heartbeat run` tool call — presumably
    # consumed by the service's decide phase; confirm against HeartbeatService.
    provider = DummyProvider([
        LLMResponse(
            content="",
            tool_calls=[
                ToolCallRequest(
                    id="hb_1",
                    name="heartbeat",
                    arguments={"action": "run", "tasks": "check deployments"},
                )
            ],
        ),
    ])

    # Capture what the service executes and what it delivers.
    executed: list[str] = []
    notified: list[str] = []

    async def _on_execute(tasks: str) -> str:
        executed.append(tasks)
        return "deployment failed on staging"

    async def _on_notify(response: str) -> None:
        notified.append(response)

    service = HeartbeatService(
        workspace=tmp_path,
        provider=provider,
        model="openai/gpt-4o-mini",
        on_execute=_on_execute,
        on_notify=_on_notify,
    )

    # Stubbed Phase-3 evaluator: always notify.
    async def _eval_notify(*a, **kw):
        return True

    # NOTE(review): patching the module attribute only takes effect because
    # _tick() imports evaluate_response lazily at call time — confirm the
    # lazy import is kept; a top-level import in heartbeat would bypass this.
    monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_notify)

    await service._tick()
    assert executed == ["check deployments"]
    assert notified == ["deployment failed on staging"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_tick_suppresses_when_evaluator_says_no(tmp_path, monkeypatch) -> None:
    """Phase 1 run -> Phase 2 execute -> Phase 3 evaluate=silent -> on_notify NOT called."""
    (tmp_path / "HEARTBEAT.md").write_text("- [ ] check status", encoding="utf-8")

    # Scripted provider reply: one `heartbeat run` tool call — presumably
    # consumed by the service's decide phase; confirm against HeartbeatService.
    provider = DummyProvider([
        LLMResponse(
            content="",
            tool_calls=[
                ToolCallRequest(
                    id="hb_1",
                    name="heartbeat",
                    arguments={"action": "run", "tasks": "check status"},
                )
            ],
        ),
    ])

    # Capture what the service executes and what it delivers.
    executed: list[str] = []
    notified: list[str] = []

    async def _on_execute(tasks: str) -> str:
        executed.append(tasks)
        return "everything is fine, no issues"

    async def _on_notify(response: str) -> None:
        notified.append(response)

    service = HeartbeatService(
        workspace=tmp_path,
        provider=provider,
        model="openai/gpt-4o-mini",
        on_execute=_on_execute,
        on_notify=_on_notify,
    )

    # Stubbed Phase-3 evaluator: always suppress.
    async def _eval_silent(*a, **kw):
        return False

    # NOTE(review): relies on _tick()'s lazy import of evaluate_response —
    # see the companion notify test for details.
    monkeypatch.setattr("nanobot.utils.evaluator.evaluate_response", _eval_silent)

    await service._tick()
    assert executed == ["check status"]
    assert notified == []
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_decide_retries_transient_error_then_succeeds(tmp_path, monkeypatch) -> None:
|
async def test_decide_retries_transient_error_then_succeeds(tmp_path, monkeypatch) -> None:
|
||||||
provider = DummyProvider([
|
provider = DummyProvider([
|
||||||
|
|||||||
Reference in New Issue
Block a user