- Add nanobot/utils/evaluator.py: lightweight LLM tool-call to decide notify/silent after background task execution - Remove magic token injection from heartbeat and cron prompts - Clean session history (no more <SILENT_OK> pollution) - Add tests for evaluator and updated heartbeat three-phase flow
93 lines
3.2 KiB
Python
93 lines
3.2 KiB
Python
"""Post-run evaluation for background tasks (heartbeat & cron).
|
|
|
|
After the agent executes a background task, this module makes a lightweight
|
|
LLM call to decide whether the result warrants notifying the user.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from loguru import logger
|
|
|
|
if TYPE_CHECKING:
|
|
from nanobot.providers.base import LLMProvider
|
|
|
|
_EVALUATE_TOOL = [
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "evaluate_notification",
|
|
"description": "Decide whether the user should be notified about this background task result.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"should_notify": {
|
|
"type": "boolean",
|
|
"description": "true = result contains actionable/important info the user should see; false = routine or empty, safe to suppress",
|
|
},
|
|
"reason": {
|
|
"type": "string",
|
|
"description": "One-sentence reason for the decision",
|
|
},
|
|
},
|
|
"required": ["should_notify"],
|
|
},
|
|
},
|
|
}
|
|
]
|
|
|
|
_SYSTEM_PROMPT = (
|
|
"You are a notification gate for a background agent. "
|
|
"You will be given the original task and the agent's response. "
|
|
"Call the evaluate_notification tool to decide whether the user "
|
|
"should be notified.\n\n"
|
|
"Notify when the response contains actionable information, errors, "
|
|
"completed deliverables, or anything the user explicitly asked to "
|
|
"be reminded about.\n\n"
|
|
"Suppress when the response is a routine status check with nothing "
|
|
"new, a confirmation that everything is normal, or essentially empty."
|
|
)
|
|
|
|
|
|
async def evaluate_response(
|
|
response: str,
|
|
task_context: str,
|
|
provider: LLMProvider,
|
|
model: str,
|
|
) -> bool:
|
|
"""Decide whether a background-task result should be delivered to the user.
|
|
|
|
Uses a lightweight tool-call LLM request (same pattern as heartbeat
|
|
``_decide()``). Falls back to ``True`` (notify) on any failure so
|
|
that important messages are never silently dropped.
|
|
"""
|
|
try:
|
|
llm_response = await provider.chat_with_retry(
|
|
messages=[
|
|
{"role": "system", "content": _SYSTEM_PROMPT},
|
|
{"role": "user", "content": (
|
|
f"## Original task\n{task_context}\n\n"
|
|
f"## Agent response\n{response}"
|
|
)},
|
|
],
|
|
tools=_EVALUATE_TOOL,
|
|
model=model,
|
|
max_tokens=256,
|
|
temperature=0.0,
|
|
)
|
|
|
|
if not llm_response.has_tool_calls:
|
|
logger.warning("evaluate_response: no tool call returned, defaulting to notify")
|
|
return True
|
|
|
|
args = llm_response.tool_calls[0].arguments
|
|
should_notify = args.get("should_notify", True)
|
|
reason = args.get("reason", "")
|
|
logger.info("evaluate_response: should_notify={}, reason={}", should_notify, reason)
|
|
return bool(should_notify)
|
|
|
|
except Exception:
|
|
logger.exception("evaluate_response failed, defaulting to notify")
|
|
return True
|