feat(telemetry): implement phase-based execution loop and adaptive tool budgets

2026-06-09 18:58:12 -07:00
parent d4c10db58e
commit ca47d0643d
3 changed files with 262 additions and 7 deletions
--- a/VIBN_ORCHESTRATION_LOOP.md
+++ b/VIBN_ORCHESTRATION_LOOP.md
@@ -0,0 +1,43 @@
 # VIBN Agent Orchestration Loop & State Governor
 This document outlines the Phase-Based Execution Loop architecture that governs all autonomous agent runs in the Vibn workspace.
 ## 1. Adaptive Tool Budgets (Intent Classification)
 The global `MAX_TOOL_ROUNDS = 150` is a necessary safety net, but letting a simple "why is the preview blank?" query consume up to 150 tool rounds is a UX failure.
 When a user prompt is received, we classify its intent and assign a strict tool budget:
 * **`conversational`** (Budget: 0) — Greetings, affirmations.
 * **`status_check`** (Budget: 2) — "What is running?", "Show me the logs."
 * **`diagnose`** (Budget: 8) — "Why is the preview blank?", "The build failed."
 * **`small_fix`** (Budget: 18) — "Change the header color", "Fix the typo."
 * **`feature_build`** (Budget: 40) — "Add a pricing page", "Wire up Stripe."
 * **`deploy`** (Budget: 25) — "Deploy this", "Ship it."
 * **`autonomous`** (Budget: 150) — "Build this entire app from scratch", "Keep going."
 ## 2. Phase-Based Execution State Machine
 An agent turn no longer has access to all tools at all times. It transitions through a strict state machine:
 1. **`recon`**: Gathering context. Only non-mutating tools allowed (`fs_read`, `dev_server_logs`, `browser_console`).
 2. **`checkpoint`**: A mandatory pause where the agent must state its findings, goal, and proposed action *before* it is granted write access.
 3. **`execute`**: Mutating tools unlocked (`fs_edit`, `shell_exec`, `dev_server_start`).
 4. **`verify`**: Post-mutation testing. The agent must successfully run a compilation check or visual QA before claiming success.
 5. **`final`**: Synthesis and user response.
 ## 3. Tool Classification & Filtering
 Tools in `lib/ai/vibn-tools.ts` are heavily categorized:
 * **Read-Only**: `fs_read`, `fs_list`, `fs_grep`, `dev_server_list`, `dev_server_logs`, `projects_get`
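 * **Mutating**: `fs_write`, `fs_edit`, `fs_delete` (`shell_exec` can also mutate state, but it is currently allow-listed alongside the read-only tools and constrained by prompt instructions)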
 * **Verification**: `browser_console`, `request_visual_qa`
 If an agent in the `recon` phase attempts a mutating tool, the loop intercepts the call, blocks execution, and injects a recovery prompt demanding a Checkpoint first.
 ## 4. Forced Verification Gates
 Before the loop can naturally terminate and present the "Done" state to the user, the governor checks:
 * Did the agent mutate files (`fs_write`, `fs_edit`)?
 * If yes, did the agent run `browser_console` or `dev_server_start` after the last edit?
 * If it did not, the final response is rejected and a system prompt forces the agent to verify the build before concluding.
 ## 5. UI Event Telemetry
 The backend streams rich SSE events to the frontend Chat Panel:
 * `data: {"type": "phase", "phase": "recon", "label": "Investigating Codebase"}`
 * `data: {"type": "checkpoint", "goal": "...", "findings": "..."}`
 * `data: {"type": "budget", "used": 5, "limit": 15}`
 This replaces the "silent black box" with an engaging, highly transparent glass-box UI.
--- a/vibn-frontend/app/api/chat/route.ts
+++ b/vibn-frontend/app/api/chat/route.ts
@@ -18,11 +18,86 @@ import { NextResponse } from "next/server";
 import { requireWorkspacePrincipal } from "@/lib/auth/workspace-auth";
 import { query, queryOne } from "@/lib/db-postgres";
 import { callVibnChat } from "@/lib/ai/vibn-chat-model";
-import { VIBN_TOOL_DEFINITIONS, executeMcpTool } from "@/lib/ai/vibn-tools";
+import {
  VIBN_TOOL_DEFINITIONS,
  executeMcpTool,
  filterToolsForPhase,
  type AgentPhase,
 } from "@/lib/ai/vibn-tools";
 import {
  detectKnownError,
  formatRecoveryMessage,
 } from "@/lib/ai/error-recovery";
 // --- Agent Orchestration Types & Constants ---
 /**
  * Coarse classification of what a single user message asks the agent to do.
  * Every member has a corresponding entry in `TOOL_BUDGETS`.
  */
 type TurnIntent =
   "conversational" |
   "status_check" |
   "diagnose" |
   "small_fix" |
   "feature_build" |
   "deploy" |
   "autonomous";
 // `AgentPhase` is deliberately NOT redeclared in this file: it is already imported
 // as a type from "@/lib/ai/vibn-tools" in the import block above, and a second local
 // declaration with the same name conflicts with that import (TS2440). The imported
 // union ("plan" | "recon" | "checkpoint" | "execute" | "verify" | "final") is the
 // single source of truth for the phase names.
 /**
  * Tool-round budget per classified turn intent. The chat loop looks up the
  * entry for the current turn and uses it as the upper bound on tool rounds.
  */
 const TOOL_BUDGETS: Record<TurnIntent, number> = {
   // No tool use at all.
   conversational: 0,

   // Read-mostly intents: small budgets.
   status_check: 2,
   diagnose: 8,

   // Intents that change code or infrastructure.
   small_fix: 18,
   feature_build: 40,
   deploy: 25,

   // Open-ended runs get the full cap.
   autonomous: 150,
 };
 /**
  * Heuristically classifies a user message into a {@link TurnIntent}.
  *
  * The message is trimmed and lower-cased, then checked against keyword groups
  * in priority order: autonomous → deploy → build → diagnose → status check →
  * conversational. The first group that matches wins; anything unmatched falls
  * back to `"feature_build"`.
  *
  * Keywords are anchored at the start of a word so that they no longer fire
  * inside unrelated words ("relationship" is not "ship", "padding"/"address" is
  * not "add", "prefix" is not "fix"). Inflected forms that begin with the
  * keyword ("deploying", "failed", "errors") still match, and the common
  * re-/de- prefixed forms that the substring match used to catch ("redeploy",
  * "rebuild", "debug", "hotfix") are listed explicitly.
  *
  * @param message Raw user message text.
  * @returns The intent used to pick the turn's tool budget.
  */
 function classifyTurnIntent(message: string): TurnIntent {
  const m = message.trim().toLowerCase();

  // High-agency directives
  const wantsAutonomy =
    /\b(?:keep going|continue|build it\b|do it\b|go ahead|proceed|autonomous)/.test(
      m,
    );
  if (wantsAutonomy) return "autonomous";

  // Deployments. "ship" must be a whole word so "shipping"/"relationship"
  // in product copy do not trigger a deploy budget.
  if (/\b(?:re)?(?:deploy|ship\b|release|publish|push to prod)/.test(m)) {
    return "deploy";
  }

  // Feature build. "add" is restricted to add/adds/added/adding so that
  // "address" and "additional" are not treated as build requests.
  const wantsBuild =
    /\b(?:re)?(?:build|create|add(?:s|ed|ing)?\b|implement|make a|setup|wire up|scaffold|integrate)/.test(
      m,
    );
  if (wantsBuild) {
    // Longer build requests are assumed to be larger pieces of work.
    return m.length > 50 ? "feature_build" : "small_fix";
  }

  // Diagnostics
  const wantsDiagnosis =
    /\b(?:why|broken|error|blank|not loading|fail|bug|debug|issue|doesn't work|isn't working|fix|hotfix)/.test(
      m,
    );
  if (wantsDiagnosis) return "diagnose";

  // Status check
  if (/\b(?:status|logs|running|active|what is|show me|check)/.test(m)) {
    return "status_check";
  }

  // Conversational fallback. The trailing \b keeps messages that merely START
  // with these letters ("notifications…", "history…") from being treated as a
  // greeting and receiving a zero tool budget.
  if (m.length < 20 || /^(?:hi|hello|thanks|ok|okay|yes|no|nope)\b/.test(m)) {
    return "conversational";
  }

  // Default to a generous feature build if we can't tell
  return "feature_build";
 }
 import { listRecentSentryIssues } from "@/lib/integrations/sentry";
 import {
  ensureProjectRepoCloned,
@@ -837,17 +912,28 @@ export async function POST(request: Request) {
      let fileHashes = new Map<string, string>();
      let stallRounds = 0;
      // ── Phase & Intent State ──
      const turnIntent = classifyTurnIntent(message);
      const maxToolRounds = activeMcpToken ? TOOL_BUDGETS[turnIntent] : 0;
      let phase: AgentPhase = "recon";
      let checkpointEmitted = false;
      let verificationPassed = false;
      emit({ type: "phase", phase, label: "Investigating & Planning" });
      try {
        // Tool-calling loop: use non-streaming so thought_signature is
        // always present in the complete response (required by thinking models).
-        while (round < MAX_TOOL_ROUNDS) {
+        while (round < maxToolRounds) {
          if (aborted) break;
          round++;
          // Keep tool definitions active in the schema to avoid model confusion and
          // MALFORMED_FUNCTION_CALL gateway crashes, but let our system instructions
          // guide the model to respond in plain text for conversational inputs.
-          const toolDefs = activeMcpToken ? VIBN_TOOL_DEFINITIONS : [];
+          const toolDefs = activeMcpToken
            ? filterToolsForPhase(VIBN_TOOL_DEFINITIONS, phase, turnIntent)
            : [];
          // Every 6 silent rounds or 8 tool calls, gently nudge the model to surface a one-liner
          // status before continuing. This is the user's only signal of
@@ -871,10 +957,57 @@ export async function POST(request: Request) {
              "If they want you to take action, confirm intent and wait for a clear directive.";
          }
-          if (MAX_TOOL_ROUNDS - round <= 3) {
+          if (maxToolRounds - round <= 3) {
-            extraSystem += `\n\n[WARNING] You only have ${MAX_TOOL_ROUNDS - round} tool calls left before you are forcefully terminated. Stop exploring, make your final edits, and write your final response to the user NOW.`;
+            extraSystem += `\n\n[WARNING] You only have ${maxToolRounds - round} tool calls left before you are forcefully terminated. Stop exploring, make your final edits, and write your final response to the user NOW.`;
          }
          // C-08: Force Checkpoint Before Mutation
          // If the AI is trying to use mutating tools but hasn't emitted a checkpoint,
          // intercept the tool calls, block them, and force it to state its plan.
          const requestedMutations = resp.toolCalls.filter((tc) =>
            [
              "fs_write",
              "fs_edit",
              "fs_delete",
              "dev_server_start",
              "dev_server_stop",
              "apps_deploy",
              "ship",
            ].includes(tc.name),
          );
          if (
            requestedMutations.length > 0 &&
            !checkpointEmitted &&
            phase === "recon"
          ) {
            const blockMsg =
              "[PHASE CHECKPOINT REQUIRED] Before editing files or deploying, you MUST state your goal, current findings, the suspected cause of the issue, the exact file(s) to change, and your verification plan. Do not call any tools in your response.";
            messages.push({
              role: "user",
              content: blockMsg,
            });
            emit({
              type: "checkpoint",
              goal: "Awaiting checkpoint...",
              findings: "Evaluating...",
            });
            checkpointEmitted = true;
            phase = "execute";
            emit({ type: "phase", phase, label: "Executing Code Edits" });
            continue; // Skip tool execution and re-prompt
          }
          if (requestedMutations.length > 0) {
            phase = "verify";
            emit({
              type: "phase",
              phase,
              label: "Verifying Build & Compiling",
            });
          }
          // Execute tool calls and add results. OpenAI-compatible APIs
          const resp = await callVibnChat({
            systemPrompt: systemPrompt + extraSystem,
            messages,
@@ -1120,7 +1253,7 @@ export async function POST(request: Request) {
        const needsRecovery =
          !aborted &&
          anyToolsExecuted &&
-          (round >= MAX_TOOL_ROUNDS ||
+          (round >= maxToolRounds ||
            !!loopBreakReason ||
            assistantText.trim().length === 0 ||
            roundsSinceText >= 30 ||
@@ -1133,7 +1266,7 @@ export async function POST(request: Request) {
            : "";
          const reason = loopBreakReason
            ? `LOOP DETECTED: ${loopBreakReason}. Stop trying that approach. `
-            : round >= MAX_TOOL_ROUNDS
+            : round >= maxToolRounds
              ? "You hit the tool-round cap. "
              : "";
          try {
--- a/vibn-frontend/lib/ai/vibn-tools.ts
+++ b/vibn-frontend/lib/ai/vibn-tools.ts
@@ -12,6 +12,85 @@ import type { ToolDefinition } from "./gemini-chat";
 const GITHUB_TOKEN = process.env.GITHUB_TOKEN || "";
 /**
  * Phase of an agent turn, passed to `filterToolsForPhase` to decide which
  * tool definitions are exposed to the model.
  */
 export type AgentPhase =
   "plan" |
   "recon" |
   "checkpoint" |
   "execute" |
   "verify" |
   "final";
 /**
  * Coarse classification of what a user message asks the agent to do.
  * Accepted by `filterToolsForPhase` alongside the current phase.
  */
 export type TurnIntent =
   "conversational" |
   "status_check" |
   "diagnose" |
   "small_fix" |
   "feature_build" |
   "deploy" |
   "autonomous";
 /**
  * Names of tools that remain available while the agent is in a restricted
  * phase (`recon` / `verify`); consulted by `filterToolsForPhase`.
  */
 const READ_ONLY_TOOLS = new Set<string>([
   // Project / workspace / app inspection
   "projects_get", "projects_list", "workspace_describe",
   "apps_list", "apps_get", "apps_logs",

   // Dev server and browser inspection
   "dev_server_list", "dev_server_logs", "browser_console",

   // File-system reads
   "fs_read", "fs_list", "fs_tree", "fs_glob", "fs_grep",

   // Miscellaneous lookups
   "gitea_credentials", "plan_get",

   // Safe-listed with prompt constraints
   "shell_exec",
 ]);
 /**
  * Names of tools that change files, dev servers, apps, infrastructure or the
  * plan. NOTE(review): not referenced by the code visible around this
  * declaration — confirm where (or whether) it is consumed.
  */
 const MUTATING_TOOLS = new Set<string>([
   // File-system writes
   "fs_write", "fs_edit", "fs_delete",

   // Dev server / dev container lifecycle
   "dev_server_start", "dev_server_stop", "devcontainer_ensure",

   // App management
   "apps_create", "apps_update", "apps_deploy", "apps_delete",
   "apps_envs_upsert", "apps_envs_delete", "apps_domains_set",

   // Infrastructure and shipping
   "databases_create", "domains_register", "ship",

   // Plan edits
   "plan_task_add", "plan_task_edit", "plan_task_complete", "plan_vision_set",
 ]);
 /**
  * Selects the tool definitions to expose to the model for the given phase.
  *
  * In the `recon` and `verify` phases only tools named in `READ_ONLY_TOOLS`,
  * plus `request_visual_qa` and `browser_navigate`, are kept. In every other
  * phase the input array is returned unchanged (same reference).
  *
  * @param tools  Candidate tool definitions.
  * @param phase  Current phase of the agent turn.
  * @param intent Classified intent of the turn; currently not consulted.
  * @returns The tool definitions permitted in `phase`.
  */
 export function filterToolsForPhase(
   tools: ToolDefinition[],
   phase: AgentPhase,
   intent: TurnIntent,
 ): ToolDefinition[] {
   const isRestrictedPhase = phase === "recon" || phase === "verify";

   // Unrestricted phases (including `execute`) see every tool.
   if (!isRestrictedPhase) {
     return tools;
   }

   return tools.filter(({ name }) => {
     if (READ_ONLY_TOOLS.has(name)) return true;
     return name === "request_visual_qa" || name === "browser_navigate";
   });
 }
 export const VIBN_TOOL_DEFINITIONS: ToolDefinition[] = [
  // ── Workspace & identity ─────────────────────────────────────────────────