feat(verification): acceptance-check layer + executor fix-loop; hide phase-checkpoint walls; guaranteed turn-end summary. Verification gated behind VIBN_VERIFICATION_ENABLED.

2026-06-10 19:43:36 -07:00
parent 46291becd3
commit 39cb9194a5
9 changed files with 1263 additions and 31 deletions
--- a/vibn-frontend/app/api/chat/route.ts
+++ b/vibn-frontend/app/api/chat/route.ts
@@ -28,6 +28,14 @@ import {
  detectKnownError,
  formatRecoveryMessage,
 } from "@/lib/ai/error-recovery";
+import {
+  executeTask,
+  runVerificationContract,
+  type ExecCtx,
+  type ExecuteTaskOutcome,
+  type ToolExecutor,
+  type VerificationTask,
+} from "@/lib/ai/verification";

 // --- Agent Orchestration Types & Constants ---
 type TurnIntent =
@@ -668,6 +676,35 @@ function buildHealthStatus(opts: {
  );
 }

+// Find the most recent preview URL reported in a tool result so the verification layer can run
+// console/route checks against the running app. Returns undefined when no tool message carries one.
+function extractPreviewUrl(messages: ChatMessage[]): string | undefined {
+  for (let i = messages.length - 1; i >= 0; i--) { // walk newest message first
+    const m = messages[i];
+    if (m.role !== "tool" || typeof m.content !== "string") continue; // only string-bodied tool results
+    if (!m.content.includes("preview")) continue; // cheap case-sensitive pre-filter before any parsing
+    try {
+      const p = JSON.parse(m.content) as Record<string, unknown>;
+      if (typeof p.previewUrl === "string") return p.previewUrl; // top-level field takes priority
+      if (typeof p.stdout === "string") { // stdout may itself be a JSON payload
+        try {
+          const inner = JSON.parse(p.stdout) as Record<string, unknown>;
+          if (typeof inner.previewUrl === "string") return inner.previewUrl;
+        } catch {
+          /* stdout is not JSON — fall through to the raw-text regex scan below */
+        }
+      }
+    } catch {
+      /* content is not valid JSON — fall through to the raw-text regex scan below */
+    }
+    const match = m.content.match( // last resort: first https preview host in the raw text
+      /https:\/\/[a-z0-9-]+\.preview\.vibnai\.com/i, // scheme + host only; any path is not captured
+    );
+    if (match) return match[0];
+  }
+  return undefined;
+}
+
 export async function POST(request: Request) {
  await ensureChatTables();

@@ -1067,6 +1104,11 @@ export async function POST(request: Request) {
      let phase: AgentPhase = "recon";
      let checkpointEmitted = false;
      let verificationPassed = false;
+      // When C-08 forces a "Phase Checkpoint" before a mutation, the model's
+      // next reply is that internal planning block. We route it to the
+      // (hidden) thinking channel instead of showing the user a wall of
+      // Goal/Findings/Suspected-Cause text.
+      let suppressNextTextAsCheckpoint = false;

      // ── Server-side conversational guard (C-03 enforcement) ───────────
      // If the user's message looks conversational we withhold tools for
@@ -1100,6 +1142,62 @@ export async function POST(request: Request) {
      let fileHashes = new Map<string, string>();
      let stallRounds = 0;

+      // Compact corrective executor for the verification fix-loop: runs at most `n` model rounds, stopping
+      // early on abort or when the model requests no tools. Model text is emitted and appended to
+      // assistantText; each tool call runs via executeMcpTool and its result is pushed onto `messages`.
+      async function runFixRounds(n: number) {
+        for (let i = 0; i < n; i++) {
+          if (aborted) break; // stop between rounds once the turn is aborted
+          const fixTools = activeMcpToken
+            ? filterToolsForPhase(VIBN_TOOL_DEFINITIONS, "execute", turnIntent)
+            : []; // without an MCP token the model is offered no tools
+          const r = await callVibnChat({
+            systemPrompt,
+            messages,
+            tools: fixTools,
+            temperature: 0.4,
+            includeThoughts: true,
+          });
+          if (r.text) { // surface any narrative text from this fix round to the user
+            assistantText += (assistantText ? "\n\n" : "") + r.text;
+            assistantTextSegments.push(r.text);
+            emit({ type: "text", text: r.text });
+          }
+          messages.push({ // record the assistant turn (with all its tool calls) before executing any
+            role: "assistant",
+            content: r.text,
+            toolCalls: r.toolCalls.length ? r.toolCalls : undefined,
+          });
+          if (!r.toolCalls.length) break; // no tool calls requested → the fix loop ends here
+          for (const tc of r.toolCalls) {
+            if (aborted) break; // NOTE(review): remaining tool calls then have no matching tool message — confirm that is tolerated
+            assistantToolCalls.push(tc);
+            emit({ type: "tool_start", name: tc.name, args: tc.args });
+            const result = activeMcpToken
+              ? await executeMcpTool(
+                  tc.name,
+                  tc.args,
+                  activeMcpToken,
+                  baseUrl,
+                  activeProject?.id,
+                )
+              : JSON.stringify({ error: "No MCP token" }); // synthetic error result when no token is available
+            emit({
+              type: "tool_result",
+              name: tc.name,
+              result: result.slice(0, 500), // the emitted event carries only the first 500 chars
+            });
+            messages.push({
+              role: "tool",
+              content: result, // the untruncated result goes into the conversation history
+              toolCallId: tc.id,
+              toolName: tc.name,
+              thoughtSignature: tc.thoughtSignature,
+            });
+          }
+        }
+      }
+
      emit({ type: "phase", phase, label: "Investigating & Planning" });

      try {
@@ -1182,6 +1280,7 @@ export async function POST(request: Request) {
              findings: "Evaluating...",
            });
            checkpointEmitted = true;
+            suppressNextTextAsCheckpoint = true;
            phase = "execute";
            emit({ type: "phase", phase, label: "Executing Code Edits" });
            continue; // Skip tool execution and re-prompt
@@ -1202,8 +1301,14 @@ export async function POST(request: Request) {
            return;
          }

-          // Stream user-facing text to client
-          if (resp.text) {
+          // Stream user-facing text to client.
+          // If this round's text is the forced Phase Checkpoint, route it to
+          // the hidden thinking channel and DON'T add it to the user-facing
+          // message (so it never shows live or in the persisted thread).
+          if (resp.text && suppressNextTextAsCheckpoint) {
+            emit({ type: "thinking", text: resp.text });
+            suppressNextTextAsCheckpoint = false;
+          } else if (resp.text) {
            assistantText += (assistantText ? "\n\n" : "") + resp.text;
            assistantTextSegments.push(resp.text);
            emit({ type: "text", text: resp.text });
@@ -1420,6 +1525,82 @@ export async function POST(request: Request) {
          emit({ type: "aborted" });
        }

+        // ── Acceptance verification + corrective fix-loop (flag-gated) ──
+        // After a turn that mutated code, run the verification contract
+        // (baseline: build + server_up + console_clean). If it fails, feed the
+        // concrete failures back and let the model fix — iterating until green,
+        // stuck, or out of attempts. Off by default; enable per-environment
+        // with VIBN_VERIFICATION_ENABLED=1 for the live smoke test.
+        let verificationOutcome: ExecuteTaskOutcome | null = null;
+        const MUTATION_TOOLS = [
+          "fs_write",
+          "fs_edit",
+          "fs_delete",
+          "apps_deploy",
+          "ship",
+        ];
+        const mutated = assistantToolCalls.some((tc) =>
+          MUTATION_TOOLS.includes(tc.name),
+        );
+        if (
+          process.env.VIBN_VERIFICATION_ENABLED === "1" &&
+          !aborted &&
+          mutated &&
+          activeProject?.id &&
+          activeMcpToken
+        ) {
+          emit({ type: "phase", phase: "verify", label: "Verifying & fixing" });
+          const previewUrl = extractPreviewUrl(messages);
+          const verifyExec: ToolExecutor = async (name, args) =>
+            executeMcpTool(
+              name,
+              args,
+              activeMcpToken,
+              baseUrl,
+              activeProject!.id,
+            );
+          const vTask: VerificationTask = {
+            id: thread_id,
+            title: message,
+            status: "in_progress",
+            acceptanceChecks: [],
+            attempts: 0,
+          };
+          const verifyCtx: ExecCtx = {
+            projectId: activeProject.id,
+            previewUrl,
+            exec: verifyExec,
+          };
+          try {
+            verificationOutcome = await executeTask(vTask, {
+              maxAttempts: 3,
+              runExecution: async ({ failureFeedback, attempt }) => {
+                // Attempt 1 = verify what the main loop already produced.
+                if (attempt === 1 && !failureFeedback) return;
+                if (failureFeedback)
+                  messages.push({ role: "user", content: failureFeedback });
+                await runFixRounds(2);
+              },
+              verify: async () => runVerificationContract(vTask, verifyCtx),
+            });
+          } catch (e) {
+            console.error("[Verification] errored:", e);
+          }
+          // If verification couldn't reach green, surface the specific failing
+          // checks as an honest status (and let the summary reflect reality).
+          if (verificationOutcome?.status === "blocked") {
+            const checkLines = verificationOutcome.failures
+              .map((f) => `- ${f.check.description}: ${f.evidence}`)
+              .join("\n");
+            const note =
+              `I made the changes but verification didn't fully pass:\n${checkLines}\n` +
+              `That's the honest state — want me to keep working these specific issues?`;
+            assistantText += (assistantText ? "\n\n" : "") + note;
+            assistantTextSegments.push(note);
+            emit({ type: "text", text: note });
+          }
+        }
+
        // If the loop ended with the user staring at a tool tray and no
        // narrative — whether because we hit MAX_TOOL_ROUNDS, broke a
        // detected loop, or the model voluntarily stopped emitting tools
@@ -1492,6 +1673,36 @@ export async function POST(request: Request) {
            assistantTextSegments.push(fallback);
            emit({ type: "text", text: fallback });
          }
+        } else if (!aborted && anyToolsExecuted) {
+          // Successful tool-using turn — guarantee it ENDS with a clean,
+          // human summary. We only force one when the model didn't already
+          // close with a substantive sentence, so we never pay for a
+          // redundant double-summary.
+          const lastSeg = (
+            assistantTextSegments[assistantTextSegments.length - 1] || ""
+          ).trim();
+          const alreadySummarized =
+            lastSeg.length >= 40 && /[.!?)\]]$/.test(lastSeg);
+          if (!alreadySummarized) {
+            try {
+              const finalSummary = await callVibnChat({
+                systemPrompt:
+                  systemPrompt +
+                  `\n\n[FINAL SUMMARY] The work for this turn is finished. In 1–3 short, plain sentences, tell the user: (a) what you changed or accomplished, (b) the specific result they can see right now (a preview URL, a file, a value), and (c) the single best next step. No headings, no bullet lists, no internal jargon, and do NOT call any tools.`,
+                messages,
+                tools: [],
+                temperature: 0.3,
+              });
+              if (finalSummary.text && finalSummary.text.trim()) {
+                assistantText +=
+                  (assistantText ? "\n\n" : "") + finalSummary.text;
+                assistantTextSegments.push(finalSummary.text);
+                emit({ type: "text", text: finalSummary.text });
+              }
+            } catch {
+              // Best-effort: the model's own final text remains as the ending.
+            }
+          }
        }

        // Last-resort guard: the model produced NO user-facing text and NO