Stop falsely labeling log-reading tools as failed when they read stack traces

2026-06-15 14:15:17 -07:00
2 changed files with 42 additions and 37 deletions
--- a/vibn-frontend/app/api/chat/route.ts
+++ b/vibn-frontend/app/api/chat/route.ts
@@ -14,7 +14,7 @@
 *   data: {"type":"done"}
 *   data: {"type":"error","error":"..."}
 */
-import { NextResponse, after } from "next/server";
+import { NextResponse } from "next/server";
 import { requireWorkspacePrincipal } from "@/lib/auth/workspace-auth";
 import { query, queryOne } from "@/lib/db-postgres";
 import { callVibnChat, streamVibnChat } from "@/lib/ai/vibn-chat-model";
@@ -53,15 +53,16 @@ type TurnIntent =

 const TOOL_BUDGETS: Record<TurnIntent, number> = {
  conversational: 1, // Must be at least 1 so the LLM gets called for a text reply
-  // With the Verification Harness and Anti-Stall Governor now unconditionally enabled,
-  // we no longer need to rely on artificially tight tool budgets to prevent infinite loops.
-  // The system will intelligently halt if it detects a stall or unfixable error, so we can
-  // safely give the AI a massive runway to complete complex tasks.
-  status_check: 40,
-  diagnose: 60,
-  small_fix: 40,
-  feature_build: 80,
-  deploy: 40,
+  // Investigative questions ("is the auth connected?", "what's the test user?")
+  // routinely need to read several files THEN synthesize an answer. Budgets of
+  // 5/8 were cutting these off at the cap before the model could answer
+  // (telemetry showed 100% round_cap on these turns). Raised so a read-only
+  // investigation can actually finish.
+  status_check: 16,
+  diagnose: 22,
+  small_fix: 18,
+  feature_build: 40,
+  deploy: 25,
  autonomous: 150,
 };

@@ -627,21 +628,22 @@ function extractPreviewUrl(messages: ChatMessage[]): string | undefined {
  return undefined;
 }

+
 function summarizeForUI(raw: string): string {
  try {
    const p = JSON.parse(raw);
    if (p && typeof p === "object") {
      const clone = { ...p };
      // Strip massive payload fields so the UI gets intact JSON
-      if (clone.result && typeof clone.result === "object") {
-        if (clone.result.log) clone.result.log = "...";
-        if (clone.result.content) clone.result.content = "...";
-        if (clone.result.listing) clone.result.listing = "...";
+      if (clone.result && typeof clone.result === 'object') {
+         if (clone.result.log) clone.result.log = "...";
+         if (clone.result.content) clone.result.content = "...";
+         if (clone.result.listing) clone.result.listing = "...";
      }
-      if (typeof clone.stdout === "string" && clone.stdout.length > 200) {
+      if (typeof clone.stdout === 'string' && clone.stdout.length > 200) {
        clone.stdout = clone.stdout.slice(0, 200) + "...";
      }
-      if (typeof clone.stderr === "string" && clone.stderr.length > 200) {
+      if (typeof clone.stderr === 'string' && clone.stderr.length > 200) {
        clone.stderr = clone.stderr.slice(0, 200) + "...";
      }
      return JSON.stringify(clone);
@@ -1506,7 +1508,13 @@ export async function POST(request: Request) {
        const mutated = assistantToolCalls.some((tc) =>
          MUTATION_TOOLS.includes(tc.name),
        );
-        if (!aborted && mutated && activeProject?.id && activeMcpToken) {
+        if (
+          process.env.VIBN_VERIFICATION_ENABLED === "1" &&
+          !aborted &&
+          mutated &&
+          activeProject?.id &&
+          activeMcpToken
+        ) {
          emit({ type: "phase", phase: "verify", label: "Verifying & fixing" });
          const previewUrl = extractPreviewUrl(messages);
          const verifyExec: ToolExecutor = async (name, args) =>
@@ -1890,7 +1898,7 @@ export async function POST(request: Request) {
        // Wrapped in try/catch + .catch — the response stream is already
        // closed and we don't want a summary failure to surface as an
        // error to the user.
-        after(async () => {
+        (async () => {
          try {
            const allMessages = [...history, finalMsg];
            // Only summarize if there's something worth summarizing.
@@ -1943,7 +1951,7 @@ export async function POST(request: Request) {
          } catch {
            // best-effort; silent failure
          }
-        });
+        })().catch(() => {});

        // Plan extraction is handled inline during tool calls or proactively.
        emit({ type: "done" });
--- a/vibn-frontend/lib/ai/verification/runners.ts
+++ b/vibn-frontend/lib/ai/verification/runners.ts
@@ -6,7 +6,12 @@
 * injected ToolExecutor, so they are fully unit-testable with mocked outputs.
 */

-import type { AcceptanceCheck, CheckKind, CheckResult, ExecCtx } from "./types";
+import type {
+  AcceptanceCheck,
+  CheckKind,
+  CheckResult,
+  ExecCtx,
+} from "./types";

 // ── helpers ────────────────────────────────────────────────────────────────

@@ -24,11 +29,7 @@ export function redact(s: string): string {
 }

 export function clip(s: string, n = 400): string {
-  const out = redact(
-    String(s ?? "")
-      .replace(/\s+/g, " ")
-      .trim(),
-  );
+  const out = redact(String(s ?? "").replace(/\s+/g, " ").trim());
  return out.length > n ? out.slice(0, n) + "…" : out;
 }

@@ -104,7 +105,11 @@ function str(spec: Record<string, unknown>, key: string, dflt = ""): string {
  const v = spec[key];
  return typeof v === "string" ? v : dflt;
 }
-function num(spec: Record<string, unknown>, key: string, dflt: number): number {
+function num(
+  spec: Record<string, unknown>,
+  key: string,
+  dflt: number,
+): number {
  const v = spec[key];
  return typeof v === "number" ? v : dflt;
 }
@@ -132,12 +137,7 @@ const RUNNERS: Record<
  (check: AcceptanceCheck, ctx: ExecCtx) => Promise<CheckResult>
 > = {
  build: (c, ctx) =>
-    runShellExit(
-      c,
-      ctx,
-      str(c.spec, "command", "npx next build --no-turbopack"),
-      "build",
-    ),
+    runShellExit(c, ctx, str(c.spec, "command", "npm run build"), "build"),

  typecheck: (c, ctx) =>
    runShellExit(
@@ -156,7 +156,7 @@ const RUNNERS: Record<
  server_up: async (c, ctx) => {
    const raw = await ctx.exec("dev_server_start", {
      projectId: ctx.projectId,
-      command: str(c.spec, "command", "npx next dev -H 0.0.0.0 --no-turbopack"),
+      command: str(c.spec, "command", "npm run dev"),
      port: num(c.spec, "port", 3000),
    });
    const r = parseToolResult(raw);
@@ -183,10 +183,7 @@ const RUNNERS: Record<
    const codeStr = (r.stdout || r.raw).trim().match(/\d{3}/)?.[0];
    if (codeStr && Number(codeStr) === expected)
      return ok(c, `${url} → ${codeStr}`);
-    return fail(
-      c,
-      `${url} returned ${codeStr ?? "no response"} (expected ${expected})`,
-    );
+    return fail(c, `${url} returned ${codeStr ?? "no response"} (expected ${expected})`);
  },

  console_clean: async (c, ctx) => {