feat(verification): acceptance-check layer + executor fix-loop; hide phase-checkpoint walls; guaranteed turn-end summary. Verification gated behind VIBN_VERIFICATION_ENABLED.

2026-06-10 19:43:36 -07:00
parent fa2021a3a2
commit d4b679bca0
9 changed files with 1263 additions and 31 deletions
--- a/app/api/chat/route.ts
+++ b/app/api/chat/route.ts
@@ -28,6 +28,14 @@ import {
  detectKnownError,
  formatRecoveryMessage,
 } from "@/lib/ai/error-recovery";
+import {
+  executeTask,
+  runVerificationContract,
+  type ExecCtx,
+  type ExecuteTaskOutcome,
+  type ToolExecutor,
+  type VerificationTask,
+} from "@/lib/ai/verification";

 // --- Agent Orchestration Types & Constants ---
 type TurnIntent =
@@ -668,6 +676,35 @@ function buildHealthStatus(opts: {
  );
 }

+// Find the dev-server preview URL reported by a tool, preferring the most
+// recent tool message, so the verification layer can run console/route
+// checks against the running app. Each tool payload is consulted in this
+// order: its JSON `previewUrl`, then the `previewUrl` of a JSON document
+// nested in its `stdout`, then the first *.preview.vibnai.com URL found in
+// the raw text.
+function extractPreviewUrl(messages: ChatMessage[]): string | undefined {
+  const previewHost = /https:\/\/[a-z0-9-]+\.preview\.vibnai\.com/i;
+
+  // Decode one JSON document and pull out the two fields we care about.
+  // Throws on invalid JSON (or a `null` document); callers swallow that.
+  const readPayload = (
+    json: string,
+  ): { url: string | undefined; stdout: unknown } => {
+    const doc = JSON.parse(json) as Record<string, unknown>;
+    return {
+      url: typeof doc.previewUrl === "string" ? doc.previewUrl : undefined,
+      stdout: doc.stdout,
+    };
+  };
+
+  for (const msg of [...messages].reverse()) {
+    if (msg.role !== "tool" || typeof msg.content !== "string") continue;
+    const body = msg.content;
+    // Cheap pre-filter before attempting any parsing.
+    if (!body.includes("preview")) continue;
+    try {
+      const outer = readPayload(body);
+      if (outer.url !== undefined) return outer.url;
+      if (typeof outer.stdout === "string") {
+        try {
+          const inner = readPayload(outer.stdout);
+          if (inner.url !== undefined) return inner.url;
+        } catch {
+          /* stdout not JSON */
+        }
+      }
+    } catch {
+      /* not JSON */
+    }
+    const hit = previewHost.exec(body);
+    if (hit) return hit[0];
+  }
+  return undefined;
+}
+
 export async function POST(request: Request) {
  await ensureChatTables();

@@ -1067,6 +1104,11 @@ export async function POST(request: Request) {
      let phase: AgentPhase = "recon";
      let checkpointEmitted = false;
      let verificationPassed = false;
+      // When C-08 forces a "Phase Checkpoint" before a mutation, the model's
+      // next reply is that internal planning block. We route it to the
+      // (hidden) thinking channel instead of showing the user a wall of
+      // Goal/Findings/Suspected-Cause text.
+      let suppressNextTextAsCheckpoint = false;

      // ── Server-side conversational guard (C-03 enforcement) ───────────
      // If the user's message looks conversational we withhold tools for
@@ -1100,6 +1142,62 @@ export async function POST(request: Request) {
      let fileHashes = new Map<string, string>();
      let stallRounds = 0;

+      // Compact corrective executor used by the verification fix-loop: runs up
+      // to `n` model rounds (with tools) to fix whatever verification flagged,
+      // reusing the same tool-execution path as the main loop.
+      //
+      // Side effects per round: appends the model's text to `assistantText` /
+      // `assistantTextSegments`, streams `text`, `tool_start` and `tool_result`
+      // events, records tool calls in `assistantToolCalls`, and pushes the
+      // assistant + tool messages onto `messages`. Stops early when the turn is
+      // aborted or the model returns no tool calls.
+      async function runFixRounds(n: number) {
+        for (let i = 0; i < n; i++) {
+          if (aborted) break;
+          // Tools are only offered when an MCP token is available; without one
+          // the model is called with an empty tool list.
+          const fixTools = activeMcpToken
+            ? filterToolsForPhase(VIBN_TOOL_DEFINITIONS, "execute", turnIntent)
+            : [];
+          const r = await callVibnChat({
+            systemPrompt,
+            messages,
+            tools: fixTools,
+            temperature: 0.4,
+            includeThoughts: true,
+          });
+          // Fix-round text is user-facing: it is streamed and kept in the
+          // turn's assistant text.
+          if (r.text) {
+            assistantText += (assistantText ? "\n\n" : "") + r.text;
+            assistantTextSegments.push(r.text);
+            emit({ type: "text", text: r.text });
+          }
+          // The assistant message is recorded even when `r.text` is empty so
+          // the tool results pushed below have a matching tool-call message.
+          messages.push({
+            role: "assistant",
+            content: r.text,
+            toolCalls: r.toolCalls.length ? r.toolCalls : undefined,
+          });
+          // No tool calls means the model has nothing further to change.
+          if (!r.toolCalls.length) break;
+          for (const tc of r.toolCalls) {
+            // NOTE(review): breaking here leaves the assistant message above
+            // with tool calls that have no matching tool result in `messages`
+            // — confirm that is acceptable for an aborted turn.
+            if (aborted) break;
+            assistantToolCalls.push(tc);
+            emit({ type: "tool_start", name: tc.name, args: tc.args });
+            const result = activeMcpToken
+              ? await executeMcpTool(
+                  tc.name,
+                  tc.args,
+                  activeMcpToken,
+                  baseUrl,
+                  activeProject?.id,
+                )
+              : JSON.stringify({ error: "No MCP token" });
+            // The client only gets the first 500 characters of the result; the
+            // model sees the full result via `messages`.
+            emit({
+              type: "tool_result",
+              name: tc.name,
+              result: result.slice(0, 500),
+            });
+            messages.push({
+              role: "tool",
+              content: result,
+              toolCallId: tc.id,
+              toolName: tc.name,
+              thoughtSignature: tc.thoughtSignature,
+            });
+          }
+        }
+      }
+
      emit({ type: "phase", phase, label: "Investigating & Planning" });

      try {
@@ -1182,6 +1280,7 @@ export async function POST(request: Request) {
              findings: "Evaluating...",
            });
            checkpointEmitted = true;
+            suppressNextTextAsCheckpoint = true;
            phase = "execute";
            emit({ type: "phase", phase, label: "Executing Code Edits" });
            continue; // Skip tool execution and re-prompt
@@ -1202,8 +1301,14 @@ export async function POST(request: Request) {
            return;
          }

-          // Stream user-facing text to client
-          if (resp.text) {
+          // Stream user-facing text to client.
+          // If this round's text is the forced Phase Checkpoint, route it to
+          // the hidden thinking channel and DON'T add it to the user-facing
+          // message (so it never shows live or in the persisted thread).
+          if (resp.text && suppressNextTextAsCheckpoint) {
+            emit({ type: "thinking", text: resp.text });
+            suppressNextTextAsCheckpoint = false;
+          } else if (resp.text) {
            assistantText += (assistantText ? "\n\n" : "") + resp.text;
            assistantTextSegments.push(resp.text);
            emit({ type: "text", text: resp.text });
@@ -1420,6 +1525,82 @@ export async function POST(request: Request) {
          emit({ type: "aborted" });
        }

+        // ── Acceptance verification + corrective fix-loop (flag-gated) ──
+        // After a turn that mutated code, run the verification contract
+        // (baseline: build + server_up + console_clean). If it fails, feed the
+        // concrete failures back and let the model fix — iterating until green,
+        // stuck, or out of attempts. Off by default; enable per-environment
+        // with VIBN_VERIFICATION_ENABLED=1 for the live smoke test.
+        let verificationOutcome: ExecuteTaskOutcome | null = null;
+        const MUTATION_TOOLS = [
+          "fs_write",
+          "fs_edit",
+          "fs_delete",
+          "apps_deploy",
+          "ship",
+        ];
+        const mutated = assistantToolCalls.some((tc) =>
+          MUTATION_TOOLS.includes(tc.name),
+        );
+        if (
+          process.env.VIBN_VERIFICATION_ENABLED === "1" &&
+          !aborted &&
+          mutated &&
+          activeProject?.id &&
+          activeMcpToken
+        ) {
+          emit({ type: "phase", phase: "verify", label: "Verifying & fixing" });
+          const previewUrl = extractPreviewUrl(messages);
+          const verifyExec: ToolExecutor = async (name, args) =>
+            executeMcpTool(
+              name,
+              args,
+              activeMcpToken,
+              baseUrl,
+              activeProject!.id,
+            );
+          const vTask: VerificationTask = {
+            id: thread_id,
+            title: message,
+            status: "in_progress",
+            acceptanceChecks: [],
+            attempts: 0,
+          };
+          const verifyCtx: ExecCtx = {
+            projectId: activeProject.id,
+            previewUrl,
+            exec: verifyExec,
+          };
+          try {
+            verificationOutcome = await executeTask(vTask, {
+              maxAttempts: 3,
+              runExecution: async ({ failureFeedback, attempt }) => {
+                // Attempt 1 = verify what the main loop already produced.
+                if (attempt === 1 && !failureFeedback) return;
+                if (failureFeedback)
+                  messages.push({ role: "user", content: failureFeedback });
+                await runFixRounds(2);
+              },
+              verify: async () => runVerificationContract(vTask, verifyCtx),
+            });
+          } catch (e) {
+            console.error("[Verification] errored:", e);
+          }
+          // If verification couldn't reach green, surface the specific failing
+          // checks as an honest status (and let the summary reflect reality).
+          if (verificationOutcome?.status === "blocked") {
+            const checkLines = verificationOutcome.failures
+              .map((f) => `- ${f.check.description}: ${f.evidence}`)
+              .join("\n");
+            const note =
+              `I made the changes but verification didn't fully pass:\n${checkLines}\n` +
+              `That's the honest state — want me to keep working these specific issues?`;
+            assistantText += (assistantText ? "\n\n" : "") + note;
+            assistantTextSegments.push(note);
+            emit({ type: "text", text: note });
+          }
+        }
+
        // If the loop ended with the user staring at a tool tray and no
        // narrative — whether because we hit MAX_TOOL_ROUNDS, broke a
        // detected loop, or the model voluntarily stopped emitting tools
@@ -1492,6 +1673,36 @@ export async function POST(request: Request) {
            assistantTextSegments.push(fallback);
            emit({ type: "text", text: fallback });
          }
+        } else if (!aborted && anyToolsExecuted) {
+          // Successful tool-using turn — guarantee it ENDS with a clean,
+          // human summary. We only force one when the model didn't already
+          // close with a substantive sentence, so we never pay for a
+          // redundant double-summary.
+          const lastSeg = (
+            assistantTextSegments[assistantTextSegments.length - 1] || ""
+          ).trim();
+          const alreadySummarized =
+            lastSeg.length >= 40 && /[.!?)\]]$/.test(lastSeg);
+          if (!alreadySummarized) {
+            try {
+              const finalSummary = await callVibnChat({
+                systemPrompt:
+                  systemPrompt +
+                  `\n\n[FINAL SUMMARY] The work for this turn is finished. In 1–3 short, plain sentences, tell the user: (a) what you changed or accomplished, (b) the specific result they can see right now (a preview URL, a file, a value), and (c) the single best next step. No headings, no bullet lists, no internal jargon, and do NOT call any tools.`,
+                messages,
+                tools: [],
+                temperature: 0.3,
+              });
+              if (finalSummary.text && finalSummary.text.trim()) {
+                assistantText +=
+                  (assistantText ? "\n\n" : "") + finalSummary.text;
+                assistantTextSegments.push(finalSummary.text);
+                emit({ type: "text", text: finalSummary.text });
+              }
+            } catch {
+              // Best-effort: the model's own final text remains as the ending.
+            }
+          }
        }

        // Last-resort guard: the model produced NO user-facing text and NO
--- a/components/vibn-chat/chat-panel.tsx
+++ b/components/vibn-chat/chat-panel.tsx
@@ -551,9 +551,17 @@ function ThinkingBubble({ thoughts }: { thoughts: string }) {

 function stripRawToolLogs(text: string): string {
  if (!text) return text;
-  return text
-    .replace(/(?:\r?\n)*\[tools executed this turn:[\s\S]*?\]/g, "")
-    .trim();
+  const withoutToolLogs = text.replace(
+    /(?:\r?\n)*\[tools executed this turn:[\s\S]*?\]/g,
+    "",
+  );
+  // Safety net: strip the internal "Phase Checkpoint" planning block
+  // (Goal / Current Findings / Suspected Cause / Verification Plan) if it
+  // ever reaches a user-facing message. This is loop-control machinery, not
+  // something the end user should read. We drop from the heading to the end
+  // of that block only: the block ends at a blank line followed by plain
+  // prose (or at EOF), so a real answer that follows a checkpoint in the
+  // same message is preserved instead of being deleted with it.
+  const headingRe = /^\s*#{0,3}\s*Phase Checkpoint/i;
+  // Lines that still belong to the block after a blank line: bullets,
+  // numbered items, markdown headings, bold labels, or a known field label.
+  const blockLineRe =
+    /^\s*(?:[-*•]|\d+[.)]|#{1,6}\s|\*\*|(?:Goal|Current Findings|Suspected Cause|Verification Plan)\b)/i;
+  const kept: string[] = [];
+  let inCheckpoint = false;
+  let sawBlank = false;
+  for (const line of withoutToolLogs.split("\n")) {
+    if (!inCheckpoint && headingRe.test(line)) {
+      inCheckpoint = true;
+      sawBlank = false;
+      continue;
+    }
+    if (inCheckpoint) {
+      if (line.trim() === "") {
+        sawBlank = true;
+        continue;
+      }
+      if (!sawBlank || blockLineRe.test(line)) {
+        sawBlank = false;
+        continue;
+      }
+      // Blank line followed by plain prose: the checkpoint block is over.
+      inCheckpoint = false;
+      // Restore the paragraph break that was consumed while skipping.
+      if (kept.length) kept.push("");
+    }
+    kept.push(line);
+  }
+  return kept.join("\n").trim();
 }

 const MessageBubble = React.memo(function MessageBubble({
@@ -748,32 +756,8 @@ function Timeline({ entries }: { entries: TimelineEntry[] }) {
          );
        }
        if (item.kind === "checkpoint") {
-          return (
-            <div
-              key={i}
-              style={{
-                margin: "6px 0 12px",
-                padding: "12px 14px",
-                background: "oklch(0.20 0.04 35 / 0.15)",
-                border: "1px dashed var(--accent)",
-                borderRadius: 8,
-                fontSize: "0.75rem",
-                color: "var(--fg-mute)",
-                fontFamily: "var(--font-mono), monospace",
-              }}
-            >
-              <div
-                style={{
-                  color: "var(--accent)",
-                  fontWeight: "bold",
-                  marginBottom: 4,
-                }}
-              >
-                [Checkpoint Logged]
-              </div>
-              <div style={{ opacity: 0.8 }}>{item.goal}</div>
-            </div>
-          );
+          // Internal loop-control machinery — never shown to the user.
+          return null;
        }
        return (
          <TimelineToolGroup
--- a/lib/ai/verification/executor.ts
+++ b/lib/ai/verification/executor.ts
@@ -0,0 +1,127 @@
+/**
+ * Task executor — the iterate-to-green loop.
+ *
+ *   EXECUTE (model edits toward the goal, with prior failures as context)
+ *     → TEST (run the verification contract)
+ *       → pass?  → FINALIZE (task done)
+ *       → fail?  → KEEP FIXING (feed concrete failures back)
+ *       → stuck? → ESCALATE (re-plan or honest blocker to the user)
+ *
+ * This module is pure orchestration over injected dependencies, so the
+ * finalize / keep-fixing / escalate decisions are fully unit-testable without
+ * a live dev container.
+ */
+
+import type { CheckResult, VerificationReport, VerificationTask } from "./types";
+import { failureSignature, formatFailureFeedback } from "./generation";
+
+/** Dependencies injected into `executeTask`; all I/O happens through these. */
+export interface ExecuteTaskDeps {
+  /**
+   * Run one execution pass: let the model make edits toward the task's goal.
+   * `failureFeedback` is the structured "[VERIFICATION FAILED] …" message from
+   * the previous attempt (empty on the first attempt).
+   * `attempt` is the value of `task.attempts` after it has been incremented
+   * for this cycle.
+   */
+  runExecution: (args: {
+    task: VerificationTask;
+    failureFeedback: string;
+    attempt: number;
+  }) => Promise<void>;
+
+  /** Run the verification contract and return a structured report. */
+  verify: (task: VerificationTask) => Promise<VerificationReport>;
+
+  /**
+   * Persist task progress (attempts + lastFailures) so a turn can resume.
+   * Called after every verification and again when the task is blocked.
+   */
+  persist?: (task: VerificationTask) => void | Promise<void>;
+
+  /**
+   * Max execute→verify cycles before escalating. Default 5.
+   * Compared against `task.attempts`, so a task that arrives with attempts
+   * already recorded gets correspondingly fewer cycles.
+   */
+  maxAttempts?: number;
+
+  /**
+   * Stop after this many consecutive no-progress attempts. Default 2.
+   * An attempt counts as no-progress when its failure signature equals the
+   * previous attempt's, so the default stops on the third identical report
+   * in a row.
+   */
+  noProgressLimit?: number;
+}
+
+/**
+ * Result of `executeTask`.
+ * - "done": the last verification report passed.
+ * - "blocked": the loop gave up. `reason` is "no_progress" (repeated identical
+ *   failures) or "max_attempts" (attempt ceiling reached). `report` is null
+ *   only when no verification ran at all, in which case `failures` is empty.
+ */
+export type ExecuteTaskOutcome =
+  | { status: "done"; report: VerificationReport; attempts: number }
+  | {
+      status: "blocked";
+      report: VerificationReport | null;
+      attempts: number;
+      reason: string;
+      failures: CheckResult[];
+    };
+
+/**
+ * Drive a task through execute → verify cycles until it verifies green, stops
+ * making progress, or runs out of attempts.
+ *
+ * Mutates `task` in place (`status`, `attempts`, `lastFailures`) and calls
+ * `deps.persist` after each verification and on escalation.
+ *
+ * @param task Task to work on; `task.attempts` is incremented per cycle.
+ * @param deps Injected execution, verification and persistence hooks.
+ * @returns "done" with the passing report, or "blocked" with the reason
+ *   ("no_progress" | "max_attempts") and the outstanding hard failures.
+ */
+export async function executeTask(
+  task: VerificationTask,
+  deps: ExecuteTaskDeps,
+): Promise<ExecuteTaskOutcome> {
+  const attemptCeiling = deps.maxAttempts ?? 5;
+  const stallCeiling = deps.noProgressLimit ?? 2;
+
+  // ESCALATE — shared exit for both "stuck" and "out of attempts".
+  const escalate = async (
+    reason: string,
+    report: VerificationReport | null,
+    failures: CheckResult[],
+  ): Promise<ExecuteTaskOutcome> => {
+    task.status = "blocked";
+    await deps.persist?.(task);
+    return {
+      status: "blocked",
+      report,
+      attempts: task.attempts,
+      reason,
+      failures,
+    };
+  };
+
+  task.status = "in_progress";
+  let previousSignature: string | null = null;
+  let stalledAttempts = 0;
+  let latestReport: VerificationReport | null = null;
+
+  while (task.attempts < attemptCeiling) {
+    task.attempts += 1;
+
+    // EXECUTE — hand the model the previous round's failures, if any.
+    const priorFailures = task.lastFailures;
+    const failureFeedback =
+      priorFailures && priorFailures.length > 0
+        ? formatFailureFeedback(priorFailures)
+        : "";
+    await deps.runExecution({
+      task,
+      failureFeedback,
+      attempt: task.attempts,
+    });
+
+    // TEST
+    const report = await deps.verify(task);
+    latestReport = report;
+
+    // FINALIZE
+    if (report.passed) {
+      task.status = "done";
+      task.lastFailures = [];
+      await deps.persist?.(task);
+      return { status: "done", report, attempts: task.attempts };
+    }
+
+    // KEEP FIXING — store the failures so the next attempt (possibly in a
+    // later HTTP turn) starts with full context.
+    task.lastFailures = report.failures;
+    await deps.persist?.(task);
+
+    // A repeat of the previous signature means the last round changed nothing.
+    const signature = failureSignature(report.failures);
+    const repeated =
+      previousSignature !== null && signature === previousSignature;
+    stalledAttempts = repeated ? stalledAttempts + 1 : 0;
+    previousSignature = signature;
+
+    if (stalledAttempts >= stallCeiling) {
+      return escalate("no_progress", report, report.failures);
+    }
+  }
+
+  // Hit the attempt ceiling without going green.
+  return escalate("max_attempts", latestReport, latestReport?.failures ?? []);
+}
--- a/lib/ai/verification/generation.ts
+++ b/lib/ai/verification/generation.ts
@@ -0,0 +1,108 @@
+/**
+ * Acceptance-check generation + feedback formatting.
+ *
+ * - The Planner emits `acceptanceChecks` per task using a strict schema; we
+ *   validate/normalize that output here (models are not trustworthy emitters).
+ * - On a failed verification we format the failures into concrete, structured
+ *   feedback that the next execution round consumes — this is what makes the
+ *   model FIX rather than guess.
+ */
+
+import type { AcceptanceCheck, CheckKind, CheckResult } from "./types";
+
+const VALID_KINDS: CheckKind[] = [
+  "build",
+  "typecheck",
+  "test",
+  "server_up",
+  "route_ok",
+  "console_clean",
+  "content",
+  "flow",
+  "visual",
+  "data",
+];
+
+// Soft-by-default kinds (advisory, never block "done").
+const SOFT_KINDS = new Set<CheckKind>(["visual"]);
+
+/**
+ * Validate and normalize a raw `acceptanceChecks` array from the model.
+ * Drops unknown kinds, coerces missing fields, and caps the count.
+ *
+ * - Non-array input yields `[]`; non-object items are skipped.
+ * - `kind` must be one of VALID_KINDS (anything else drops the item).
+ * - `spec` must be a plain object; arrays and primitives become `{}`.
+ * - `hard` defaults to true, except for SOFT_KINDS where it defaults to false.
+ * - `description` is trimmed and falls back to the kind name.
+ *
+ * @param raw Untrusted value emitted by the model.
+ * @returns At most three normalized checks, in input order.
+ */
+export function normalizeAcceptanceChecks(raw: unknown): AcceptanceCheck[] {
+  if (!Array.isArray(raw)) return [];
+  const out: AcceptanceCheck[] = [];
+  for (const item of raw) {
+    if (!item || typeof item !== "object") continue;
+    const o = item as Record<string, unknown>;
+    // Look the kind up instead of casting first, so `kind` is only ever a
+    // validated CheckKind.
+    const kind = VALID_KINDS.find((k) => k === o.kind);
+    if (!kind) continue;
+    // Arrays are `typeof "object"` too; a spec must be a key/value record.
+    const spec =
+      o.spec && typeof o.spec === "object" && !Array.isArray(o.spec)
+        ? (o.spec as Record<string, unknown>)
+        : {};
+    const hard =
+      typeof o.hard === "boolean" ? o.hard : !SOFT_KINDS.has(kind);
+    const description =
+      typeof o.description === "string" && o.description.trim()
+        ? o.description.trim()
+        : kind;
+    out.push({ kind, hard, description, spec });
+    if (out.length >= 3) break; // keep contracts tight (1–3 checks)
+  }
+  return out;
+}
+
+/**
+ * Instruction appended to the Planner's system prompt so each task it creates
+ * carries a checkable contract.
+ */
+export const CHECK_GENERATION_PROMPT = `
+[ACCEPTANCE CHECKS] For every task you create, attach \`acceptanceChecks\`: a JSON
+array of 1–3 checks that objectively prove THIS task is done.
+Each check: { "kind": <kind>, "hard": <bool>, "description": <string>, "spec": { ... } }
+Allowed kinds and their spec:
+- build        spec: {}                                  (compiles)
+- typecheck    spec: {}                                  (no type errors)
+- test         spec: { command?: string }               (tests pass)
+- server_up    spec: { port?: number }                   (app boots, 200)
+- route_ok     spec: { url: string, expectedStatus?: number }
+- console_clean spec: { url?: string }                   (no JS errors)
+- content      spec: { url: string, contains: string }   (text present)
+- flow         spec: { startUrl: string, expectContains: string }
+- visual       spec: { targetPath: string, minScore?: number }   (soft)
+- data         spec: { command: string }                 (records exist)
+Rules:
+- build + server_up + console_clean are added AUTOMATICALLY. Do NOT repeat them.
+- Add only checks that prove THIS task's specific behavior.
+- Prefer the cheapest proof: route_ok/content over flow, flow over visual.
+- If a task is not objectively verifiable (e.g. "make the copy friendlier"),
+  return an empty acceptanceChecks array and set "requiresHumanConfirm": true.
+  Do NOT fabricate a check you cannot actually verify.
+`.trim();
+
+/**
+ * Render failed checks as the "[VERIFICATION FAILED]" message handed to the
+ * next execution round: one bullet per failure naming the check kind, its
+ * description and the captured evidence, followed by a fix directive.
+ *
+ * @param failures Failed check results to report.
+ * @returns The feedback message, or "" when there is nothing to report.
+ */
+export function formatFailureFeedback(failures: CheckResult[]): string {
+  if (failures.length === 0) return "";
+
+  const bullets: string[] = [];
+  for (const { check, evidence } of failures) {
+    bullets.push(
+      `- ${check.kind} (${check.description}): FAILED — ${evidence}`,
+    );
+  }
+
+  const header =
+    "[VERIFICATION FAILED] Your last changes did not pass these checks:\n";
+  const directive =
+    "\nFix these specific failures. Do not claim success until every check passes. " +
+    "Address the exact errors above — read the relevant files first if needed.";
+  return header + bullets.join("\n") + directive;
+}
+
+/**
+ * Order-independent fingerprint of a set of failures, built from each
+ * failure's check kind and evidence — used to detect no-progress.
+ */
+export function failureSignature(failures: CheckResult[]): string {
+  const parts: string[] = [];
+  for (const { check, evidence } of failures) {
+    parts.push(`${check.kind}:${evidence}`);
+  }
+  parts.sort();
+  return parts.join(";;");
+}
--- a/lib/ai/verification/harness.ts
+++ b/lib/ai/verification/harness.ts
@@ -0,0 +1,92 @@
+/**
+ * Verification harness — runs a task's contract and returns a structured
+ * pass/fail report. This is the single source of truth for "is the task done".
+ */
+
+import type {
+  AcceptanceCheck,
+  CheckResult,
+  ExecCtx,
+  VerificationReport,
+  VerificationTask,
+} from "./types";
+import { runCheck } from "./runners";
+
+/**
+ * The baseline contract auto-attached to every code task. Even if the Planner
+ * specifies no checks, a task can never be "done" while the app fails to build
+ * or the page throws — this is the floor that kills false-completion
+ * ("I scaffolded everything ✓" when nothing compiles).
+ *
+ * @param previewUrl Preview URL of the running app, when known.
+ * @returns `build` and `server_up`, plus `console_clean` when a URL is given.
+ */
+export function baselineChecks(previewUrl?: string): AcceptanceCheck[] {
+  const checks: AcceptanceCheck[] = [
+    {
+      kind: "build",
+      hard: true,
+      description: "Project builds without errors",
+      spec: {},
+    },
+    {
+      kind: "server_up",
+      hard: true,
+      description: "Dev server boots and responds 200",
+      spec: { port: 3000 },
+    },
+  ];
+  // console_clean needs a URL to check. Only include it when we actually know
+  // the preview URL — otherwise we'd fail the whole contract on an un-runnable
+  // check. (When run inside the agent, the URL comes from dev_server_start.)
+  if (previewUrl) {
+    checks.push({
+      kind: "console_clean",
+      hard: true,
+      description: "Preview has no runtime console errors",
+      spec: { url: previewUrl },
+    });
+  }
+  return checks;
+}
+
+// Identity of a check for de-duplication: its kind plus its exact spec.
+const KEY = (c: AcceptanceCheck) => `${c.kind}:${JSON.stringify(c.spec ?? {})}`;
+
+/**
+ * Merge the task's checks with the baseline, de-duplicating by kind+spec.
+ *
+ * Baseline checks are placed first: with short-circuiting, a task-level
+ * `route_ok`/`content` check must not run (and fail) before `build` and
+ * `server_up` have had the chance to report the real problem. When a task
+ * check has the same kind+spec as a baseline check, the baseline entry is the
+ * one kept, so a task cannot downgrade the floor to a soft check.
+ *
+ * @param checks Task-specific checks (not mutated).
+ * @param previewUrl Preview URL forwarded to `baselineChecks`.
+ * @returns Baseline + task checks, hard checks before soft ones.
+ */
+export function withBaseline(
+  checks: AcceptanceCheck[],
+  previewUrl?: string,
+): AcceptanceCheck[] {
+  const baseline = baselineChecks(previewUrl);
+  const baselineKeys = new Set(baseline.map(KEY));
+  const merged = [
+    ...baseline,
+    ...checks.filter((c) => !baselineKeys.has(KEY(c))),
+  ];
+  // Run hard checks first so we short-circuit on the cheapest objective failure.
+  // Array.prototype.sort is stable, so baseline-before-task order is preserved
+  // within each group.
+  return merged.sort((a, b) => Number(b.hard) - Number(a.hard));
+}
+
+/** Tuning knobs for `runVerificationContract`. */
+export interface RunContractOptions {
+  /** Skip the auto-baseline (e.g. for a pure data/research task). */
+  noBaseline?: boolean;
+  /** Stop after the first HARD failure (cheaper). Default true. */
+  shortCircuit?: boolean;
+}
+
+/**
+ * Run a task's checks in order (hard checks first) and summarize the outcome.
+ *
+ * @param task Task whose `acceptanceChecks` form the contract.
+ * @param ctx Execution context passed to every check runner.
+ * @param opts See `RunContractOptions`.
+ * @returns `results` for every check that ran, `failures` for the failed hard
+ *   checks among them, and `passed` when there are no such failures.
+ */
+export async function runVerificationContract(
+  task: VerificationTask,
+  ctx: ExecCtx,
+  opts: RunContractOptions = {},
+): Promise<VerificationReport> {
+  const skipBaseline = opts.noBaseline === undefined ? false : opts.noBaseline;
+  const stopOnHardFailure =
+    opts.shortCircuit === undefined ? true : opts.shortCircuit;
+
+  let contract: AcceptanceCheck[];
+  if (skipBaseline) {
+    contract = [...task.acceptanceChecks].sort(
+      (a, b) => Number(b.hard) - Number(a.hard),
+    );
+  } else {
+    contract = withBaseline(task.acceptanceChecks, ctx.previewUrl);
+  }
+
+  const results: CheckResult[] = [];
+  for (const check of contract) {
+    const outcome = await runCheck(check, ctx);
+    results.push(outcome);
+    const hardFailure = !outcome.pass && check.hard;
+    if (stopOnHardFailure && hardFailure) break;
+  }
+
+  const failures: CheckResult[] = [];
+  for (const outcome of results) {
+    if (!outcome.pass && outcome.check.hard) failures.push(outcome);
+  }
+  return { passed: failures.length === 0, results, failures };
+}
--- a/lib/ai/verification/index.ts
+++ b/lib/ai/verification/index.ts
@@ -0,0 +1,5 @@
+export * from "./types";
+export * from "./runners";
+export * from "./harness";
+export * from "./generation";
+export * from "./executor";
--- a/lib/ai/verification/runners.ts
+++ b/lib/ai/verification/runners.ts
@@ -0,0 +1,269 @@
+/**
+ * Acceptance check runners.
+ *
+ * Each runner maps a single AcceptanceCheck to a deterministic tool invocation
+ * and returns a structured { pass, evidence }. Runners depend only on the
+ * injected ToolExecutor, so they are fully unit-testable with mocked outputs.
+ */
+
+import type {
+  AcceptanceCheck,
+  CheckKind,
+  CheckResult,
+  ExecCtx,
+} from "./types";
+
+// ── helpers ────────────────────────────────────────────────────────────────
+
+/**
+ * Mask secrets in tool output before it is shown or fed back to the model:
+ * Postgres connection URLs, JWT-shaped tokens, and any run of 40 or more
+ * token characters (letters, digits, `_`, `-`). Patterns are applied in that
+ * order.
+ */
+export function redact(s: string): string {
+  const dbUrl = /postgres(?:ql)?:\/\/[^:\s]+:[^@\s]+@[^/\s]+\/[^\s"']+/gi;
+  const jwt = /eyJ[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}/g;
+  const longToken = /\b[A-Za-z0-9_-]{40,}\b/g;
+
+  let scrubbed = s.replace(dbUrl, "postgresql://[REDACTED_DB_URL]");
+  scrubbed = scrubbed.replace(jwt, "[REDACTED_JWT]");
+  scrubbed = scrubbed.replace(longToken, "[REDACTED_SECRET]");
+  return scrubbed;
+}
+
+/**
+ * Collapse whitespace, redact secrets, then cap the text at `n` characters
+ * (appending "…" when something was cut). Nullish input becomes "".
+ */
+export function clip(s: string, n = 400): string {
+  const flattened = String(s ?? "").replace(/\s+/g, " ").trim();
+  const safe = redact(flattened);
+  if (safe.length <= n) return safe;
+  return safe.slice(0, n) + "…";
+}
+
+/**
+ * Normalized view of a raw tool result. Tool results arrive as a JSON string
+ * whose shape differs per tool, so every field is extracted defensively and
+ * left at its default when absent.
+ */
+export interface ParsedToolResult {
+  code: number | null;
+  stdout: string;
+  stderr: string;
+  status: number | null; // healthCheck.status, etc.
+  raw: string;
+  obj: Record<string, unknown> | null;
+}
+
+/**
+ * Parse a raw tool result string.
+ *
+ * Non-JSON (or non-object JSON) input yields the defaults with only `raw`
+ * set. Otherwise `obj` is the outer JSON value, and the remaining fields are
+ * read from the outer value — or, when the outer value has a string `stdout`
+ * but neither `code` nor `healthCheck`, from the JSON object nested inside
+ * that `stdout` (one level of unwrapping).
+ */
+export function parseToolResult(raw: string): ParsedToolResult {
+  const parsed: ParsedToolResult = {
+    code: null,
+    stdout: "",
+    stderr: "",
+    status: null,
+    raw: String(raw ?? ""),
+    obj: null,
+  };
+
+  let outer: Record<string, unknown>;
+  try {
+    const decoded: unknown = JSON.parse(parsed.raw);
+    if (!decoded || typeof decoded !== "object") return parsed;
+    outer = decoded as Record<string, unknown>;
+  } catch {
+    return parsed;
+  }
+  parsed.obj = outer;
+
+  // Some wrappers nest the real payload under `stdout` as a JSON string.
+  let payload = outer;
+  const looksWrapped =
+    typeof outer.stdout === "string" &&
+    outer.code === undefined &&
+    outer.healthCheck === undefined;
+  if (looksWrapped) {
+    try {
+      const inner: unknown = JSON.parse(outer.stdout as string);
+      if (inner && typeof inner === "object") {
+        payload = inner as Record<string, unknown>;
+      }
+    } catch {
+      /* stdout is plain text, keep outer */
+    }
+  }
+
+  // Later keys win: `exitCode` overrides `code`.
+  for (const key of ["code", "exitCode"]) {
+    const value = payload[key];
+    if (typeof value === "number") parsed.code = value;
+  }
+  const { stdout, stderr, status } = payload;
+  if (typeof stdout === "string") parsed.stdout = stdout;
+  if (typeof stderr === "string") parsed.stderr = stderr;
+
+  // A top-level numeric `status` overrides `healthCheck.status`.
+  const health = payload.healthCheck as { status?: number } | undefined;
+  if (health && typeof health.status === "number") parsed.status = health.status;
+  if (typeof status === "number") parsed.status = status;
+
+  return parsed;
+}
+
+/** Build a passing result; the evidence is clipped (and thereby redacted). */
+function ok(check: AcceptanceCheck, evidence = "passed"): CheckResult {
+  const clipped = clip(evidence);
+  return { check, pass: true, evidence: clipped };
+}
+
+/** Build a failing result; the evidence is clipped (and thereby redacted). */
+function fail(check: AcceptanceCheck, evidence: string): CheckResult {
+  const clipped = clip(evidence);
+  return { check, pass: false, evidence: clipped };
+}
+
+/** Read `spec[key]` when it is a string, otherwise return `dflt`. */
+function str(spec: Record<string, unknown>, key: string, dflt = ""): string {
+  const value = spec[key];
+  if (typeof value !== "string") return dflt;
+  return value;
+}
+
+/** Read `spec[key]` when it is a number, otherwise return `dflt`. */
+function num(
+  spec: Record<string, unknown>,
+  key: string,
+  dflt: number,
+): number {
+  const value = spec[key];
+  if (typeof value !== "number") return dflt;
+  return value;
+}
+
+// ── runners ────────────────────────────────────────────────────────────────
+
+/**
+ * Run `command` through the `shell_exec` tool and judge it by exit code:
+ * exit 0 passes; anything else fails with stderr (or stdout, or the exit
+ * code) as evidence. `label` names the check in the evidence text.
+ */
+async function runShellExit(
+  check: AcceptanceCheck,
+  ctx: ExecCtx,
+  command: string,
+  label: string,
+): Promise<CheckResult> {
+  const output = await ctx.exec("shell_exec", {
+    projectId: ctx.projectId,
+    command,
+  });
+  const { code, stderr, stdout } = parseToolResult(output);
+  if (code === 0) {
+    return ok(check, `${label} passed`);
+  }
+  const exitLabel = code ?? "?";
+  const detail = stderr || stdout || `exit ${exitLabel}`;
+  return fail(check, `${label} failed (exit ${exitLabel}): ${detail}`);
+}
+
+/**
+ * Quote `value` as a single POSIX shell word. Inside single quotes the shell
+ * performs no expansion at all, so `$(…)`, backticks and `$VAR` in a
+ * model-supplied URL reach curl literally instead of being evaluated.
+ */
+function shellQuote(value: string): string {
+  return "'" + value.replace(/'/g, "'\\''") + "'";
+}
+
+/**
+ * One runner per check kind. Each maps the check's `spec` to a single tool
+ * invocation through `ctx.exec` and turns the tool output into a pass/fail
+ * result with evidence. Runners that need a spec field fail the check when it
+ * is missing rather than running something meaningless.
+ */
+const RUNNERS: Record<
+  CheckKind,
+  (check: AcceptanceCheck, ctx: ExecCtx) => Promise<CheckResult>
+> = {
+  build: (c, ctx) =>
+    runShellExit(c, ctx, str(c.spec, "command", "npm run build"), "build"),
+
+  typecheck: (c, ctx) =>
+    runShellExit(
+      c,
+      ctx,
+      str(c.spec, "command", "npx tsc --noEmit"),
+      "typecheck",
+    ),
+
+  test: (c, ctx) =>
+    runShellExit(c, ctx, str(c.spec, "command", "npm test"), "tests"),
+
+  data: async (c, ctx) => {
+    // There is no sensible default query: without a command the check would
+    // execute an empty shell command instead of proving that records exist.
+    const command = str(c.spec, "command");
+    if (!command.trim()) return fail(c, "data check is missing a command");
+    return runShellExit(c, ctx, command, "data check");
+  },
+
+  server_up: async (c, ctx) => {
+    const raw = await ctx.exec("dev_server_start", {
+      projectId: ctx.projectId,
+      command: str(c.spec, "command", "npm run dev"),
+      port: num(c.spec, "port", 3000),
+    });
+    const r = parseToolResult(raw);
+    if (r.status === 200) return ok(c, "dev server returned 200");
+    return fail(
+      c,
+      `dev server not healthy (status ${r.status ?? "none"}): ${
+        r.stderr || r.stdout || r.raw
+      }`,
+    );
+  },
+
+  route_ok: async (c, ctx) => {
+    const url = str(c.spec, "url");
+    const expected = num(c.spec, "expectedStatus", 200);
+    if (!url) return fail(c, "route_ok check is missing a url");
+    const raw = await ctx.exec("shell_exec", {
+      projectId: ctx.projectId,
+      command: `curl -s -o /dev/null -w "%{http_code}" --max-time 20 ${shellQuote(
+        url,
+      )}`,
+    });
+    const r = parseToolResult(raw);
+    // curl prints only the status code; take the first 3-digit run.
+    const codeStr = (r.stdout || r.raw).trim().match(/\d{3}/)?.[0];
+    if (codeStr && Number(codeStr) === expected)
+      return ok(c, `${url} → ${codeStr}`);
+    return fail(c, `${url} returned ${codeStr ?? "no response"} (expected ${expected})`);
+  },
+
+  console_clean: async (c, ctx) => {
+    const url = str(c.spec, "url", ctx.previewUrl ?? "");
+    if (!url) return fail(c, "console_clean check is missing a url");
+    const raw = await ctx.exec("browser_console", { url });
+    const r = parseToolResult(raw);
+    const text = (r.raw || "").toLowerCase();
+    // Look for error-level console output or framework error overlays.
+    const errorHit =
+      /"type"\s*:\s*"error"/.test(text) ||
+      /\berror\b[^"]{0,40}(overlay|boundary|uncaught|unhandled)/.test(text) ||
+      /failed to compile|module not found|referenceerror|typeerror:/.test(text);
+    if (!errorHit) return ok(c, "no console errors");
+    return fail(c, `console errors on ${url}: ${clip(r.raw, 240)}`);
+  },
+
+  content: async (c, ctx) => {
+    const url = str(c.spec, "url", ctx.previewUrl ?? "");
+    const needle = str(c.spec, "contains");
+    if (!url || !needle)
+      return fail(c, "content check requires both `url` and `contains`");
+    const raw = await ctx.exec("shell_exec", {
+      projectId: ctx.projectId,
+      command: `curl -s --max-time 20 ${shellQuote(url)}`,
+    });
+    const r = parseToolResult(raw);
+    const body = r.stdout || r.raw;
+    if (body.includes(needle)) return ok(c, `found "${needle}"`);
+    return fail(c, `"${needle}" not found on ${url}`);
+  },
+
+  flow: async (c, ctx) => {
+    // A basic journey assertion: navigate to startUrl, then assert the page
+    // body contains `expectContains` (or that a follow URL is reachable).
+    const startUrl = str(c.spec, "startUrl", ctx.previewUrl ?? "");
+    const expectContains = str(c.spec, "expectContains");
+    if (!startUrl) return fail(c, "flow check is missing a startUrl");
+    const raw = await ctx.exec("browser_navigate", { url: startUrl });
+    const r = parseToolResult(raw);
+    const body = (r.stdout || r.raw).toString();
+    if (expectContains && !body.includes(expectContains))
+      return fail(c, `flow on ${startUrl}: did not reach "${expectContains}"`);
+    // Without an expected string, fall back to sniffing for an error page.
+    if (/error|cannot|failed/i.test(body) && !expectContains)
+      return fail(c, `flow on ${startUrl} hit an error page`);
+    return ok(c, `flow reached expected state`);
+  },
+
+  visual: async (c, ctx) => {
+    const targetPath = str(c.spec, "targetPath");
+    if (!targetPath) return fail(c, "visual check is missing a targetPath");
+    const raw = await ctx.exec("request_visual_qa", {
+      projectId: ctx.projectId,
+      targetPath,
+    });
+    const r = parseToolResult(raw);
+    const obj = r.obj as { score?: number; passed?: boolean } | null;
+    const threshold = num(c.spec, "minScore", 7);
+    // An explicit `passed: true` wins over the numeric score.
+    if (obj?.passed === true) return ok(c, "visual QA passed");
+    if (typeof obj?.score === "number")
+      return obj.score >= threshold
+        ? ok(c, `visual QA score ${obj.score}`)
+        : fail(c, `visual QA score ${obj.score} < ${threshold}`);
+    // No structured score — treat as advisory pass (soft checks won't block).
+    return ok(c, "visual QA ran (no numeric score)");
+  },
+};
+
+/**
+ * Run one acceptance check with the runner registered for its kind.
+ * Never rejects: an unknown kind or a runner that throws is reported as a
+ * failed check whose evidence carries the error message.
+ */
+export async function runCheck(
+  check: AcceptanceCheck,
+  ctx: ExecCtx,
+): Promise<CheckResult> {
+  const runner = RUNNERS[check.kind];
+  if (!runner) {
+    return fail(check, `unknown check kind: ${check.kind}`);
+  }
+
+  let reason: string;
+  try {
+    return await runner(check, ctx);
+  } catch (e) {
+    reason = e instanceof Error ? e.message : String(e);
+  }
+  return fail(check, `check runner errored: ${reason}`);
+}
--- a/lib/ai/verification/types.ts
+++ b/lib/ai/verification/types.ts
@@ -0,0 +1,71 @@
+/**
+ * Acceptance / Verification layer — types.
+ *
+ * A task is NOT "done" because the model stops calling tools; it is done when
+ * its Verification Contract passes. The contract is a small list of
+ * deterministic, machine-runnable checks attached to the task.
+ */
+
+/**
+ * Discriminator naming which kind of check an AcceptanceCheck performs.
+ * The trailing comment on each member states what that kind is meant to
+ * establish.
+ */
+export type CheckKind =
+  | "build" // code compiles (npm run build)
+  | "typecheck" // no type errors (tsc --noEmit)
+  | "test" // unit/integration tests pass (npm test)
+  | "server_up" // dev server boots and returns 200
+  | "route_ok" // a route/endpoint returns the expected status code
+  | "console_clean" // no runtime JS console errors on a page
+  | "content" // expected text/element present on a page
+  | "flow" // a user journey works (navigate + assert)
+  | "visual" // UI meets a design rubric (request_visual_qa)
+  | "data"; // seed/records exist (a query returns expected rows)
+
+/** One machine-runnable check in a task's Verification Contract. */
+export interface AcceptanceCheck {
+  /** Which kind of check this is (see CheckKind). */
+  kind: CheckKind;
+  /** Hard checks gate "done". Soft checks are advisory and never block. */
+  hard: boolean;
+  /** Human-readable description shown in build-health reports. */
+  description: string;
+  /** Kind-specific parameters (command, url, expectedStatus, etc.). */
+  spec: Record<string, unknown>;
+}
+
+/** Outcome of running a single AcceptanceCheck. */
+export interface CheckResult {
+  /** The check this result was produced for. */
+  check: AcceptanceCheck;
+  /** Whether the check passed. */
+  pass: boolean;
+  /** Redacted, truncated evidence — fed back to the model on failure. */
+  evidence: string;
+}
+
+/** Aggregate outcome of running a task's acceptance checks. */
+export interface VerificationReport {
+  /** True only when every HARD check passed. */
+  passed: boolean;
+  /**
+   * Individual check results.
+   * NOTE(review): presumably only the checks that actually ran (short-circuit
+   * may skip some) — confirm against the harness.
+   */
+  results: CheckResult[];
+  /** Hard failures only — these are what the model must fix. */
+  failures: CheckResult[];
+}
+
+/** A unit of work together with the acceptance checks that define "done". */
+export interface VerificationTask {
+  /** Task identifier. */
+  id: string;
+  /** Short human-readable title. */
+  title: string;
+  /** Lifecycle state of the task. */
+  status: "open" | "in_progress" | "done" | "blocked";
+  /** The checks attached to this task. */
+  acceptanceChecks: AcceptanceCheck[];
+  /**
+   * Attempt counter.
+   * NOTE(review): presumably incremented by the executor fix-loop once per
+   * execute/verify round — confirm in the executor.
+   */
+  attempts: number;
+  /**
+   * NOTE(review): presumably the failures from the most recent verification
+   * run — confirm in the executor.
+   */
+  lastFailures?: CheckResult[];
+  /** Tasks that can't be objectively verified (e.g. "make copy friendlier"). */
+  requiresHumanConfirm?: boolean;
+}
+
+/**
+ * Abstraction over the agent's tool execution. Returns the raw tool result
+ * string (usually JSON). Injecting this makes every runner unit-testable.
+ *
+ * @param name - Name of the tool to invoke (e.g. "request_visual_qa").
+ * @param args - Arguments object passed to the tool.
+ * @returns A promise for the tool's raw result string.
+ */
+export type ToolExecutor = (
+  name: string,
+  args: Record<string, unknown>,
+) => Promise<string>;
+
+/** Execution context handed to every check runner. */
+export interface ExecCtx {
+  /** Project identifier; runners forward it to tools that take a projectId. */
+  projectId: string;
+  /** Preview URL of the running dev server, when known. */
+  previewUrl?: string;
+  /** Tool executor the runners use to invoke agent tools. */
+  exec: ToolExecutor;
+}
--- a/lib/ai/verification/verification.test.ts
+++ b/lib/ai/verification/verification.test.ts
@@ -0,0 +1,365 @@
+import { describe, it, expect, vi } from "vitest";
+import { parseToolResult, runCheck, clip, redact } from "./runners";
+import { withBaseline, runVerificationContract } from "./harness";
+import {
+  normalizeAcceptanceChecks,
+  formatFailureFeedback,
+  failureSignature,
+} from "./generation";
+import { executeTask } from "./executor";
+import type {
+  AcceptanceCheck,
+  ExecCtx,
+  ToolExecutor,
+  VerificationReport,
+  VerificationTask,
+} from "./types";
+
+// Builds a fake ToolExecutor: each tool name is answered with its canned raw
+// result string from `map`, or "{}" when the tool has no entry.
+function mockExec(map: Record<string, string>): ToolExecutor {
+  return async (name: string) => {
+    const canned = map[name];
+    return canned ?? "{}";
+  };
+}
+// Builds an ExecCtx for project "p1" whose tools are served by mockExec(map).
+function ctx(map: Record<string, string>, previewUrl?: string): ExecCtx {
+  const exec = mockExec(map);
+  return { projectId: "p1", previewUrl, exec };
+}
+// Shorthand for an AcceptanceCheck whose description is simply its kind;
+// checks are hard unless `hard` is passed as false.
+const check = (
+  kind: AcceptanceCheck["kind"],
+  spec: Record<string, unknown> = {},
+  hard = true,
+): AcceptanceCheck => {
+  return { kind, hard, description: kind, spec };
+};
+
+// Builds a fresh open VerificationTask ("t1") carrying the given checks.
+const task = (checks: AcceptanceCheck[]): VerificationTask => {
+  const fresh: VerificationTask = {
+    id: "t1",
+    title: "Test task",
+    status: "open",
+    acceptanceChecks: checks,
+    attempts: 0,
+  };
+  return fresh;
+};
+
+// ── parsing ──────────────────────────────────────────────────────────────
+
+// parseToolResult: shell exit codes, nested stdout JSON, health-check status,
+// and non-JSON input.
+describe("parseToolResult", () => {
+  it("extracts code from a shell result", () => {
+    const raw = JSON.stringify({ code: 1, stdout: "", stderr: "boom" });
+    const parsed = parseToolResult(raw);
+    expect(parsed.code).toBe(1);
+    expect(parsed.stderr).toBe("boom");
+  });
+
+  it("unwraps a double-nested stdout JSON payload", () => {
+    const inner = JSON.stringify({ code: 0, stdout: "ok" });
+    const parsed = parseToolResult(JSON.stringify({ stdout: inner }));
+    expect(parsed.code).toBe(0);
+    expect(parsed.stdout).toBe("ok");
+  });
+
+  it("reads healthCheck.status for server checks", () => {
+    const payload = { previewUrl: "x", healthCheck: { status: 200 } };
+    const parsed = parseToolResult(JSON.stringify(payload));
+    expect(parsed.status).toBe(200);
+  });
+
+  it("survives non-JSON", () => {
+    const parsed = parseToolResult("not json");
+    expect(parsed.code).toBeNull();
+    expect(parsed.raw).toBe("not json");
+  });
+});
+
+// redact() replaces secrets with markers; clip() bounds evidence length.
+describe("redaction", () => {
+  it("redacts db urls and jwts and long secrets", () => {
+    const input =
+      "db postgresql://u:p4ssword@host:5432/mydb token eyJhbGciOiJIUzI1.eyJzdWIiOjEy.SflKxwRJSMeKKF secret sk_live_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789xyz";
+    const scrubbed = redact(input);
+    const markers = [
+      "[REDACTED_DB_URL]",
+      "[REDACTED_JWT]",
+      "[REDACTED_SECRET]",
+    ];
+    for (const marker of markers) {
+      expect(scrubbed).toContain(marker);
+    }
+  });
+
+  it("clip truncates and trims long non-secret text", () => {
+    const sentence = "the quick brown fox jumps over the lazy dog. ";
+    const long = sentence.repeat(20);
+    const clipped = clip(long);
+    expect(clipped.endsWith("…")).toBe(true);
+    expect(clipped.length).toBeLessThanOrEqual(401);
+  });
+});
+
+// ── runners ──────────────────────────────────────────────────────────────
+
+// Each runner is driven through runCheck with a canned tool result.
+describe("runners", () => {
+  it("build passes on exit 0, fails on non-zero with stderr", async () => {
+    const green = ctx({
+      shell_exec: JSON.stringify({ code: 0, stdout: "done" }),
+    });
+    const passed = await runCheck(check("build"), green);
+    expect(passed.pass).toBe(true);
+
+    const red = ctx({
+      shell_exec: JSON.stringify({
+        code: 1,
+        stderr: "Type error on auth.ts:14",
+      }),
+    });
+    const failed = await runCheck(check("build"), red);
+    expect(failed.pass).toBe(false);
+    expect(failed.evidence).toContain("auth.ts:14");
+  });
+
+  it("server_up passes on 200, fails otherwise", async () => {
+    // Context whose dev server reports the given health-check status.
+    const serverCtx = (status: number) =>
+      ctx({ dev_server_start: JSON.stringify({ healthCheck: { status } }) });
+
+    const healthy = await runCheck(check("server_up"), serverCtx(200));
+    expect(healthy.pass).toBe(true);
+
+    const unhealthy = await runCheck(check("server_up"), serverCtx(502));
+    expect(unhealthy.pass).toBe(false);
+  });
+
+  it("route_ok matches the expected status code", async () => {
+    // Context whose shell command exits 0 and prints the given stdout.
+    const shellCtx = (stdout: string) =>
+      ctx({ shell_exec: JSON.stringify({ code: 0, stdout }) });
+
+    const hit = await runCheck(
+      check("route_ok", { url: "http://x/dashboard", expectedStatus: 200 }),
+      shellCtx("200"),
+    );
+    expect(hit.pass).toBe(true);
+
+    const miss = await runCheck(
+      check("route_ok", { url: "http://x/dashboard" }),
+      shellCtx("404"),
+    );
+    expect(miss.pass).toBe(false);
+    expect(miss.evidence).toContain("404");
+  });
+
+  it("console_clean fails when an error is present", async () => {
+    const noisy = ctx({
+      browser_console: JSON.stringify([{ type: "error", text: "boom" }]),
+    });
+    const dirty = await runCheck(
+      check("console_clean", { url: "http://x" }),
+      noisy,
+    );
+    expect(dirty.pass).toBe(false);
+
+    const quiet = ctx({
+      browser_console: JSON.stringify([{ type: "log", text: "ok" }]),
+    });
+    const clean = await runCheck(
+      check("console_clean", { url: "http://x" }),
+      quiet,
+    );
+    expect(clean.pass).toBe(true);
+  });
+
+  it("content checks for a substring", async () => {
+    const page = ctx({
+      shell_exec: JSON.stringify({ code: 0, stdout: "<h1>GetAcquired</h1>" }),
+    });
+    const found = await runCheck(
+      check("content", { url: "http://x", contains: "GetAcquired" }),
+      page,
+    );
+    expect(found.pass).toBe(true);
+  });
+});
+
+// ── harness ──────────────────────────────────────────────────────────────
+
+// Baseline merging and hard/soft gating of the verification contract.
+describe("harness", () => {
+  it("auto-attaches the baseline contract", () => {
+    const kinds = withBaseline([], "http://preview")
+      .map((c) => c.kind)
+      .sort();
+    const baseline = ["build", "server_up", "console_clean"] as const;
+    for (const kind of baseline) {
+      expect(kinds).toContain(kind);
+    }
+  });
+
+  it("does not duplicate a baseline check the planner already specified", () => {
+    const merged = withBaseline([check("build")]);
+    const builds = merged.filter((c) => c.kind === "build");
+    expect(builds.length).toBe(1);
+  });
+
+  it("reports passed only when all hard checks pass", async () => {
+    const green = ctx({
+      shell_exec: JSON.stringify({ code: 0, stdout: "200" }),
+      dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }),
+      browser_console: JSON.stringify([]),
+    });
+    const subject = task([check("route_ok", { url: "http://x/d" })]);
+    const report = await runVerificationContract(subject, green, {
+      shortCircuit: false,
+    });
+    expect(report.passed).toBe(true);
+    expect(report.failures.length).toBe(0);
+  });
+
+  it("short-circuits on the first hard failure", async () => {
+    // Only shell_exec fails; every other tool answers with an empty object.
+    const exec = vi.fn(async (name: string) =>
+      name === "shell_exec"
+        ? JSON.stringify({ code: 1, stderr: "build broke" })
+        : "{}",
+    );
+    const report = await runVerificationContract(
+      task([]),
+      { projectId: "p1", exec },
+      { shortCircuit: true },
+    );
+    expect(report.passed).toBe(false);
+    // build is the first hard check; we should NOT have called dev_server_start.
+    expect(exec).toHaveBeenCalledWith("shell_exec", expect.anything());
+    expect(exec).not.toHaveBeenCalledWith(
+      "dev_server_start",
+      expect.anything(),
+    );
+  });
+
+  it("soft check failure does NOT block done", async () => {
+    const softVisual = check("visual", { targetPath: "x" }, false);
+    const tools = ctx({
+      shell_exec: JSON.stringify({ code: 0 }),
+      dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }),
+      browser_console: JSON.stringify([]),
+      request_visual_qa: JSON.stringify({ score: 2 }),
+    });
+    const report = await runVerificationContract(task([softVisual]), tools, {
+      shortCircuit: false,
+    });
+    // visual scored 2 (would fail) but it's soft → does not block.
+    expect(report.passed).toBe(true);
+  });
+});
+
+// ── generation ───────────────────────────────────────────────────────────
+
+// Check normalisation, failure feedback text, and failure signatures.
+describe("generation", () => {
+  it("normalizes and caps acceptance checks, dropping unknown kinds", () => {
+    const proposed = [
+      { kind: "route_ok", spec: { url: "x" } },
+      { kind: "bogus" },
+      { kind: "content", spec: { url: "x", contains: "y" } },
+      { kind: "build" },
+      { kind: "data", spec: { command: "q" } },
+    ];
+    const normalized = normalizeAcceptanceChecks(proposed);
+    expect(normalized.length).toBe(3); // capped
+    const bogus = normalized.find((c) => c.kind === ("bogus" as never));
+    expect(bogus).toBeUndefined();
+  });
+
+  it("defaults visual to a soft check", () => {
+    const [first] = normalizeAcceptanceChecks([{ kind: "visual", spec: {} }]);
+    expect(first.hard).toBe(false);
+  });
+
+  it("formats actionable failure feedback", () => {
+    const failure = {
+      check: check("build"),
+      pass: false,
+      evidence: "Cannot find name foo (auth.ts:14)",
+    };
+    const feedback = formatFailureFeedback([failure]);
+    expect(feedback).toContain("[VERIFICATION FAILED]");
+    expect(feedback).toContain("auth.ts:14");
+    expect(feedback).toContain("Do not claim success");
+  });
+
+  it("failure signatures are stable and order-independent", () => {
+    const buildFirst = failureSignature([
+      { check: check("build"), pass: false, evidence: "x" },
+      { check: check("route_ok"), pass: false, evidence: "y" },
+    ]);
+    const routeFirst = failureSignature([
+      { check: check("route_ok"), pass: false, evidence: "y" },
+      { check: check("build"), pass: false, evidence: "x" },
+    ]);
+    expect(buildFirst).toBe(routeFirst);
+  });
+});
+
+// ── executor fix-loop ────────────────────────────────────────────────────
+
+// A green report with no recorded results.
+const passReport = (): VerificationReport => {
+  return { passed: true, results: [], failures: [] };
+};
+// A red report with a single hard "build" failure carrying `evidence`.
+const failReport = (evidence: string): VerificationReport => {
+  const failure = { check: check("build"), pass: false, evidence };
+  return { passed: false, results: [], failures: [failure] };
+};
+
+// executeTask: finalize on green, retry with feedback, escalate when stuck.
+describe("executeTask fix-loop", () => {
+  it("FINALIZES immediately when the first verify passes", async () => {
+    const verify = vi.fn(async () => passReport());
+    const runExecution = vi.fn(async () => {});
+    const outcome = await executeTask(task([]), { runExecution, verify });
+    expect(outcome.status).toBe("done");
+    expect(outcome.attempts).toBe(1);
+    expect(runExecution).toHaveBeenCalledTimes(1);
+  });
+
+  it("KEEPS FIXING then finalizes when a later attempt passes", async () => {
+    const verify = vi.fn();
+    verify.mockResolvedValueOnce(failReport("err A"));
+    verify.mockResolvedValueOnce(failReport("err B")); // different evidence = progress
+    verify.mockResolvedValueOnce(passReport());
+    const seenFeedback: string[] = [];
+    const runExecution = vi.fn(async (a: { failureFeedback: string }) => {
+      seenFeedback.push(a.failureFeedback);
+    });
+    const outcome = await executeTask(task([]), { runExecution, verify });
+    expect(outcome.status).toBe("done");
+    expect(outcome.attempts).toBe(3);
+    // The 2nd execution received the 1st attempt's concrete failure as context.
+    expect(seenFeedback[1]).toContain("err A");
+  });
+
+  it("ESCALATES (blocked: no_progress) when the same failure repeats", async () => {
+    const runExecution = vi.fn(async () => {});
+    const verify = vi.fn(async () => failReport("same error"));
+    const outcome = await executeTask(task([]), {
+      runExecution,
+      verify,
+      noProgressLimit: 2,
+    });
+    expect(outcome.status).toBe("blocked");
+    if (outcome.status === "blocked") {
+      expect(outcome.reason).toBe("no_progress");
+    }
+  });
+
+  it("ESCALATES (blocked: max_attempts) if it never goes green but keeps changing", async () => {
+    let counter = 0;
+    // Every verification fails with fresh evidence, so progress never stalls.
+    const verify = vi.fn(async () => {
+      const evidence = `err ${counter}`;
+      counter += 1;
+      return failReport(evidence);
+    });
+    const runExecution = vi.fn(async () => {});
+    const outcome = await executeTask(task([]), {
+      runExecution,
+      verify,
+      maxAttempts: 3,
+    });
+    expect(outcome.status).toBe("blocked");
+    if (outcome.status === "blocked") {
+      expect(outcome.reason).toBe("max_attempts");
+      expect(outcome.attempts).toBe(3);
+    }
+  });
+
+  it("persists progress on every attempt (resume support)", async () => {
+    const verify = vi.fn();
+    verify.mockResolvedValueOnce(failReport("e1"));
+    verify.mockResolvedValueOnce(passReport());
+    const attemptsSeen: number[] = [];
+    const subject = task([]);
+    await executeTask(subject, {
+      runExecution: async () => {},
+      verify,
+      persist: (tk) => {
+        attemptsSeen.push(tk.attempts);
+      },
+    });
+    expect(attemptsSeen).toContain(1); // persisted the failing attempt
+    expect(subject.status).toBe("done");
+  });
+});