feat(verification): acceptance-check layer + executor fix-loop; hide phase-checkpoint walls; guaranteed turn-end summary. Verification gated behind VIBN_VERIFICATION_ENABLED.

This commit is contained in:
2026-06-10 19:43:36 -07:00
parent 46291becd3
commit 39cb9194a5
9 changed files with 1263 additions and 31 deletions

View File

@@ -28,6 +28,14 @@ import {
detectKnownError,
formatRecoveryMessage,
} from "@/lib/ai/error-recovery";
import {
executeTask,
runVerificationContract,
type ExecCtx,
type ExecuteTaskOutcome,
type ToolExecutor,
type VerificationTask,
} from "@/lib/ai/verification";
// --- Agent Orchestration Types & Constants ---
type TurnIntent =
@@ -668,6 +676,35 @@ function buildHealthStatus(opts: {
);
}
// Walk the conversation newest-first and return the first dev-server preview
// URL found in a tool message, so the verification layer can run
// console/route checks against the running app. For each tool message that
// mentions "preview" we try, in order: a top-level `previewUrl` JSON field, a
// `previewUrl` inside a JSON-encoded `stdout` field, and finally a plain-text
// match on the preview host pattern. Returns undefined when nothing matches.
function extractPreviewUrl(messages: ChatMessage[]): string | undefined {
  const previewHostPattern = /https:\/\/[a-z0-9-]+\.preview\.vibnai\.com/i;

  // Decode JSON, yielding null when the text is not JSON or decodes to
  // null/undefined (neither can carry a previewUrl field).
  const decode = (text: string): Record<string, unknown> | null => {
    try {
      const value: unknown = JSON.parse(text);
      if (value === null || value === undefined) return null;
      return value as Record<string, unknown>;
    } catch {
      return null;
    }
  };

  for (const msg of messages.slice().reverse()) {
    const content = msg.content;
    if (msg.role !== "tool" || typeof content !== "string") continue;
    if (!content.includes("preview")) continue;

    const outer = decode(content);
    if (outer) {
      if (typeof outer.previewUrl === "string") return outer.previewUrl;
      if (typeof outer.stdout === "string") {
        // Some tools wrap their real payload as a JSON string under `stdout`.
        const inner = decode(outer.stdout);
        if (inner && typeof inner.previewUrl === "string") {
          return inner.previewUrl;
        }
      }
    }

    // Fall back to scanning the raw text for a preview host.
    const hit = previewHostPattern.exec(content);
    if (hit) return hit[0];
  }
  return undefined;
}
export async function POST(request: Request) {
await ensureChatTables();
@@ -1067,6 +1104,11 @@ export async function POST(request: Request) {
let phase: AgentPhase = "recon";
let checkpointEmitted = false;
let verificationPassed = false;
// When C-08 forces a "Phase Checkpoint" before a mutation, the model's
// next reply is that internal planning block. We route it to the
// (hidden) thinking channel instead of showing the user a wall of
// Goal/Findings/Suspected-Cause text.
let suppressNextTextAsCheckpoint = false;
// ── Server-side conversational guard (C-03 enforcement) ───────────
// If the user's message looks conversational we withhold tools for
@@ -1100,6 +1142,62 @@ export async function POST(request: Request) {
let fileHashes = new Map<string, string>();
let stallRounds = 0;
// Corrective executor for the verification fix-loop. Runs at most `n` model
// rounds (with the "execute"-phase tools when an MCP token is available) so
// the model can repair whatever verification flagged. Each round streams any
// text to the client, records the assistant message, then executes the
// requested tool calls and appends their results to `messages`. Stops early
// when the turn is aborted or the model requests no tools.
async function runFixRounds(n: number) {
  let round = 0;
  while (round < n && !aborted) {
    round += 1;

    // Without an MCP token no tool can be executed, so none are offered.
    const toolsForFix = activeMcpToken
      ? filterToolsForPhase(VIBN_TOOL_DEFINITIONS, "execute", turnIntent)
      : [];
    const reply = await callVibnChat({
      systemPrompt,
      messages,
      tools: toolsForFix,
      temperature: 0.4,
      includeThoughts: true,
    });

    if (reply.text) {
      if (assistantText) assistantText += "\n\n";
      assistantText += reply.text;
      assistantTextSegments.push(reply.text);
      emit({ type: "text", text: reply.text });
    }

    const requestedTools = reply.toolCalls.length > 0;
    messages.push({
      role: "assistant",
      content: reply.text,
      toolCalls: requestedTools ? reply.toolCalls : undefined,
    });
    if (!requestedTools) break;

    for (const call of reply.toolCalls) {
      if (aborted) break;
      assistantToolCalls.push(call);
      emit({ type: "tool_start", name: call.name, args: call.args });

      let output: string;
      if (activeMcpToken) {
        output = await executeMcpTool(
          call.name,
          call.args,
          activeMcpToken,
          baseUrl,
          activeProject?.id,
        );
      } else {
        output = JSON.stringify({ error: "No MCP token" });
      }

      // The client only gets a short preview; the model sees the full result.
      emit({
        type: "tool_result",
        name: call.name,
        result: output.slice(0, 500),
      });
      messages.push({
        role: "tool",
        content: output,
        toolCallId: call.id,
        toolName: call.name,
        thoughtSignature: call.thoughtSignature,
      });
    }
  }
}
emit({ type: "phase", phase, label: "Investigating & Planning" });
try {
@@ -1182,6 +1280,7 @@ export async function POST(request: Request) {
findings: "Evaluating...",
});
checkpointEmitted = true;
suppressNextTextAsCheckpoint = true;
phase = "execute";
emit({ type: "phase", phase, label: "Executing Code Edits" });
continue; // Skip tool execution and re-prompt
@@ -1202,8 +1301,14 @@ export async function POST(request: Request) {
return;
}
// Stream user-facing text to client
if (resp.text) {
// Stream user-facing text to client.
// If this round's text is the forced Phase Checkpoint, route it to
// the hidden thinking channel and DON'T add it to the user-facing
// message (so it never shows live or in the persisted thread).
if (resp.text && suppressNextTextAsCheckpoint) {
emit({ type: "thinking", text: resp.text });
suppressNextTextAsCheckpoint = false;
} else if (resp.text) {
assistantText += (assistantText ? "\n\n" : "") + resp.text;
assistantTextSegments.push(resp.text);
emit({ type: "text", text: resp.text });
@@ -1420,6 +1525,82 @@ export async function POST(request: Request) {
emit({ type: "aborted" });
}
// ── Acceptance verification + corrective fix-loop (flag-gated) ──
// After a turn that mutated code, run the verification contract
// (baseline: build + server_up + console_clean). If it fails, feed the
// concrete failures back and let the model fix — iterating until green,
// stuck, or out of attempts. Off by default; enable per-environment
// with VIBN_VERIFICATION_ENABLED=1 for the live smoke test.
let verificationOutcome: ExecuteTaskOutcome | null = null;
const MUTATION_TOOLS = [
"fs_write",
"fs_edit",
"fs_delete",
"apps_deploy",
"ship",
];
const mutated = assistantToolCalls.some((tc) =>
MUTATION_TOOLS.includes(tc.name),
);
if (
process.env.VIBN_VERIFICATION_ENABLED === "1" &&
!aborted &&
mutated &&
activeProject?.id &&
activeMcpToken
) {
emit({ type: "phase", phase: "verify", label: "Verifying & fixing" });
const previewUrl = extractPreviewUrl(messages);
const verifyExec: ToolExecutor = async (name, args) =>
executeMcpTool(
name,
args,
activeMcpToken,
baseUrl,
activeProject!.id,
);
const vTask: VerificationTask = {
id: thread_id,
title: message,
status: "in_progress",
acceptanceChecks: [],
attempts: 0,
};
const verifyCtx: ExecCtx = {
projectId: activeProject.id,
previewUrl,
exec: verifyExec,
};
try {
verificationOutcome = await executeTask(vTask, {
maxAttempts: 3,
runExecution: async ({ failureFeedback, attempt }) => {
// Attempt 1 = verify what the main loop already produced.
if (attempt === 1 && !failureFeedback) return;
if (failureFeedback)
messages.push({ role: "user", content: failureFeedback });
await runFixRounds(2);
},
verify: async () => runVerificationContract(vTask, verifyCtx),
});
} catch (e) {
console.error("[Verification] errored:", e);
}
// If verification couldn't reach green, surface the specific failing
// checks as an honest status (and let the summary reflect reality).
if (verificationOutcome?.status === "blocked") {
const checkLines = verificationOutcome.failures
.map((f) => `- ${f.check.description}: ${f.evidence}`)
.join("\n");
const note =
`I made the changes but verification didn't fully pass:\n${checkLines}\n` +
`That's the honest state — want me to keep working these specific issues?`;
assistantText += (assistantText ? "\n\n" : "") + note;
assistantTextSegments.push(note);
emit({ type: "text", text: note });
}
}
// If the loop ended with the user staring at a tool tray and no
// narrative — whether because we hit MAX_TOOL_ROUNDS, broke a
// detected loop, or the model voluntarily stopped emitting tools
@@ -1492,6 +1673,36 @@ export async function POST(request: Request) {
assistantTextSegments.push(fallback);
emit({ type: "text", text: fallback });
}
} else if (!aborted && anyToolsExecuted) {
// Successful tool-using turn — guarantee it ENDS with a clean,
// human summary. We only force one when the model didn't already
// close with a substantive sentence, so we never pay for a
// redundant double-summary.
const lastSeg = (
assistantTextSegments[assistantTextSegments.length - 1] || ""
).trim();
const alreadySummarized =
lastSeg.length >= 40 && /[.!?)\]]$/.test(lastSeg);
if (!alreadySummarized) {
try {
const finalSummary = await callVibnChat({
systemPrompt:
systemPrompt +
`\n\n[FINAL SUMMARY] The work for this turn is finished. In 13 short, plain sentences, tell the user: (a) what you changed or accomplished, (b) the specific result they can see right now (a preview URL, a file, a value), and (c) the single best next step. No headings, no bullet lists, no internal jargon, and do NOT call any tools.`,
messages,
tools: [],
temperature: 0.3,
});
if (finalSummary.text && finalSummary.text.trim()) {
assistantText +=
(assistantText ? "\n\n" : "") + finalSummary.text;
assistantTextSegments.push(finalSummary.text);
emit({ type: "text", text: finalSummary.text });
}
} catch {
// Best-effort: the model's own final text remains as the ending.
}
}
}
// Last-resort guard: the model produced NO user-facing text and NO

View File

@@ -551,9 +551,17 @@ function ThinkingBubble({ thoughts }: { thoughts: string }) {
/**
 * Remove agent-internal artifacts from a message before it is rendered.
 *
 * Two things are stripped:
 *  1. `[tools executed this turn: …]` log markers (with any newlines that
 *     immediately precede them).
 *  2. The internal "Phase Checkpoint" planning block, as a safety net in case
 *     it ever reaches a user-facing message.
 *
 * @param text Raw message text; empty/falsy input is returned unchanged.
 * @returns The cleaned, trimmed text.
 */
function stripRawToolLogs(text: string): string {
  if (!text) return text;
  const withoutToolLogs = text.replace(
    /(?:\r?\n)*\[tools executed this turn:[\s\S]*?\]/g,
    "",
  );
  // Safety net: drop the "Phase Checkpoint" block (Goal / Current Findings /
  // Suspected Cause / Verification Plan). This is loop-control machinery, not
  // something the end user should read. The heading must start a line
  // (optionally prefixed by up to three `#`), and everything from the heading
  // to the END of the text is removed — the block has no reliable terminator.
  return withoutToolLogs
    .replace(/(?:^|\n)\s*#{0,3}\s*Phase Checkpoint[\s\S]*$/i, "")
    .trim();
}
const MessageBubble = React.memo(function MessageBubble({
@@ -748,32 +756,8 @@ function Timeline({ entries }: { entries: TimelineEntry[] }) {
);
}
if (item.kind === "checkpoint") {
return (
<div
key={i}
style={{
margin: "6px 0 12px",
padding: "12px 14px",
background: "oklch(0.20 0.04 35 / 0.15)",
border: "1px dashed var(--accent)",
borderRadius: 8,
fontSize: "0.75rem",
color: "var(--fg-mute)",
fontFamily: "var(--font-mono), monospace",
}}
>
<div
style={{
color: "var(--accent)",
fontWeight: "bold",
marginBottom: 4,
}}
>
[Checkpoint Logged]
</div>
<div style={{ opacity: 0.8 }}>{item.goal}</div>
</div>
);
// Internal loop-control machinery — never shown to the user.
return null;
}
return (
<TimelineToolGroup

View File

@@ -0,0 +1,127 @@
/**
* Task executor — the iterate-to-green loop.
*
* EXECUTE (model edits toward the goal, with prior failures as context)
* → TEST (run the verification contract)
* → pass? → FINALIZE (task done)
* → fail? → KEEP FIXING (feed concrete failures back)
* → stuck? → ESCALATE (re-plan or honest blocker to the user)
*
* This module is pure orchestration over injected dependencies, so the
* finalize / keep-fixing / escalate decisions are fully unit-testable without
* a live dev container.
*/
import type { CheckResult, VerificationReport, VerificationTask } from "./types";
import { failureSignature, formatFailureFeedback } from "./generation";
/**
 * Dependencies injected into `executeTask`. Everything that touches the model,
 * the tools, or storage comes in through here, so the loop itself stays pure
 * orchestration.
 */
export interface ExecuteTaskDeps {
/**
* Run one execution pass: let the model make edits toward the task's goal.
* `failureFeedback` is the structured "[VERIFICATION FAILED] …" message built
* from `task.lastFailures`; it is the empty string when the task has no
* recorded failures (normally the first attempt, but a resumed task that
* already carries `lastFailures` gets feedback immediately).
* `attempt` is `task.attempts` after it has been incremented for this cycle.
*/
runExecution: (args: {
task: VerificationTask;
failureFeedback: string;
attempt: number;
}) => Promise<void>;
/** Run the verification contract and return a structured report. */
verify: (task: VerificationTask) => Promise<VerificationReport>;
/**
* Persist task progress (attempts + lastFailures) so a turn can resume.
* Called after every verification and again when the task becomes blocked.
*/
persist?: (task: VerificationTask) => void | Promise<void>;
/**
* Max execute→verify cycles before escalating. Default 5. Compared against
* `task.attempts`, so attempts already recorded on the task count too.
*/
maxAttempts?: number;
/**
* Stop once this many consecutive reports repeat the previous report's
* failure signature. Default 2. The first failing report has nothing to
* compare against, so it never counts toward the streak.
*/
noProgressLimit?: number;
}
/**
 * Result of `executeTask`, discriminated on `status`:
 * - "done": a verification report passed; `report` is that passing report.
 * - "blocked": the loop gave up. `reason` is "no_progress" (same failures
 *   repeated) or "max_attempts" (attempt ceiling reached). `report` is the
 *   last report produced, or null when the loop never ran a verification.
 *   `failures` lists the hard failures the caller can surface.
 */
export type ExecuteTaskOutcome =
| { status: "done"; report: VerificationReport; attempts: number }
| {
status: "blocked";
report: VerificationReport | null;
attempts: number;
reason: string;
failures: CheckResult[];
};
/**
 * Drive a task through the execute → verify loop until it goes green, stops
 * making progress, or runs out of attempts.
 *
 * The task object is mutated in place (`status`, `attempts`, `lastFailures`)
 * and handed to `deps.persist` after every verification, so a later turn can
 * resume from where this one stopped.
 *
 * @param task Task to execute; `task.attempts` is honoured, so a resumed task
 *   continues counting from its previous attempts.
 * @param deps Injected execution, verification and persistence hooks.
 * @returns "done" with the passing report, or "blocked" with the reason
 *   ("no_progress" | "max_attempts") and the failures to surface.
 */
export async function executeTask(
  task: VerificationTask,
  deps: ExecuteTaskDeps,
): Promise<ExecuteTaskOutcome> {
  const maxAttempts = deps.maxAttempts ?? 5;
  const noProgressLimit = deps.noProgressLimit ?? 2;
  task.status = "in_progress";
  let prevSig: string | null = null;
  let noProgressStreak = 0;
  let lastReport: VerificationReport | null = null;

  while (task.attempts < maxAttempts) {
    task.attempts++;

    // EXECUTE — with the prior failures fed back as concrete instructions.
    const failureFeedback = task.lastFailures?.length
      ? formatFailureFeedback(task.lastFailures)
      : "";
    await deps.runExecution({
      task,
      failureFeedback,
      attempt: task.attempts,
    });

    // TEST
    const report = await deps.verify(task);
    lastReport = report;
    if (report.passed) {
      // FINALIZE
      task.status = "done";
      task.lastFailures = [];
      await deps.persist?.(task);
      return { status: "done", report, attempts: task.attempts };
    }

    // KEEP FIXING — persist the concrete failures so the next attempt (even in
    // a later HTTP turn) resumes with full context.
    task.lastFailures = report.failures;
    await deps.persist?.(task);

    // Detect no progress: the same hard failures with the same evidence as the
    // previous report. Any change in the signature resets the streak.
    const sig = failureSignature(report.failures);
    if (prevSig !== null && sig === prevSig) {
      noProgressStreak++;
    } else {
      noProgressStreak = 0;
    }
    prevSig = sig;
    if (noProgressStreak >= noProgressLimit) {
      task.status = "blocked";
      await deps.persist?.(task);
      return {
        status: "blocked",
        report,
        attempts: task.attempts,
        reason: "no_progress",
        failures: report.failures,
      };
    }
  }

  // Hit the attempt ceiling without going green. When the loop never ran
  // (a resumed task that had already used every attempt) there is no fresh
  // report, so fall back to the failures persisted on the task rather than
  // reporting a blocked task with nothing wrong.
  task.status = "blocked";
  await deps.persist?.(task);
  return {
    status: "blocked",
    report: lastReport,
    attempts: task.attempts,
    reason: "max_attempts",
    failures: lastReport?.failures ?? task.lastFailures ?? [],
  };
}

View File

@@ -0,0 +1,108 @@
/**
* Acceptance-check generation + feedback formatting.
*
* - The Planner emits `acceptanceChecks` per task using a strict schema; we
* validate/normalize that output here (models are not trustworthy emitters).
* - On a failed verification we format the failures into concrete, structured
* feedback that the next execution round consumes — this is what makes the
* model FIX rather than guess.
*/
import type { AcceptanceCheck, CheckKind, CheckResult } from "./types";
/** Every check kind accepted from the model; anything else is dropped. */
const VALID_KINDS: CheckKind[] = [
  "build",
  "typecheck",
  "test",
  "server_up",
  "route_ok",
  "console_clean",
  "content",
  "flow",
  "visual",
  "data",
];
// Soft-by-default kinds (advisory, never block "done").
const SOFT_KINDS = new Set<CheckKind>(["visual"]);
/** Upper bound on checks kept per task — keeps contracts tight (1-3 checks). */
const MAX_CHECKS_PER_TASK = 3;

/**
 * Validate and normalize a raw `acceptanceChecks` array from the model.
 *
 * - Non-array input yields an empty list.
 * - Items that are not objects, or whose `kind` is unknown, are dropped.
 * - `spec` is kept only when it is a plain (non-array) object; otherwise it
 *   becomes `{}` so downstream kind+spec de-duplication sees a canonical value.
 * - `hard` defaults to true except for soft-by-default kinds.
 * - `description` is trimmed and falls back to the kind name.
 * - At most `MAX_CHECKS_PER_TASK` checks are returned (first valid ones win).
 *
 * @param raw Untrusted value emitted by the model.
 * @returns The normalized checks, in input order.
 */
export function normalizeAcceptanceChecks(raw: unknown): AcceptanceCheck[] {
  if (!Array.isArray(raw)) return [];
  const out: AcceptanceCheck[] = [];
  for (const item of raw) {
    if (!item || typeof item !== "object") continue;
    const o = item as Record<string, unknown>;
    const kind = o.kind as CheckKind;
    if (!VALID_KINDS.includes(kind)) continue;
    // Arrays are `typeof "object"` too, but are never a valid spec.
    const spec =
      o.spec && typeof o.spec === "object" && !Array.isArray(o.spec)
        ? (o.spec as Record<string, unknown>)
        : {};
    const hard =
      typeof o.hard === "boolean" ? o.hard : !SOFT_KINDS.has(kind);
    const description =
      typeof o.description === "string" && o.description.trim()
        ? o.description.trim()
        : kind;
    out.push({ kind, hard, description, spec });
    if (out.length >= MAX_CHECKS_PER_TASK) break;
  }
  return out;
}
/**
 * Instruction appended to the Planner's system prompt so each task it creates
 * carries a checkable contract. The requested count (1-3) matches the cap
 * applied when the model's output is normalized.
 */
export const CHECK_GENERATION_PROMPT = `
[ACCEPTANCE CHECKS] For every task you create, attach \`acceptanceChecks\`: a JSON
array of 1-3 checks that objectively prove THIS task is done.
Each check: { "kind": <kind>, "hard": <bool>, "description": <string>, "spec": { ... } }
Allowed kinds and their spec:
- build spec: {} (compiles)
- typecheck spec: {} (no type errors)
- test spec: { command?: string } (tests pass)
- server_up spec: { port?: number } (app boots, 200)
- route_ok spec: { url: string, expectedStatus?: number }
- console_clean spec: { url?: string } (no JS errors)
- content spec: { url: string, contains: string } (text present)
- flow spec: { startUrl: string, expectContains: string }
- visual spec: { targetPath: string, minScore?: number } (soft)
- data spec: { command: string } (records exist)
Rules:
- build + server_up + console_clean are added AUTOMATICALLY. Do NOT repeat them.
- Add only checks that prove THIS task's specific behavior.
- Prefer the cheapest proof: route_ok/content over flow, flow over visual.
- If a task is not objectively verifiable (e.g. "make the copy friendlier"),
return an empty acceptanceChecks array and set "requiresHumanConfirm": true.
Do NOT fabricate a check you cannot actually verify.
`.trim();
/**
 * Render failed checks as the "[VERIFICATION FAILED]" message handed to the
 * next execution round: one bullet per failure (kind, description, evidence)
 * followed by a directive to fix exactly those errors.
 *
 * @param failures Failed check results to report.
 * @returns The feedback message, or "" when there are no failures.
 */
export function formatFailureFeedback(failures: CheckResult[]): string {
  if (failures.length === 0) return "";

  const bullets: string[] = [];
  for (const { check, evidence } of failures) {
    bullets.push(`- ${check.kind} (${check.description}): FAILED — ${evidence}`);
  }

  let feedback =
    "[VERIFICATION FAILED] Your last changes did not pass these checks:\n";
  feedback += bullets.join("\n");
  feedback +=
    "\nFix these specific failures. Do not claim success until every check passes. ";
  feedback +=
    "Address the exact errors above — read the relevant files first if needed.";
  return feedback;
}
/**
 * Order-independent fingerprint of a set of failures, built from each
 * failure's check kind and evidence. Two reports with the same failures and
 * evidence produce the same string — used to detect no-progress.
 */
export function failureSignature(failures: CheckResult[]): string {
  const parts: string[] = [];
  for (const failure of failures) {
    parts.push(`${failure.check.kind}:${failure.evidence}`);
  }
  parts.sort();
  return parts.join(";;");
}

View File

@@ -0,0 +1,92 @@
/**
* Verification harness — runs a task's contract and returns a structured
* pass/fail report. This is the single source of truth for "is the task done".
*/
import type {
AcceptanceCheck,
CheckResult,
ExecCtx,
VerificationReport,
VerificationTask,
} from "./types";
import { runCheck } from "./runners";
/**
 * Build the floor contract attached to every code task: the project must
 * build and the dev server must answer 200. When a preview URL is known, a
 * third hard check requires the preview to be free of runtime console errors.
 * This is what prevents "everything is scaffolded" being reported as done
 * while nothing compiles.
 *
 * @param previewUrl Preview URL of the running app, when known.
 * @returns Two checks (build, server_up), plus console_clean when a URL is given.
 */
export function baselineChecks(previewUrl?: string): AcceptanceCheck[] {
  const build: AcceptanceCheck = {
    kind: "build",
    hard: true,
    description: "Project builds without errors",
    spec: {},
  };
  const serverUp: AcceptanceCheck = {
    kind: "server_up",
    hard: true,
    description: "Dev server boots and responds 200",
    spec: { port: 3000 },
  };

  // console_clean cannot run without a URL; including it anyway would fail
  // the whole contract on a check that was never runnable.
  if (!previewUrl) return [build, serverUp];

  const consoleClean: AcceptanceCheck = {
    kind: "console_clean",
    hard: true,
    description: "Preview has no runtime console errors",
    spec: { url: previewUrl },
  };
  return [build, serverUp, consoleClean];
}
// De-duplication key for a check: its kind plus the JSON form of its spec
// (a missing spec counts as an empty object).
const KEY = (c: AcceptanceCheck) => {
  const specJson = JSON.stringify(c.spec ?? {});
  return `${c.kind}:${specJson}`;
};
/**
 * Combine a task's own checks with the baseline contract. A baseline check is
 * skipped when the task already has a check with the same kind and spec. The
 * result is ordered hard-first so the harness can stop on the first hard
 * failure before running advisory checks.
 *
 * @param checks The task's own acceptance checks (not modified).
 * @param previewUrl Preview URL forwarded to the baseline builder.
 * @returns A new array: task checks plus missing baseline checks, hard first.
 */
export function withBaseline(
  checks: AcceptanceCheck[],
  previewUrl?: string,
): AcceptanceCheck[] {
  const taken = new Set<string>();
  for (const existing of checks) taken.add(KEY(existing));

  const missingBaseline = baselineChecks(previewUrl).filter(
    (candidate) => !taken.has(KEY(candidate)),
  );
  const combined = checks.concat(missingBaseline);
  combined.sort((left, right) => Number(right.hard) - Number(left.hard));
  return combined;
}
/** Optional switches for `runVerificationContract`. */
export interface RunContractOptions {
/**
* Skip the auto-baseline (e.g. for a pure data/research task); only the
* task's own checks run. Default false.
*/
noBaseline?: boolean;
/**
* Stop after the first HARD failure (cheaper). Default true. When it
* triggers, later checks are not run and so do not appear in the report.
*/
shortCircuit?: boolean;
}
/**
 * Run a task's verification contract and summarize the outcome.
 *
 * Checks run sequentially, hard checks first. Unless `noBaseline` is set the
 * task's checks are merged with the baseline contract. With `shortCircuit`
 * (the default) the run stops at the first failing hard check.
 *
 * @param task Task whose `acceptanceChecks` form the contract.
 * @param ctx Execution context (project, preview URL, tool executor).
 * @param opts Baseline / short-circuit switches.
 * @returns `results` for every check that ran, `failures` for the hard checks
 *   that failed, and `passed` true only when there are no such failures.
 */
export async function runVerificationContract(
  task: VerificationTask,
  ctx: ExecCtx,
  opts: RunContractOptions = {},
): Promise<VerificationReport> {
  const skipBaseline = opts.noBaseline === undefined ? false : opts.noBaseline;
  const stopOnHardFailure =
    opts.shortCircuit === undefined ? true : opts.shortCircuit;

  let ordered: AcceptanceCheck[];
  if (skipBaseline) {
    ordered = task.acceptanceChecks.slice();
    ordered.sort((left, right) => Number(right.hard) - Number(left.hard));
  } else {
    ordered = withBaseline(task.acceptanceChecks, ctx.previewUrl);
  }

  const results: CheckResult[] = [];
  for (const current of ordered) {
    const outcome = await runCheck(current, ctx);
    results.push(outcome);
    const hardFailure = !outcome.pass && current.hard;
    if (stopOnHardFailure && hardFailure) break;
  }

  // Only hard failures gate completion; soft failures stay in `results`.
  const failures = results.filter((entry) => !entry.pass && entry.check.hard);
  return { passed: failures.length === 0, results, failures };
}

View File

@@ -0,0 +1,5 @@
export * from "./types";
export * from "./runners";
export * from "./harness";
export * from "./generation";
export * from "./executor";

View File

@@ -0,0 +1,269 @@
/**
* Acceptance check runners.
*
* Each runner maps a single AcceptanceCheck to a deterministic tool invocation
* and returns a structured { pass, evidence }. Runners depend only on the
* injected ToolExecutor, so they are fully unit-testable with mocked outputs.
*/
import type {
AcceptanceCheck,
CheckKind,
CheckResult,
ExecCtx,
} from "./types";
// ── helpers ────────────────────────────────────────────────────────────────
/**
 * Mask secrets before text is stored as evidence. Applied in order:
 * Postgres connection URLs with credentials, three-part `eyJ…` tokens (JWTs),
 * then any remaining run of 40+ identifier characters.
 */
export function redact(s: string): string {
  const rules: Array<[RegExp, string]> = [
    [
      /postgres(?:ql)?:\/\/[^:\s]+:[^@\s]+@[^/\s]+\/[^\s"']+/gi,
      "postgresql://[REDACTED_DB_URL]",
    ],
    [
      /eyJ[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}/g,
      "[REDACTED_JWT]",
    ],
    [/\b[A-Za-z0-9_-]{40,}\b/g, "[REDACTED_SECRET]"],
  ];
  let masked = s;
  for (const [pattern, replacement] of rules) {
    masked = masked.replace(pattern, replacement);
  }
  return masked;
}
/**
 * Prepare text for use as evidence: collapse whitespace runs to single
 * spaces, trim, redact secrets, and cut to at most `n` characters (plus a
 * trailing "…" when truncated).
 */
export function clip(s: string, n = 400): string {
  const collapsed = String(s ?? "").replace(/\s+/g, " ").trim();
  const safe = redact(collapsed);
  if (safe.length <= n) return safe;
  return safe.slice(0, n) + "…";
}
/**
 * Parse a raw tool result into a normalized shape. Tool results come back as a
 * JSON string; shapes vary by tool, so we extract defensively. Some tools
 * double-wrap (a `stdout` field that is itself JSON) — we unwrap one level.
 */
export interface ParsedToolResult {
// Exit code taken from `code` or `exitCode` (the latter wins); null if absent.
code: number | null;
// Captured standard output, or "" when the payload has no string `stdout`.
stdout: string;
// Captured standard error, or "" when the payload has no string `stderr`.
stderr: string;
// Numeric status from `healthCheck.status` or a top-level `status`
// (the latter wins); null if absent.
status: number | null; // healthCheck.status, etc.
// The tool result exactly as received, coerced to a string.
raw: string;
// The outer decoded JSON object (before any unwrapping), or null when the
// result was not a JSON object.
obj: Record<string, unknown> | null;
}
/**
 * Decode a raw tool result string into a `ParsedToolResult`.
 *
 * Non-JSON input (or JSON that is not an object) yields the defaults with only
 * `raw` filled in. When the outer object has a string `stdout` but neither
 * `code` nor `healthCheck`, `stdout` is treated as a possible JSON wrapper and
 * unwrapped one level; fields are then read from that inner payload.
 *
 * @param raw Tool result as returned by the tool executor.
 * @returns The normalized result; never throws on malformed input.
 */
export function parseToolResult(raw: string): ParsedToolResult {
  const result: ParsedToolResult = {
    code: null,
    stdout: "",
    stderr: "",
    status: null,
    raw: String(raw ?? ""),
    obj: null,
  };

  // JSON-decode `source`; null unless it decodes to a non-null object.
  const decodeObject = (source: string): Record<string, unknown> | null => {
    try {
      const value: unknown = JSON.parse(source);
      if (value !== null && typeof value === "object") {
        return value as Record<string, unknown>;
      }
      return null;
    } catch {
      return null;
    }
  };

  const outer = decodeObject(result.raw);
  if (outer === null) return result;
  result.obj = outer;

  // Some wrappers nest the real payload under `stdout` as a JSON string.
  const looksWrapped =
    typeof outer.stdout === "string" &&
    outer.code === undefined &&
    outer.healthCheck === undefined;
  const payload = looksWrapped
    ? (decodeObject(outer.stdout as string) ?? outer)
    : outer;

  // Later candidates win: `exitCode` over `code`.
  for (const candidate of [payload.code, payload.exitCode]) {
    if (typeof candidate === "number") result.code = candidate;
  }
  if (typeof payload.stdout === "string") result.stdout = payload.stdout;
  if (typeof payload.stderr === "string") result.stderr = payload.stderr;

  // Later candidates win: top-level `status` over `healthCheck.status`.
  const health = payload.healthCheck as { status?: number } | undefined;
  for (const candidate of [health ? health.status : undefined, payload.status]) {
    if (typeof candidate === "number") result.status = candidate;
  }
  return result;
}
// Build a passing result for `check`; the evidence is clipped/redacted.
function ok(check: AcceptanceCheck, evidence = "passed"): CheckResult {
  const clipped = clip(evidence);
  return { check, pass: true, evidence: clipped };
}
// Build a failing result for `check`; the evidence is clipped/redacted.
function fail(check: AcceptanceCheck, evidence: string): CheckResult {
  const clipped = clip(evidence);
  return { check, pass: false, evidence: clipped };
}
// Read `spec[key]` as a string, falling back to `dflt` for any other type.
function str(spec: Record<string, unknown>, key: string, dflt = ""): string {
  const candidate = spec[key];
  if (typeof candidate !== "string") return dflt;
  return candidate;
}
// Read `spec[key]` as a number, falling back to `dflt` for any other type.
function num(
  spec: Record<string, unknown>,
  key: string,
  dflt: number,
): number {
  const candidate = spec[key];
  if (typeof candidate !== "number") return dflt;
  return candidate;
}
// ── runners ────────────────────────────────────────────────────────────────
/**
 * Run `command` through the `shell_exec` tool and judge it by exit code:
 * exit 0 passes; anything else (including a missing code) fails, with stderr,
 * then stdout, then the exit code as evidence. `label` names the check in the
 * evidence text.
 */
async function runShellExit(
  check: AcceptanceCheck,
  ctx: ExecCtx,
  command: string,
  label: string,
): Promise<CheckResult> {
  const parsed = parseToolResult(
    await ctx.exec("shell_exec", {
      projectId: ctx.projectId,
      command,
    }),
  );
  if (parsed.code === 0) return ok(check, `${label} passed`);

  const exitLabel = parsed.code ?? "?";
  const detail = parsed.stderr || parsed.stdout || `exit ${exitLabel}`;
  return fail(check, `${label} failed (exit ${exitLabel}): ${detail}`);
}
/**
 * Quote `value` as a single POSIX-shell word. Inside single quotes the shell
 * performs no expansion at all, so `$(…)`, backticks and `$VAR` in a
 * model-supplied URL cannot run. Embedded single quotes are emitted as `'\''`.
 */
function shellQuote(value: string): string {
  return "'" + value.replace(/'/g, "'\\''") + "'";
}

/**
 * One runner per check kind. Each runner turns a check's `spec` into a single
 * tool invocation via `ctx.exec` and maps the tool result to pass/fail with
 * evidence. Runners report a missing required spec field as a failure rather
 * than running a meaningless command.
 */
const RUNNERS: Record<
  CheckKind,
  (check: AcceptanceCheck, ctx: ExecCtx) => Promise<CheckResult>
> = {
  // Shell-exit checks: pass on exit code 0. `spec.command` overrides the default.
  build: (c, ctx) =>
    runShellExit(c, ctx, str(c.spec, "command", "npm run build"), "build"),
  typecheck: (c, ctx) =>
    runShellExit(
      c,
      ctx,
      str(c.spec, "command", "npx tsc --noEmit"),
      "typecheck",
    ),
  test: (c, ctx) =>
    runShellExit(c, ctx, str(c.spec, "command", "npm test"), "tests"),
  data: async (c, ctx) => {
    // There is no sensible default query; an empty command must not be run
    // (and must not be allowed to pass vacuously).
    const command = str(c.spec, "command");
    if (!command.trim()) return fail(c, "data check is missing a command");
    return runShellExit(c, ctx, command, "data check");
  },
  // Starts the dev server and passes only when the parsed status is 200.
  server_up: async (c, ctx) => {
    const raw = await ctx.exec("dev_server_start", {
      projectId: ctx.projectId,
      command: str(c.spec, "command", "npm run dev"),
      port: num(c.spec, "port", 3000),
    });
    const r = parseToolResult(raw);
    if (r.status === 200) return ok(c, "dev server returned 200");
    return fail(
      c,
      `dev server not healthy (status ${r.status ?? "none"}): ${
        r.stderr || r.stdout || r.raw
      }`,
    );
  },
  // Fetches `spec.url` with curl and compares the HTTP status code.
  route_ok: async (c, ctx) => {
    const url = str(c.spec, "url");
    const expected = num(c.spec, "expectedStatus", 200);
    if (!url) return fail(c, "route_ok check is missing a url");
    const raw = await ctx.exec("shell_exec", {
      projectId: ctx.projectId,
      command: `curl -s -o /dev/null -w "%{http_code}" --max-time 20 ${shellQuote(
        url,
      )}`,
    });
    const r = parseToolResult(raw);
    // First three-digit run in the output is taken as the status code.
    const codeStr = (r.stdout || r.raw).trim().match(/\d{3}/)?.[0];
    if (codeStr && Number(codeStr) === expected)
      return ok(c, `${url} returned ${codeStr}`);
    return fail(c, `${url} returned ${codeStr ?? "no response"} (expected ${expected})`);
  },
  // Asks the browser tool for console output and scans it for error markers.
  console_clean: async (c, ctx) => {
    const url = str(c.spec, "url", ctx.previewUrl ?? "");
    if (!url) return fail(c, "console_clean check is missing a url");
    const raw = await ctx.exec("browser_console", { url });
    const r = parseToolResult(raw);
    const text = (r.raw || "").toLowerCase();
    // Look for error-level console output or framework error overlays.
    const errorHit =
      /"type"\s*:\s*"error"/.test(text) ||
      /\berror\b[^"]{0,40}(overlay|boundary|uncaught|unhandled)/.test(text) ||
      /failed to compile|module not found|referenceerror|typeerror:/.test(text);
    if (!errorHit) return ok(c, "no console errors");
    return fail(c, `console errors on ${url}: ${clip(r.raw, 240)}`);
  },
  // Fetches the page with curl and looks for `spec.contains` in the body.
  content: async (c, ctx) => {
    const url = str(c.spec, "url", ctx.previewUrl ?? "");
    const needle = str(c.spec, "contains");
    if (!url || !needle)
      return fail(c, "content check requires both `url` and `contains`");
    const raw = await ctx.exec("shell_exec", {
      projectId: ctx.projectId,
      command: `curl -s --max-time 20 ${shellQuote(url)}`,
    });
    const r = parseToolResult(raw);
    const body = r.stdout || r.raw;
    if (body.includes(needle)) return ok(c, `found "${needle}"`);
    return fail(c, `"${needle}" not found on ${url}`);
  },
  flow: async (c, ctx) => {
    // A basic journey assertion: navigate to startUrl, then assert the page
    // body contains `expectContains`. Without `expectContains`, the body is
    // only screened for error-looking words.
    const startUrl = str(c.spec, "startUrl", ctx.previewUrl ?? "");
    const expectContains = str(c.spec, "expectContains");
    if (!startUrl) return fail(c, "flow check is missing a startUrl");
    const raw = await ctx.exec("browser_navigate", { url: startUrl });
    const r = parseToolResult(raw);
    const body = (r.stdout || r.raw).toString();
    if (expectContains && !body.includes(expectContains))
      return fail(c, `flow on ${startUrl}: did not reach "${expectContains}"`);
    if (/error|cannot|failed/i.test(body) && !expectContains)
      return fail(c, `flow on ${startUrl} hit an error page`);
    return ok(c, `flow reached expected state`);
  },
  // Delegates to the visual QA tool; passes on `passed: true` or a score at
  // or above `spec.minScore` (default 7).
  visual: async (c, ctx) => {
    const targetPath = str(c.spec, "targetPath");
    if (!targetPath) return fail(c, "visual check is missing a targetPath");
    const raw = await ctx.exec("request_visual_qa", {
      projectId: ctx.projectId,
      targetPath,
    });
    const r = parseToolResult(raw);
    const obj = r.obj as { score?: number; passed?: boolean } | null;
    const threshold = num(c.spec, "minScore", 7);
    if (obj?.passed === true) return ok(c, "visual QA passed");
    if (typeof obj?.score === "number")
      return obj.score >= threshold
        ? ok(c, `visual QA score ${obj.score}`)
        : fail(c, `visual QA score ${obj.score} < ${threshold}`);
    // No structured score — treat as advisory pass (soft checks won't block).
    return ok(c, "visual QA ran (no numeric score)");
  },
};
/**
 * Execute a single acceptance check with the runner registered for its kind.
 * An unknown kind, or a runner that throws, is reported as a failed check
 * instead of propagating the error.
 */
export async function runCheck(
  check: AcceptanceCheck,
  ctx: ExecCtx,
): Promise<CheckResult> {
  const runner = RUNNERS[check.kind];
  if (runner) {
    try {
      return await runner(check, ctx);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      return fail(check, `check runner errored: ${reason}`);
    }
  }
  return fail(check, `unknown check kind: ${check.kind}`);
}

View File

@@ -0,0 +1,71 @@
/**
* Acceptance / Verification layer — types.
*
* A task is NOT "done" because the model stops calling tools; it is done when
* its Verification Contract passes. The contract is a small list of
* deterministic, machine-runnable checks attached to the task.
*/
/**
 * The kinds of acceptance check a task's contract may contain. The trailing
 * comments describe what each kind asserts.
 */
export type CheckKind =
| "build" // code compiles (npm run build)
| "typecheck" // no type errors (tsc --noEmit)
| "test" // unit/integration tests pass (npm test)
| "server_up" // dev server boots and returns 200
| "route_ok" // a route/endpoint returns the expected status code
| "console_clean" // no runtime JS console errors on a page
| "content" // expected text/element present on a page
| "flow" // a user journey works (navigate + assert)
| "visual" // UI meets a design rubric (request_visual_qa)
| "data"; // seed/records exist (a query returns expected rows)
/** A single machine-runnable check in a task's verification contract. */
export interface AcceptanceCheck {
/** Which kind of check to run. */
kind: CheckKind;
/** Hard checks gate "done". Soft checks are advisory and never block. */
hard: boolean;
/** Human-readable description shown in build-health reports. */
description: string;
/** Kind-specific parameters (command, url, expectedStatus, etc.). */
spec: Record<string, unknown>;
}
/** Outcome of running one acceptance check. */
export interface CheckResult {
/** The check that was run. */
check: AcceptanceCheck;
/** Whether the check passed. */
pass: boolean;
/** Redacted, truncated evidence — fed back to the model on failure. */
evidence: string;
}
/** Aggregate outcome of running a verification contract. */
export interface VerificationReport {
/** True only when every HARD check passed. */
passed: boolean;
/** One entry per check that was run (hard and soft). */
results: CheckResult[];
/** Hard failures only — these are what the model must fix. */
failures: CheckResult[];
}
/** A unit of work together with the contract that decides when it is done. */
export interface VerificationTask {
id: string;
title: string;
/** Lifecycle state of the task. */
status: "open" | "in_progress" | "done" | "blocked";
/** The task's own checks. */
acceptanceChecks: AcceptanceCheck[];
/** Number of execute→verify cycles run so far. */
attempts: number;
/** Hard failures from the most recent failed verification, if any. */
lastFailures?: CheckResult[];
/** Tasks that can't be objectively verified (e.g. "make copy friendlier"). */
requiresHumanConfirm?: boolean;
}
/**
 * Abstraction over the agent's tool execution. Returns the raw tool result
 * string (usually JSON). Injecting this makes every runner unit-testable.
 *
 * `name` is the tool to invoke and `args` are its arguments.
 */
export type ToolExecutor = (
name: string,
args: Record<string, unknown>,
) => Promise<string>;
/** Everything a check runner needs in order to execute. */
export interface ExecCtx {
/** Project the checks run against. */
projectId: string;
/** Preview URL of the running dev server, when known. */
previewUrl?: string;
/** Tool executor used for every tool invocation made by a runner. */
exec: ToolExecutor;
}

View File

@@ -0,0 +1,365 @@
import { describe, it, expect, vi } from "vitest";
import { parseToolResult, runCheck, clip, redact } from "./runners";
import { withBaseline, runVerificationContract } from "./harness";
import {
normalizeAcceptanceChecks,
formatFailureFeedback,
failureSignature,
} from "./generation";
import { executeTask } from "./executor";
import type {
AcceptanceCheck,
ExecCtx,
ToolExecutor,
VerificationReport,
VerificationTask,
} from "./types";
// Fake tool executor for tests: looks the tool name up in `map` and resolves
// with the canned raw result, or "{}" when the tool has no entry.
function mockExec(map: Record<string, string>): ToolExecutor {
  return async (name: string) => {
    const canned = map[name];
    return canned ?? "{}";
  };
}
// Build an execution context for project "p1" whose tools answer from `map`.
function ctx(map: Record<string, string>, previewUrl?: string): ExecCtx {
  const exec = mockExec(map);
  return { projectId: "p1", previewUrl, exec };
}
// Shorthand for an acceptance check whose description is its kind.
const check = (
  kind: AcceptanceCheck["kind"],
  spec: Record<string, unknown> = {},
  hard = true,
): AcceptanceCheck => {
  const built: AcceptanceCheck = { kind, hard, description: kind, spec };
  return built;
};
/**
 * Builds a fresh, never-attempted VerificationTask fixture ("t1", status
 * "open") carrying the supplied acceptance checks.
 *
 * @param checks - Checks to attach; the array is stored as-is, not copied.
 */
const task = (checks: AcceptanceCheck[]): VerificationTask => {
  const fixture: VerificationTask = {
    id: "t1",
    title: "Test task",
    status: "open",
    acceptanceChecks: checks,
    attempts: 0,
  };
  return fixture;
};
// ── parsing ──────────────────────────────────────────────────────────────
// parseToolResult must cope with each raw result shape exercised below:
// flat shell results, JSON nested inside stdout, health-check payloads, and
// text that is not JSON at all.
describe("parseToolResult", () => {
  it("extracts code from a shell result", () => {
    const raw = JSON.stringify({ code: 1, stdout: "", stderr: "boom" });
    const parsed = parseToolResult(raw);

    expect(parsed.code).toBe(1);
    expect(parsed.stderr).toBe("boom");
  });

  it("unwraps a double-nested stdout JSON payload", () => {
    // The real payload is itself JSON, carried as a string in `stdout`.
    const inner = JSON.stringify({ code: 0, stdout: "ok" });
    const parsed = parseToolResult(JSON.stringify({ stdout: inner }));

    expect(parsed.code).toBe(0);
    expect(parsed.stdout).toBe("ok");
  });

  it("reads healthCheck.status for server checks", () => {
    const raw = JSON.stringify({
      previewUrl: "x",
      healthCheck: { status: 200 },
    });
    const parsed = parseToolResult(raw);

    expect(parsed.status).toBe(200);
  });

  it("survives non-JSON", () => {
    const parsed = parseToolResult("not json");

    expect(parsed.code).toBeNull();
    expect(parsed.raw).toBe("not json");
  });
});
// Evidence is fed back to the model, so secrets must be masked and long text
// must be clipped.
describe("redaction", () => {
  it("redacts db urls and jwts and long secrets", () => {
    const sensitive =
      "db postgresql://u:p4ssword@host:5432/mydb token eyJhbGciOiJIUzI1.eyJzdWIiOjEy.SflKxwRJSMeKKF secret sk_live_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789xyz";
    const scrubbed = redact(sensitive);

    expect(scrubbed).toContain("[REDACTED_DB_URL]");
    expect(scrubbed).toContain("[REDACTED_JWT]");
    expect(scrubbed).toContain("[REDACTED_SECRET]");
  });

  it("clip truncates and trims long non-secret text", () => {
    const sentence = "the quick brown fox jumps over the lazy dog. ";
    const clipped = clip(sentence.repeat(20));

    // Truncated output ends with an ellipsis and stays within 401 chars.
    expect(clipped.endsWith("…")).toBe(true);
    expect(clipped.length).toBeLessThanOrEqual(401);
  });
});
// ── runners ──────────────────────────────────────────────────────────────
// Each case runs one check kind against a mocked tool result and inspects the
// resulting pass flag (and evidence where relevant).
describe("runners", () => {
  it("build passes on exit 0, fails on non-zero with stderr", async () => {
    const greenCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "done" }),
    });
    const green = await runCheck(check("build"), greenCtx);
    expect(green.pass).toBe(true);

    const redCtx = ctx({
      shell_exec: JSON.stringify({
        code: 1,
        stderr: "Type error on auth.ts:14",
      }),
    });
    const red = await runCheck(check("build"), redCtx);
    expect(red.pass).toBe(false);
    // The compiler message must survive into the evidence.
    expect(red.evidence).toContain("auth.ts:14");
  });

  it("server_up passes on 200, fails otherwise", async () => {
    const healthyCtx = ctx({
      dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }),
    });
    const healthy = await runCheck(check("server_up"), healthyCtx);
    expect(healthy.pass).toBe(true);

    const brokenCtx = ctx({
      dev_server_start: JSON.stringify({ healthCheck: { status: 502 } }),
    });
    const broken = await runCheck(check("server_up"), brokenCtx);
    expect(broken.pass).toBe(false);
  });

  it("route_ok matches the expected status code", async () => {
    const okCheck = check("route_ok", {
      url: "http://x/dashboard",
      expectedStatus: 200,
    });
    const okCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "200" }),
    });
    const ok = await runCheck(okCheck, okCtx);
    expect(ok.pass).toBe(true);

    // No expectedStatus given here: the runner's default applies.
    const missingCheck = check("route_ok", { url: "http://x/dashboard" });
    const missingCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "404" }),
    });
    const missing = await runCheck(missingCheck, missingCtx);
    expect(missing.pass).toBe(false);
    expect(missing.evidence).toContain("404");
  });

  it("console_clean fails when an error is present", async () => {
    const noisyCtx = ctx({
      browser_console: JSON.stringify([{ type: "error", text: "boom" }]),
    });
    const noisy = await runCheck(
      check("console_clean", { url: "http://x" }),
      noisyCtx,
    );
    expect(noisy.pass).toBe(false);

    const quietCtx = ctx({
      browser_console: JSON.stringify([{ type: "log", text: "ok" }]),
    });
    const quiet = await runCheck(
      check("console_clean", { url: "http://x" }),
      quietCtx,
    );
    expect(quiet.pass).toBe(true);
  });

  it("content checks for a substring", async () => {
    const pageCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "<h1>GetAcquired</h1>" }),
    });
    const found = await runCheck(
      check("content", { url: "http://x", contains: "GetAcquired" }),
      pageCtx,
    );
    expect(found.pass).toBe(true);
  });
});
// ── harness ──────────────────────────────────────────────────────────────
// Covers baseline-contract merging and how the contract runner aggregates
// hard and soft check results.
describe("harness", () => {
  it("auto-attaches the baseline contract", () => {
    const contract = withBaseline([], "http://preview");
    const attachedKinds = contract.map((entry) => entry.kind).sort();

    expect(attachedKinds).toContain("build");
    expect(attachedKinds).toContain("server_up");
    expect(attachedKinds).toContain("console_clean");
  });

  it("does not duplicate a baseline check the planner already specified", () => {
    const contract = withBaseline([check("build")]);
    const buildChecks = contract.filter((entry) => entry.kind === "build");

    expect(buildChecks.length).toBe(1);
  });

  it("reports passed only when all hard checks pass", async () => {
    const subject = task([check("route_ok", { url: "http://x/d" })]);
    // Every tool the contract may call answers with a healthy result.
    const allGreen = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "200" }),
      dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }),
      browser_console: JSON.stringify([]),
    });

    const report = await runVerificationContract(subject, allGreen, {
      shortCircuit: false,
    });

    expect(report.passed).toBe(true);
    expect(report.failures.length).toBe(0);
  });

  it("short-circuits on the first hard failure", async () => {
    const exec = vi.fn(async (name: string) =>
      name === "shell_exec"
        ? JSON.stringify({ code: 1, stderr: "build broke" })
        : "{}",
    );

    const report = await runVerificationContract(
      task([]),
      { projectId: "p1", exec },
      { shortCircuit: true },
    );

    expect(report.passed).toBe(false);
    // build is the first hard check; we should NOT have called dev_server_start.
    expect(exec).toHaveBeenCalledWith("shell_exec", expect.anything());
    expect(exec).not.toHaveBeenCalledWith(
      "dev_server_start",
      expect.anything(),
    );
  });

  it("soft check failure does NOT block done", async () => {
    const softVisual = check("visual", { targetPath: "x" }, false);
    const lowScoreCtx = ctx({
      shell_exec: JSON.stringify({ code: 0 }),
      dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }),
      browser_console: JSON.stringify([]),
      request_visual_qa: JSON.stringify({ score: 2 }),
    });

    const report = await runVerificationContract(
      task([softVisual]),
      lowScoreCtx,
      { shortCircuit: false },
    );

    // visual scored 2 (would fail) but it's soft → does not block.
    expect(report.passed).toBe(true);
  });
});
// ── generation ───────────────────────────────────────────────────────────
// Covers normalisation of planner-supplied checks, the failure feedback text,
// and the failure signature.
describe("generation", () => {
  it("normalizes and caps acceptance checks, dropping unknown kinds", () => {
    // Five candidates: one has an unknown kind; the result holds three.
    const normalized = normalizeAcceptanceChecks([
      { kind: "route_ok", spec: { url: "x" } },
      { kind: "bogus" },
      { kind: "content", spec: { url: "x", contains: "y" } },
      { kind: "build" },
      { kind: "data", spec: { command: "q" } },
    ]);

    expect(normalized.length).toBe(3); // capped
    const bogusEntry = normalized.find(
      (entry) => entry.kind === ("bogus" as never),
    );
    expect(bogusEntry).toBeUndefined();
  });

  it("defaults visual to a soft check", () => {
    const normalized = normalizeAcceptanceChecks([
      { kind: "visual", spec: {} },
    ]);

    expect(normalized[0].hard).toBe(false);
  });

  it("formats actionable failure feedback", () => {
    const feedback = formatFailureFeedback([
      {
        check: check("build"),
        pass: false,
        evidence: "Cannot find name foo (auth.ts:14)",
      },
    ]);

    expect(feedback).toContain("[VERIFICATION FAILED]");
    expect(feedback).toContain("auth.ts:14");
    expect(feedback).toContain("Do not claim success");
  });

  it("failure signatures are stable and order-independent", () => {
    // Fresh objects per call so the two lists share no references.
    const failed = (kind: AcceptanceCheck["kind"], evidence: string) => ({
      check: check(kind),
      pass: false,
      evidence,
    });

    const forward = failureSignature([
      failed("build", "x"),
      failed("route_ok", "y"),
    ]);
    const reversed = failureSignature([
      failed("route_ok", "y"),
      failed("build", "x"),
    ]);

    expect(forward).toBe(reversed);
  });
});
// ── executor fix-loop ────────────────────────────────────────────────────
/**
 * Builds a green VerificationReport fixture: passed, with no results and no
 * failures. A new object is returned on every call.
 */
const passReport = (): VerificationReport => {
  const green: VerificationReport = {
    passed: true,
    results: [],
    failures: [],
  };
  return green;
};
/**
 * Builds a red VerificationReport fixture whose only failure is a hard
 * "build" check carrying the given evidence.
 *
 * @param evidence - Evidence text to attach to the failing build check.
 */
const failReport = (evidence: string): VerificationReport => {
  const buildFailure = { check: check("build"), pass: false, evidence };
  return { passed: false, results: [], failures: [buildFailure] };
};
// Drives executeTask with stubbed `runExecution` / `verify` callbacks and
// checks the outcome it reports: done, blocked (no_progress), or blocked
// (max_attempts).
describe("executeTask fix-loop", () => {
  it("FINALIZES immediately when the first verify passes", async () => {
    const runExecution = vi.fn(async () => {});
    const verify = vi.fn(async () => passReport());

    const outcome = await executeTask(task([]), { runExecution, verify });

    expect(outcome.status).toBe("done");
    expect(outcome.attempts).toBe(1);
    expect(runExecution).toHaveBeenCalledTimes(1);
  });

  it("KEEPS FIXING then finalizes when a later attempt passes", async () => {
    const verify = vi
      .fn()
      .mockResolvedValueOnce(failReport("err A"))
      .mockResolvedValueOnce(failReport("err B")) // different evidence = progress
      .mockResolvedValueOnce(passReport());
    // Records the feedback handed to each execution, in call order.
    const seenFeedback: string[] = [];
    const runExecution = vi.fn(async (attempt: { failureFeedback: string }) => {
      seenFeedback.push(attempt.failureFeedback);
    });

    const outcome = await executeTask(task([]), { runExecution, verify });

    expect(outcome.status).toBe("done");
    expect(outcome.attempts).toBe(3);
    // The 2nd execution received the 1st attempt's concrete failure as context.
    expect(seenFeedback[1]).toContain("err A");
  });

  it("ESCALATES (blocked: no_progress) when the same failure repeats", async () => {
    const runExecution = vi.fn(async () => {});
    const verify = vi.fn(async () => failReport("same error"));

    const outcome = await executeTask(task([]), {
      runExecution,
      verify,
      noProgressLimit: 2,
    });

    expect(outcome.status).toBe("blocked");
    // Narrow the outcome before reading `reason`.
    if (outcome.status === "blocked") {
      expect(outcome.reason).toBe("no_progress");
    }
  });

  it("ESCALATES (blocked: max_attempts) if it never goes green but keeps changing", async () => {
    let failureCount = 0;
    const runExecution = vi.fn(async () => {});
    const verify = vi.fn(async () => failReport(`err ${failureCount++}`)); // always different

    const outcome = await executeTask(task([]), {
      runExecution,
      verify,
      maxAttempts: 3,
    });

    expect(outcome.status).toBe("blocked");
    if (outcome.status === "blocked") {
      expect(outcome.reason).toBe("max_attempts");
      expect(outcome.attempts).toBe(3);
    }
  });

  it("persists progress on every attempt (resume support)", async () => {
    const verify = vi
      .fn()
      .mockResolvedValueOnce(failReport("e1"))
      .mockResolvedValueOnce(passReport());
    // Attempt counter observed at each persist call.
    const persistedAttempts: number[] = [];
    const subject = task([]);

    await executeTask(subject, {
      runExecution: async () => {},
      verify,
      persist: (snapshot) => {
        persistedAttempts.push(snapshot.attempts);
      },
    });

    expect(persistedAttempts).toContain(1); // persisted the failing attempt
    expect(subject.status).toBe("done");
  });
});