diff --git a/vibn-frontend/app/api/chat/route.ts b/vibn-frontend/app/api/chat/route.ts index b68bd76f..082cdd14 100644 --- a/vibn-frontend/app/api/chat/route.ts +++ b/vibn-frontend/app/api/chat/route.ts @@ -28,6 +28,14 @@ import { detectKnownError, formatRecoveryMessage, } from "@/lib/ai/error-recovery"; +import { + executeTask, + runVerificationContract, + type ExecCtx, + type ExecuteTaskOutcome, + type ToolExecutor, + type VerificationTask, +} from "@/lib/ai/verification"; // --- Agent Orchestration Types & Constants --- type TurnIntent = @@ -668,6 +676,35 @@ function buildHealthStatus(opts: { ); } +// Scan tool results (most-recent first) for a dev-server preview URL so the +// verification layer can run console/route checks against the running app. +function extractPreviewUrl(messages: ChatMessage[]): string | undefined { + for (let i = messages.length - 1; i >= 0; i--) { + const m = messages[i]; + if (m.role !== "tool" || typeof m.content !== "string") continue; + if (!m.content.includes("preview")) continue; + try { + const p = JSON.parse(m.content) as Record; + if (typeof p.previewUrl === "string") return p.previewUrl; + if (typeof p.stdout === "string") { + try { + const inner = JSON.parse(p.stdout) as Record; + if (typeof inner.previewUrl === "string") return inner.previewUrl; + } catch { + /* stdout not JSON */ + } + } + } catch { + /* not JSON */ + } + const match = m.content.match( + /https:\/\/[a-z0-9-]+\.preview\.vibnai\.com/i, + ); + if (match) return match[0]; + } + return undefined; +} + export async function POST(request: Request) { await ensureChatTables(); @@ -1067,6 +1104,11 @@ export async function POST(request: Request) { let phase: AgentPhase = "recon"; let checkpointEmitted = false; let verificationPassed = false; + // When C-08 forces a "Phase Checkpoint" before a mutation, the model's + // next reply is that internal planning block. 
We route it to the + // (hidden) thinking channel instead of showing the user a wall of + // Goal/Findings/Suspected-Cause text. + let suppressNextTextAsCheckpoint = false; // ── Server-side conversational guard (C-03 enforcement) ─────────── // If the user's message looks conversational we withhold tools for @@ -1100,6 +1142,62 @@ export async function POST(request: Request) { let fileHashes = new Map(); let stallRounds = 0; + // Compact corrective executor used by the verification fix-loop: runs up + // to `n` model rounds (with tools) to fix whatever verification flagged, + // reusing the same tool-execution path as the main loop. + async function runFixRounds(n: number) { + for (let i = 0; i < n; i++) { + if (aborted) break; + const fixTools = activeMcpToken + ? filterToolsForPhase(VIBN_TOOL_DEFINITIONS, "execute", turnIntent) + : []; + const r = await callVibnChat({ + systemPrompt, + messages, + tools: fixTools, + temperature: 0.4, + includeThoughts: true, + }); + if (r.text) { + assistantText += (assistantText ? "\n\n" : "") + r.text; + assistantTextSegments.push(r.text); + emit({ type: "text", text: r.text }); + } + messages.push({ + role: "assistant", + content: r.text, + toolCalls: r.toolCalls.length ? r.toolCalls : undefined, + }); + if (!r.toolCalls.length) break; + for (const tc of r.toolCalls) { + if (aborted) break; + assistantToolCalls.push(tc); + emit({ type: "tool_start", name: tc.name, args: tc.args }); + const result = activeMcpToken + ? 
await executeMcpTool( + tc.name, + tc.args, + activeMcpToken, + baseUrl, + activeProject?.id, + ) + : JSON.stringify({ error: "No MCP token" }); + emit({ + type: "tool_result", + name: tc.name, + result: result.slice(0, 500), + }); + messages.push({ + role: "tool", + content: result, + toolCallId: tc.id, + toolName: tc.name, + thoughtSignature: tc.thoughtSignature, + }); + } + } + } + emit({ type: "phase", phase, label: "Investigating & Planning" }); try { @@ -1182,6 +1280,7 @@ export async function POST(request: Request) { findings: "Evaluating...", }); checkpointEmitted = true; + suppressNextTextAsCheckpoint = true; phase = "execute"; emit({ type: "phase", phase, label: "Executing Code Edits" }); continue; // Skip tool execution and re-prompt @@ -1202,8 +1301,14 @@ export async function POST(request: Request) { return; } - // Stream user-facing text to client - if (resp.text) { + // Stream user-facing text to client. + // If this round's text is the forced Phase Checkpoint, route it to + // the hidden thinking channel and DON'T add it to the user-facing + // message (so it never shows live or in the persisted thread). + if (resp.text && suppressNextTextAsCheckpoint) { + emit({ type: "thinking", text: resp.text }); + suppressNextTextAsCheckpoint = false; + } else if (resp.text) { assistantText += (assistantText ? "\n\n" : "") + resp.text; assistantTextSegments.push(resp.text); emit({ type: "text", text: resp.text }); @@ -1420,6 +1525,82 @@ export async function POST(request: Request) { emit({ type: "aborted" }); } + // ── Acceptance verification + corrective fix-loop (flag-gated) ── + // After a turn that mutated code, run the verification contract + // (baseline: build + server_up + console_clean). If it fails, feed the + // concrete failures back and let the model fix — iterating until green, + // stuck, or out of attempts. Off by default; enable per-environment + // with VIBN_VERIFICATION_ENABLED=1 for the live smoke test. 
+ let verificationOutcome: ExecuteTaskOutcome | null = null; + const MUTATION_TOOLS = [ + "fs_write", + "fs_edit", + "fs_delete", + "apps_deploy", + "ship", + ]; + const mutated = assistantToolCalls.some((tc) => + MUTATION_TOOLS.includes(tc.name), + ); + if ( + process.env.VIBN_VERIFICATION_ENABLED === "1" && + !aborted && + mutated && + activeProject?.id && + activeMcpToken + ) { + emit({ type: "phase", phase: "verify", label: "Verifying & fixing" }); + const previewUrl = extractPreviewUrl(messages); + const verifyExec: ToolExecutor = async (name, args) => + executeMcpTool( + name, + args, + activeMcpToken, + baseUrl, + activeProject!.id, + ); + const vTask: VerificationTask = { + id: thread_id, + title: message, + status: "in_progress", + acceptanceChecks: [], + attempts: 0, + }; + const verifyCtx: ExecCtx = { + projectId: activeProject.id, + previewUrl, + exec: verifyExec, + }; + try { + verificationOutcome = await executeTask(vTask, { + maxAttempts: 3, + runExecution: async ({ failureFeedback, attempt }) => { + // Attempt 1 = verify what the main loop already produced. + if (attempt === 1 && !failureFeedback) return; + if (failureFeedback) + messages.push({ role: "user", content: failureFeedback }); + await runFixRounds(2); + }, + verify: async () => runVerificationContract(vTask, verifyCtx), + }); + } catch (e) { + console.error("[Verification] errored:", e); + } + // If verification couldn't reach green, surface the specific failing + // checks as an honest status (and let the summary reflect reality). + if (verificationOutcome?.status === "blocked") { + const checkLines = verificationOutcome.failures + .map((f) => `- ${f.check.description}: ${f.evidence}`) + .join("\n"); + const note = + `I made the changes but verification didn't fully pass:\n${checkLines}\n` + + `That's the honest state — want me to keep working these specific issues?`; + assistantText += (assistantText ? 
"\n\n" : "") + note; + assistantTextSegments.push(note); + emit({ type: "text", text: note }); + } + } + // If the loop ended with the user staring at a tool tray and no // narrative — whether because we hit MAX_TOOL_ROUNDS, broke a // detected loop, or the model voluntarily stopped emitting tools @@ -1492,6 +1673,36 @@ export async function POST(request: Request) { assistantTextSegments.push(fallback); emit({ type: "text", text: fallback }); } + } else if (!aborted && anyToolsExecuted) { + // Successful tool-using turn — guarantee it ENDS with a clean, + // human summary. We only force one when the model didn't already + // close with a substantive sentence, so we never pay for a + // redundant double-summary. + const lastSeg = ( + assistantTextSegments[assistantTextSegments.length - 1] || "" + ).trim(); + const alreadySummarized = + lastSeg.length >= 40 && /[.!?)\]]$/.test(lastSeg); + if (!alreadySummarized) { + try { + const finalSummary = await callVibnChat({ + systemPrompt: + systemPrompt + + `\n\n[FINAL SUMMARY] The work for this turn is finished. In 1–3 short, plain sentences, tell the user: (a) what you changed or accomplished, (b) the specific result they can see right now (a preview URL, a file, a value), and (c) the single best next step. No headings, no bullet lists, no internal jargon, and do NOT call any tools.`, + messages, + tools: [], + temperature: 0.3, + }); + if (finalSummary.text && finalSummary.text.trim()) { + assistantText += + (assistantText ? "\n\n" : "") + finalSummary.text; + assistantTextSegments.push(finalSummary.text); + emit({ type: "text", text: finalSummary.text }); + } + } catch { + // Best-effort: the model's own final text remains as the ending. 
+ } + } } // Last-resort guard: the model produced NO user-facing text and NO diff --git a/vibn-frontend/components/vibn-chat/chat-panel.tsx b/vibn-frontend/components/vibn-chat/chat-panel.tsx index 5d43e9be..84be8afa 100644 --- a/vibn-frontend/components/vibn-chat/chat-panel.tsx +++ b/vibn-frontend/components/vibn-chat/chat-panel.tsx @@ -551,9 +551,17 @@ function ThinkingBubble({ thoughts }: { thoughts: string }) { function stripRawToolLogs(text: string): string { if (!text) return text; - return text - .replace(/(?:\r?\n)*\[tools executed this turn:[\s\S]*?\]/g, "") - .trim(); + let out = text.replace( + /(?:\r?\n)*\[tools executed this turn:[\s\S]*?\]/g, + "", + ); + // Safety net: strip the internal "Phase Checkpoint" planning block + // (Goal / Current Findings / Suspected Cause / Verification Plan) if it + // ever reaches a user-facing message. This is loop-control machinery, not + // something the end user should read. We drop from the heading to the end + // of that block (until a blank line followed by non-bulleted prose, or EOF). + out = out.replace(/(?:^|\n)\s*#{0,3}\s*Phase Checkpoint[\s\S]*$/i, "").trim(); + return out.trim(); } const MessageBubble = React.memo(function MessageBubble({ @@ -748,32 +756,8 @@ function Timeline({ entries }: { entries: TimelineEntry[] }) { ); } if (item.kind === "checkpoint") { - return ( -
-
- [Checkpoint Logged] -
-
{item.goal}
-
- ); + // Internal loop-control machinery — never shown to the user. + return null; } return ( Promise; + + /** Run the verification contract and return a structured report. */ + verify: (task: VerificationTask) => Promise; + + /** Persist task progress (attempts + lastFailures) so a turn can resume. */ + persist?: (task: VerificationTask) => void | Promise; + + /** Max execute→verify cycles before escalating. Default 5. */ + maxAttempts?: number; + + /** Stop after this many consecutive no-progress attempts. Default 2. */ + noProgressLimit?: number; +} + +export type ExecuteTaskOutcome = + | { status: "done"; report: VerificationReport; attempts: number } + | { + status: "blocked"; + report: VerificationReport | null; + attempts: number; + reason: string; + failures: CheckResult[]; + }; + +export async function executeTask( + task: VerificationTask, + deps: ExecuteTaskDeps, +): Promise { + const maxAttempts = deps.maxAttempts ?? 5; + const noProgressLimit = deps.noProgressLimit ?? 2; + + task.status = "in_progress"; + let prevSig: string | null = null; + let noProgressStreak = 0; + let lastReport: VerificationReport | null = null; + + while (task.attempts < maxAttempts) { + task.attempts++; + + // EXECUTE — with the prior failures fed back as concrete instructions. + const failureFeedback = task.lastFailures?.length + ? formatFailureFeedback(task.lastFailures) + : ""; + await deps.runExecution({ + task, + failureFeedback, + attempt: task.attempts, + }); + + // TEST + const report = await deps.verify(task); + lastReport = report; + + if (report.passed) { + // FINALIZE + task.status = "done"; + task.lastFailures = []; + await deps.persist?.(task); + return { status: "done", report, attempts: task.attempts }; + } + + // KEEP FIXING — persist the concrete failures so the next attempt (even in + // a later HTTP turn) resumes with full context. 
+ task.lastFailures = report.failures; + await deps.persist?.(task); + + // Detect no progress: the same hard failures with the same evidence. + const sig = failureSignature(report.failures); + if (prevSig !== null && sig === prevSig) { + noProgressStreak++; + } else { + noProgressStreak = 0; + } + prevSig = sig; + + if (noProgressStreak >= noProgressLimit) { + task.status = "blocked"; + await deps.persist?.(task); + return { + status: "blocked", + report, + attempts: task.attempts, + reason: "no_progress", + failures: report.failures, + }; + } + } + + // Hit the attempt ceiling without going green. + task.status = "blocked"; + await deps.persist?.(task); + return { + status: "blocked", + report: lastReport, + attempts: task.attempts, + reason: "max_attempts", + failures: lastReport?.failures ?? [], + }; +} diff --git a/vibn-frontend/lib/ai/verification/generation.ts b/vibn-frontend/lib/ai/verification/generation.ts new file mode 100644 index 00000000..d8c91d37 --- /dev/null +++ b/vibn-frontend/lib/ai/verification/generation.ts @@ -0,0 +1,108 @@ +/** + * Acceptance-check generation + feedback formatting. + * + * - The Planner emits `acceptanceChecks` per task using a strict schema; we + * validate/normalize that output here (models are not trustworthy emitters). + * - On a failed verification we format the failures into concrete, structured + * feedback that the next execution round consumes — this is what makes the + * model FIX rather than guess. + */ + +import type { AcceptanceCheck, CheckKind, CheckResult } from "./types"; + +const VALID_KINDS: CheckKind[] = [ + "build", + "typecheck", + "test", + "server_up", + "route_ok", + "console_clean", + "content", + "flow", + "visual", + "data", +]; + +// Soft-by-default kinds (advisory, never block "done"). +const SOFT_KINDS = new Set(["visual"]); + +/** + * Validate and normalize a raw `acceptanceChecks` array from the model. + * Drops unknown kinds, coerces missing fields, and caps the count. 
+ */ +export function normalizeAcceptanceChecks(raw: unknown): AcceptanceCheck[] { + if (!Array.isArray(raw)) return []; + const out: AcceptanceCheck[] = []; + for (const item of raw) { + if (!item || typeof item !== "object") continue; + const o = item as Record; + const kind = o.kind as CheckKind; + if (!VALID_KINDS.includes(kind)) continue; + const spec = + o.spec && typeof o.spec === "object" + ? (o.spec as Record) + : {}; + const hard = + typeof o.hard === "boolean" ? o.hard : !SOFT_KINDS.has(kind); + const description = + typeof o.description === "string" && o.description.trim() + ? o.description.trim() + : kind; + out.push({ kind, hard, description, spec }); + if (out.length >= 3) break; // keep contracts tight (1–3 checks) + } + return out; +} + +/** + * Instruction appended to the Planner's system prompt so each task it creates + * carries a checkable contract. + */ +export const CHECK_GENERATION_PROMPT = ` +[ACCEPTANCE CHECKS] For every task you create, attach \`acceptanceChecks\`: a JSON +array of 1–3 checks that objectively prove THIS task is done. +Each check: { "kind": , "hard": , "description": , "spec": { ... } } +Allowed kinds and their spec: +- build spec: {} (compiles) +- typecheck spec: {} (no type errors) +- test spec: { command?: string } (tests pass) +- server_up spec: { port?: number } (app boots, 200) +- route_ok spec: { url: string, expectedStatus?: number } +- console_clean spec: { url?: string } (no JS errors) +- content spec: { url: string, contains: string } (text present) +- flow spec: { startUrl: string, expectContains: string } +- visual spec: { targetPath: string, minScore?: number } (soft) +- data spec: { command: string } (records exist) +Rules: +- build + server_up + console_clean are added AUTOMATICALLY. Do NOT repeat them. +- Add only checks that prove THIS task's specific behavior. +- Prefer the cheapest proof: route_ok/content over flow, flow over visual. +- If a task is not objectively verifiable (e.g. 
"make the copy friendlier"), + return an empty acceptanceChecks array and set "requiresHumanConfirm": true. + Do NOT fabricate a check you cannot actually verify. +`.trim(); + +/** + * Turn hard failures into specific, actionable feedback for the next execution + * round. Not "it didn't work" — the exact check, evidence, and a directive. + */ +export function formatFailureFeedback(failures: CheckResult[]): string { + if (!failures.length) return ""; + const lines = failures.map( + (f) => `- ${f.check.kind} (${f.check.description}): FAILED — ${f.evidence}`, + ); + return ( + "[VERIFICATION FAILED] Your last changes did not pass these checks:\n" + + lines.join("\n") + + "\nFix these specific failures. Do not claim success until every check passes. " + + "Address the exact errors above — read the relevant files first if needed." + ); +} + +/** Stable signature of a report's hard failures — used to detect no-progress. */ +export function failureSignature(failures: CheckResult[]): string { + return failures + .map((f) => `${f.check.kind}:${f.evidence}`) + .sort() + .join(";;"); +} diff --git a/vibn-frontend/lib/ai/verification/harness.ts b/vibn-frontend/lib/ai/verification/harness.ts new file mode 100644 index 00000000..2197c0f2 --- /dev/null +++ b/vibn-frontend/lib/ai/verification/harness.ts @@ -0,0 +1,92 @@ +/** + * Verification harness — runs a task's contract and returns a structured + * pass/fail report. This is the single source of truth for "is the task done". + */ + +import type { + AcceptanceCheck, + CheckResult, + ExecCtx, + VerificationReport, + VerificationTask, +} from "./types"; +import { runCheck } from "./runners"; + +/** + * The baseline contract auto-attached to every code task. Even if the Planner + * specifies no checks, a task can never be "done" while the app fails to build + * or the page throws — this is the floor that kills false-completion + * ("I scaffolded everything ✓" when nothing compiles). 
+ */ +export function baselineChecks(previewUrl?: string): AcceptanceCheck[] { + const checks: AcceptanceCheck[] = [ + { + kind: "build", + hard: true, + description: "Project builds without errors", + spec: {}, + }, + { + kind: "server_up", + hard: true, + description: "Dev server boots and responds 200", + spec: { port: 3000 }, + }, + ]; + // console_clean needs a URL to check. Only include it when we actually know + // the preview URL — otherwise we'd fail the whole contract on an un-runnable + // check. (When run inside the agent, the URL comes from dev_server_start.) + if (previewUrl) { + checks.push({ + kind: "console_clean", + hard: true, + description: "Preview has no runtime console errors", + spec: { url: previewUrl }, + }); + } + return checks; +} + +const KEY = (c: AcceptanceCheck) => `${c.kind}:${JSON.stringify(c.spec ?? {})}`; + +/** Merge the task's checks with the baseline, de-duplicating by kind+spec. */ +export function withBaseline( + checks: AcceptanceCheck[], + previewUrl?: string, +): AcceptanceCheck[] { + const seen = new Set(checks.map(KEY)); + const merged = [...checks]; + for (const b of baselineChecks(previewUrl)) { + if (!seen.has(KEY(b))) merged.push(b); + } + // Run hard checks first so we short-circuit on the cheapest objective failure. + return merged.sort((a, b) => Number(b.hard) - Number(a.hard)); +} + +export interface RunContractOptions { + /** Skip the auto-baseline (e.g. for a pure data/research task). */ + noBaseline?: boolean; + /** Stop after the first HARD failure (cheaper). Default true. */ + shortCircuit?: boolean; +} + +export async function runVerificationContract( + task: VerificationTask, + ctx: ExecCtx, + opts: RunContractOptions = {}, +): Promise { + const { noBaseline = false, shortCircuit = true } = opts; + const checks = noBaseline + ? 
[...task.acceptanceChecks].sort((a, b) => Number(b.hard) - Number(a.hard)) + : withBaseline(task.acceptanceChecks, ctx.previewUrl); + + const results: CheckResult[] = []; + for (const check of checks) { + const r = await runCheck(check, ctx); + results.push(r); + if (shortCircuit && !r.pass && check.hard) break; + } + + const failures = results.filter((r) => !r.pass && r.check.hard); + return { passed: failures.length === 0, results, failures }; +} diff --git a/vibn-frontend/lib/ai/verification/index.ts b/vibn-frontend/lib/ai/verification/index.ts new file mode 100644 index 00000000..07c23b19 --- /dev/null +++ b/vibn-frontend/lib/ai/verification/index.ts @@ -0,0 +1,5 @@ +export * from "./types"; +export * from "./runners"; +export * from "./harness"; +export * from "./generation"; +export * from "./executor"; diff --git a/vibn-frontend/lib/ai/verification/runners.ts b/vibn-frontend/lib/ai/verification/runners.ts new file mode 100644 index 00000000..6e4ee314 --- /dev/null +++ b/vibn-frontend/lib/ai/verification/runners.ts @@ -0,0 +1,269 @@ +/** + * Acceptance check runners. + * + * Each runner maps a single AcceptanceCheck to a deterministic tool invocation + * and returns a structured { pass, evidence }. Runners depend only on the + * injected ToolExecutor, so they are fully unit-testable with mocked outputs. + */ + +import type { + AcceptanceCheck, + CheckKind, + CheckResult, + ExecCtx, +} from "./types"; + +// ── helpers ──────────────────────────────────────────────────────────────── + +export function redact(s: string): string { + return s + .replace( + /postgres(?:ql)?:\/\/[^:\s]+:[^@\s]+@[^/\s]+\/[^\s"']+/gi, + "postgresql://[REDACTED_DB_URL]", + ) + .replace( + /eyJ[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}/g, + "[REDACTED_JWT]", + ) + .replace(/\b[A-Za-z0-9_-]{40,}\b/g, "[REDACTED_SECRET]"); +} + +export function clip(s: string, n = 400): string { + const out = redact(String(s ?? "").replace(/\s+/g, " ").trim()); + return out.length > n ? 
out.slice(0, n) + "…" : out; +} + +/** + * Parse a raw tool result into a normalized shape. Tool results come back as a + * JSON string; shapes vary by tool, so we extract defensively. Some tools + * double-wrap (a `stdout` field that is itself JSON) — we unwrap one level. + */ +export interface ParsedToolResult { + code: number | null; + stdout: string; + stderr: string; + status: number | null; // healthCheck.status, etc. + raw: string; + obj: Record | null; +} + +export function parseToolResult(raw: string): ParsedToolResult { + const base: ParsedToolResult = { + code: null, + stdout: "", + stderr: "", + status: null, + raw: String(raw ?? ""), + obj: null, + }; + let obj: Record | null = null; + try { + const p = JSON.parse(base.raw); + if (p && typeof p === "object") obj = p as Record; + } catch { + return base; + } + if (!obj) return base; + base.obj = obj; + + // Some wrappers nest the real payload under `stdout` as a JSON string. + let target = obj; + if ( + typeof obj.stdout === "string" && + obj.code === undefined && + obj.healthCheck === undefined + ) { + try { + const inner = JSON.parse(obj.stdout); + if (inner && typeof inner === "object") + target = inner as Record; + } catch { + /* stdout is plain text, keep outer */ + } + } + + if (typeof target.code === "number") base.code = target.code; + if (typeof target.exitCode === "number") base.code = target.exitCode; + if (typeof target.stdout === "string") base.stdout = target.stdout; + if (typeof target.stderr === "string") base.stderr = target.stderr; + + const hc = target.healthCheck as { status?: number } | undefined; + if (hc && typeof hc.status === "number") base.status = hc.status; + if (typeof target.status === "number") base.status = target.status; + + return base; +} + +function ok(check: AcceptanceCheck, evidence = "passed"): CheckResult { + return { check, pass: true, evidence: clip(evidence) }; +} +function fail(check: AcceptanceCheck, evidence: string): CheckResult { + return { check, pass: 
false, evidence: clip(evidence) }; +} + +function str(spec: Record, key: string, dflt = ""): string { + const v = spec[key]; + return typeof v === "string" ? v : dflt; +} +function num( + spec: Record, + key: string, + dflt: number, +): number { + const v = spec[key]; + return typeof v === "number" ? v : dflt; +} + +// ── runners ──────────────────────────────────────────────────────────────── + +async function runShellExit( + check: AcceptanceCheck, + ctx: ExecCtx, + command: string, + label: string, +): Promise { + const raw = await ctx.exec("shell_exec", { + projectId: ctx.projectId, + command, + }); + const r = parseToolResult(raw); + if (r.code === 0) return ok(check, `${label} passed`); + const detail = r.stderr || r.stdout || `exit ${r.code ?? "?"}`; + return fail(check, `${label} failed (exit ${r.code ?? "?"}): ${detail}`); +} + +const RUNNERS: Record< + CheckKind, + (check: AcceptanceCheck, ctx: ExecCtx) => Promise +> = { + build: (c, ctx) => + runShellExit(c, ctx, str(c.spec, "command", "npm run build"), "build"), + + typecheck: (c, ctx) => + runShellExit( + c, + ctx, + str(c.spec, "command", "npx tsc --noEmit"), + "typecheck", + ), + + test: (c, ctx) => + runShellExit(c, ctx, str(c.spec, "command", "npm test"), "tests"), + + data: (c, ctx) => + runShellExit(c, ctx, str(c.spec, "command", ""), "data check"), + + server_up: async (c, ctx) => { + const raw = await ctx.exec("dev_server_start", { + projectId: ctx.projectId, + command: str(c.spec, "command", "npm run dev"), + port: num(c.spec, "port", 3000), + }); + const r = parseToolResult(raw); + if (r.status === 200) return ok(c, "dev server returned 200"); + return fail( + c, + `dev server not healthy (status ${r.status ?? 
"none"}): ${ + r.stderr || r.stdout || r.raw + }`, + ); + }, + + route_ok: async (c, ctx) => { + const url = str(c.spec, "url"); + const expected = num(c.spec, "expectedStatus", 200); + if (!url) return fail(c, "route_ok check is missing a url"); + const raw = await ctx.exec("shell_exec", { + projectId: ctx.projectId, + command: `curl -s -o /dev/null -w "%{http_code}" --max-time 20 ${JSON.stringify( + url, + )}`, + }); + const r = parseToolResult(raw); + const codeStr = (r.stdout || r.raw).trim().match(/\d{3}/)?.[0]; + if (codeStr && Number(codeStr) === expected) + return ok(c, `${url} → ${codeStr}`); + return fail(c, `${url} returned ${codeStr ?? "no response"} (expected ${expected})`); + }, + + console_clean: async (c, ctx) => { + const url = str(c.spec, "url", ctx.previewUrl ?? ""); + if (!url) return fail(c, "console_clean check is missing a url"); + const raw = await ctx.exec("browser_console", { url }); + const r = parseToolResult(raw); + const text = (r.raw || "").toLowerCase(); + // Look for error-level console output or framework error overlays. + const errorHit = + /"type"\s*:\s*"error"/.test(text) || + /\berror\b[^"]{0,40}(overlay|boundary|uncaught|unhandled)/.test(text) || + /failed to compile|module not found|referenceerror|typeerror:/.test(text); + if (!errorHit) return ok(c, "no console errors"); + return fail(c, `console errors on ${url}: ${clip(r.raw, 240)}`); + }, + + content: async (c, ctx) => { + const url = str(c.spec, "url", ctx.previewUrl ?? 
""); + const needle = str(c.spec, "contains"); + if (!url || !needle) + return fail(c, "content check requires both `url` and `contains`"); + const raw = await ctx.exec("shell_exec", { + projectId: ctx.projectId, + command: `curl -s --max-time 20 ${JSON.stringify(url)}`, + }); + const r = parseToolResult(raw); + const body = r.stdout || r.raw; + if (body.includes(needle)) return ok(c, `found "${needle}"`); + return fail(c, `"${needle}" not found on ${url}`); + }, + + flow: async (c, ctx) => { + // A basic journey assertion: navigate to startUrl, then assert the page + // body contains `expectContains` (or that a follow URL is reachable). + const startUrl = str(c.spec, "startUrl", ctx.previewUrl ?? ""); + const expectContains = str(c.spec, "expectContains"); + if (!startUrl) return fail(c, "flow check is missing a startUrl"); + const raw = await ctx.exec("browser_navigate", { url: startUrl }); + const r = parseToolResult(raw); + const body = (r.stdout || r.raw).toString(); + if (expectContains && !body.includes(expectContains)) + return fail(c, `flow on ${startUrl}: did not reach "${expectContains}"`); + if (/error|cannot|failed/i.test(body) && !expectContains) + return fail(c, `flow on ${startUrl} hit an error page`); + return ok(c, `flow reached expected state`); + }, + + visual: async (c, ctx) => { + const targetPath = str(c.spec, "targetPath"); + if (!targetPath) return fail(c, "visual check is missing a targetPath"); + const raw = await ctx.exec("request_visual_qa", { + projectId: ctx.projectId, + targetPath, + }); + const r = parseToolResult(raw); + const obj = r.obj as { score?: number; passed?: boolean } | null; + const threshold = num(c.spec, "minScore", 7); + if (obj?.passed === true) return ok(c, "visual QA passed"); + if (typeof obj?.score === "number") + return obj.score >= threshold + ? 
ok(c, `visual QA score ${obj.score}`) + : fail(c, `visual QA score ${obj.score} < ${threshold}`); + // No structured score — treat as advisory pass (soft checks won't block). + return ok(c, "visual QA ran (no numeric score)"); + }, +}; + +export async function runCheck( + check: AcceptanceCheck, + ctx: ExecCtx, +): Promise { + const runner = RUNNERS[check.kind]; + if (!runner) return fail(check, `unknown check kind: ${check.kind}`); + try { + return await runner(check, ctx); + } catch (e) { + return fail( + check, + `check runner errored: ${e instanceof Error ? e.message : String(e)}`, + ); + } +} diff --git a/vibn-frontend/lib/ai/verification/types.ts b/vibn-frontend/lib/ai/verification/types.ts new file mode 100644 index 00000000..111c0d10 --- /dev/null +++ b/vibn-frontend/lib/ai/verification/types.ts @@ -0,0 +1,71 @@ +/** + * Acceptance / Verification layer — types. + * + * A task is NOT "done" because the model stops calling tools; it is done when + * its Verification Contract passes. The contract is a small list of + * deterministic, machine-runnable checks attached to the task. + */ + +export type CheckKind = + | "build" // code compiles (npm run build) + | "typecheck" // no type errors (tsc --noEmit) + | "test" // unit/integration tests pass (npm test) + | "server_up" // dev server boots and returns 200 + | "route_ok" // a route/endpoint returns the expected status code + | "console_clean" // no runtime JS console errors on a page + | "content" // expected text/element present on a page + | "flow" // a user journey works (navigate + assert) + | "visual" // UI meets a design rubric (request_visual_qa) + | "data"; // seed/records exist (a query returns expected rows) + +export interface AcceptanceCheck { + kind: CheckKind; + /** Hard checks gate "done". Soft checks are advisory and never block. */ + hard: boolean; + /** Human-readable description shown in build-health reports. 
*/ + description: string; + /** Kind-specific parameters (command, url, expectedStatus, etc.). */ + spec: Record; +} + +export interface CheckResult { + check: AcceptanceCheck; + pass: boolean; + /** Redacted, truncated evidence — fed back to the model on failure. */ + evidence: string; +} + +export interface VerificationReport { + /** True only when every HARD check passed. */ + passed: boolean; + results: CheckResult[]; + /** Hard failures only — these are what the model must fix. */ + failures: CheckResult[]; +} + +export interface VerificationTask { + id: string; + title: string; + status: "open" | "in_progress" | "done" | "blocked"; + acceptanceChecks: AcceptanceCheck[]; + attempts: number; + lastFailures?: CheckResult[]; + /** Tasks that can't be objectively verified (e.g. "make copy friendlier"). */ + requiresHumanConfirm?: boolean; +} + +/** + * Abstraction over the agent's tool execution. Returns the raw tool result + * string (usually JSON). Injecting this makes every runner unit-testable. + */ +export type ToolExecutor = ( + name: string, + args: Record, +) => Promise; + +export interface ExecCtx { + projectId: string; + /** Preview URL of the running dev server, when known. 
*/ + previewUrl?: string; + exec: ToolExecutor; +} diff --git a/vibn-frontend/lib/ai/verification/verification.test.ts b/vibn-frontend/lib/ai/verification/verification.test.ts new file mode 100644 index 00000000..7f29a39d --- /dev/null +++ b/vibn-frontend/lib/ai/verification/verification.test.ts @@ -0,0 +1,365 @@ +import { describe, it, expect, vi } from "vitest"; +import { parseToolResult, runCheck, clip, redact } from "./runners"; +import { withBaseline, runVerificationContract } from "./harness"; +import { + normalizeAcceptanceChecks, + formatFailureFeedback, + failureSignature, +} from "./generation"; +import { executeTask } from "./executor"; +import type { + AcceptanceCheck, + ExecCtx, + ToolExecutor, + VerificationReport, + VerificationTask, +} from "./types"; + +// A mock tool executor: maps a tool name to a canned raw result string. +function mockExec(map: Record): ToolExecutor { + return async (name: string) => map[name] ?? "{}"; +} +/** Builds an ExecCtx for project p1 whose exec is mockExec(map); previewUrl is passed through as given. */ function ctx(map: Record, previewUrl?: string): ExecCtx { + return { projectId: "p1", previewUrl, exec: mockExec(map) }; +} +/** Builds an AcceptanceCheck of the given kind with description set to the kind; spec defaults to an empty object and hard defaults to true. */ const check = ( + kind: AcceptanceCheck["kind"], + spec: Record = {}, + hard = true, +): AcceptanceCheck => ({ kind, hard, description: kind, spec }); +/** Builds an open VerificationTask (id t1, attempts 0) that carries the given acceptance checks. */ +const task = (checks: AcceptanceCheck[]): VerificationTask => ({ + id: "t1", + title: "Test task", + status: "open", + acceptanceChecks: checks, + attempts: 0, +}); + +// ── parsing ────────────────────────────────────────────────────────────── + +describe("parseToolResult", () => { + it("extracts code from a shell result", () => { + const r = parseToolResult( + JSON.stringify({ code: 1, stdout: "", stderr: "boom" }), + ); + expect(r.code).toBe(1); + expect(r.stderr).toBe("boom"); + }); + it("unwraps a double-nested stdout JSON payload", () => { + const r = parseToolResult( + JSON.stringify({ stdout: JSON.stringify({ code: 0, stdout: "ok" }) }), + ); + expect(r.code).toBe(0); + expect(r.stdout).toBe("ok"); + }); + it("reads healthCheck.status for 
server checks", () => { + const r = parseToolResult( + JSON.stringify({ previewUrl: "x", healthCheck: { status: 200 } }), + ); + expect(r.status).toBe(200); + }); + it("survives non-JSON", () => { + const r = parseToolResult("not json"); + expect(r.code).toBeNull(); + expect(r.raw).toBe("not json"); + }); +}); + +describe("redaction", () => { + it("redacts db urls and jwts and long secrets", () => { + const s = redact( + "db postgresql://u:p4ssword@host:5432/mydb token eyJhbGciOiJIUzI1.eyJzdWIiOjEy.SflKxwRJSMeKKF secret sk_live_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789xyz", + ); + expect(s).toContain("[REDACTED_DB_URL]"); + expect(s).toContain("[REDACTED_JWT]"); + expect(s).toContain("[REDACTED_SECRET]"); + }); + it("clip truncates and trims long non-secret text", () => { + const long = "the quick brown fox jumps over the lazy dog. ".repeat(20); + expect(clip(long).endsWith("…")).toBe(true); + expect(clip(long).length).toBeLessThanOrEqual(401); + }); +}); + +// ── runners ────────────────────────────────────────────────────────────── + +describe("runners", () => { + it("build passes on exit 0, fails on non-zero with stderr", async () => { + const pass = await runCheck( + check("build"), + ctx({ shell_exec: JSON.stringify({ code: 0, stdout: "done" }) }), + ); + expect(pass.pass).toBe(true); + + const failR = await runCheck( + check("build"), + ctx({ + shell_exec: JSON.stringify({ + code: 1, + stderr: "Type error on auth.ts:14", + }), + }), + ); + expect(failR.pass).toBe(false); + expect(failR.evidence).toContain("auth.ts:14"); + }); + + it("server_up passes on 200, fails otherwise", async () => { + const pass = await runCheck( + check("server_up"), + ctx({ + dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }), + }), + ); + expect(pass.pass).toBe(true); + + const failR = await runCheck( + check("server_up"), + ctx({ + dev_server_start: JSON.stringify({ healthCheck: { status: 502 } }), + }), + ); + expect(failR.pass).toBe(false); + }); + + it("route_ok 
matches the expected status code", async () => { + const pass = await runCheck( + check("route_ok", { url: "http://x/dashboard", expectedStatus: 200 }), + ctx({ shell_exec: JSON.stringify({ code: 0, stdout: "200" }) }), + ); + expect(pass.pass).toBe(true); + + const failR = await runCheck( + check("route_ok", { url: "http://x/dashboard" }), + ctx({ shell_exec: JSON.stringify({ code: 0, stdout: "404" }) }), + ); + expect(failR.pass).toBe(false); + expect(failR.evidence).toContain("404"); + }); + + it("console_clean fails when an error is present", async () => { + const failR = await runCheck( + check("console_clean", { url: "http://x" }), + ctx({ + browser_console: JSON.stringify([{ type: "error", text: "boom" }]), + }), + ); + expect(failR.pass).toBe(false); + + const pass = await runCheck( + check("console_clean", { url: "http://x" }), + ctx({ browser_console: JSON.stringify([{ type: "log", text: "ok" }]) }), + ); + expect(pass.pass).toBe(true); + }); + + it("content checks for a substring", async () => { + const pass = await runCheck( + check("content", { url: "http://x", contains: "GetAcquired" }), + ctx({ + shell_exec: JSON.stringify({ code: 0, stdout: "

GetAcquired

" }), + }), + ); + expect(pass.pass).toBe(true); + }); +}); + +// ── harness ────────────────────────────────────────────────────────────── + +describe("harness", () => { + it("auto-attaches the baseline contract", () => { + const merged = withBaseline([], "http://preview"); + const kinds = merged.map((c) => c.kind).sort(); + expect(kinds).toContain("build"); + expect(kinds).toContain("server_up"); + expect(kinds).toContain("console_clean"); + }); + + it("does not duplicate a baseline check the planner already specified", () => { + const merged = withBaseline([check("build")]); + expect(merged.filter((c) => c.kind === "build").length).toBe(1); + }); + + it("reports passed only when all hard checks pass", async () => { + const report = await runVerificationContract( + task([check("route_ok", { url: "http://x/d" })]), + ctx({ + shell_exec: JSON.stringify({ code: 0, stdout: "200" }), + dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }), + browser_console: JSON.stringify([]), + }), + { shortCircuit: false }, + ); + expect(report.passed).toBe(true); + expect(report.failures.length).toBe(0); + }); + + it("short-circuits on the first hard failure", async () => { + const exec = vi.fn(async (name: string) => { + if (name === "shell_exec") + return JSON.stringify({ code: 1, stderr: "build broke" }); + return "{}"; + }); + const report = await runVerificationContract( + task([]), + { projectId: "p1", exec }, + { shortCircuit: true }, + ); + expect(report.passed).toBe(false); + // build is the first hard check; we should NOT have called dev_server_start. 
+ expect(exec).toHaveBeenCalledWith("shell_exec", expect.anything()); + expect(exec).not.toHaveBeenCalledWith( + "dev_server_start", + expect.anything(), + ); + }); + + it("soft check failure does NOT block done", async () => { + const report = await runVerificationContract( + task([check("visual", { targetPath: "x" }, false)]), + ctx({ + shell_exec: JSON.stringify({ code: 0 }), + dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }), + browser_console: JSON.stringify([]), + request_visual_qa: JSON.stringify({ score: 2 }), + }), + { shortCircuit: false }, + ); + // visual scored 2 (would fail) but it's soft → does not block. + expect(report.passed).toBe(true); + }); +}); + +// ── generation ─────────────────────────────────────────────────────────── + +describe("generation", () => { + it("normalizes and caps acceptance checks, dropping unknown kinds", () => { + const out = normalizeAcceptanceChecks([ + { kind: "route_ok", spec: { url: "x" } }, + { kind: "bogus" }, + { kind: "content", spec: { url: "x", contains: "y" } }, + { kind: "build" }, + { kind: "data", spec: { command: "q" } }, + ]); + expect(out.length).toBe(3); // capped + expect(out.find((c) => c.kind === ("bogus" as never))).toBeUndefined(); + }); + + it("defaults visual to a soft check", () => { + const out = normalizeAcceptanceChecks([{ kind: "visual", spec: {} }]); + expect(out[0].hard).toBe(false); + }); + + it("formats actionable failure feedback", () => { + const fb = formatFailureFeedback([ + { + check: check("build"), + pass: false, + evidence: "Cannot find name foo (auth.ts:14)", + }, + ]); + expect(fb).toContain("[VERIFICATION FAILED]"); + expect(fb).toContain("auth.ts:14"); + expect(fb).toContain("Do not claim success"); + }); + + it("failure signatures are stable and order-independent", () => { + const a = failureSignature([ + { check: check("build"), pass: false, evidence: "x" }, + { check: check("route_ok"), pass: false, evidence: "y" }, + ]); + const b = failureSignature([ + { 
check: check("route_ok"), pass: false, evidence: "y" }, + { check: check("build"), pass: false, evidence: "x" }, + ]); + expect(a).toBe(b); + }); +}); + +// ── executor fix-loop ──────────────────────────────────────────────────── +/** Canned report for a passing verification: passed is true, with no results and no failures. */ +const passReport = (): VerificationReport => ({ + passed: true, + results: [], + failures: [], +}); +/** Canned failing report whose single failure is a build check (built with the default hard flag) carrying the given evidence. */ const failReport = (evidence: string): VerificationReport => ({ + passed: false, + results: [], + failures: [{ check: check("build"), pass: false, evidence }], +}); + +describe("executeTask fix-loop", () => { + it("FINALIZES immediately when the first verify passes", async () => { + const runExecution = vi.fn(async () => {}); + const verify = vi.fn(async () => passReport()); + const out = await executeTask(task([]), { runExecution, verify }); + expect(out.status).toBe("done"); + expect(out.attempts).toBe(1); + expect(runExecution).toHaveBeenCalledTimes(1); + }); + + it("KEEPS FIXING then finalizes when a later attempt passes", async () => { + const verify = vi + .fn() + .mockResolvedValueOnce(failReport("err A")) + .mockResolvedValueOnce(failReport("err B")) // different evidence = progress + .mockResolvedValueOnce(passReport()); + const feedbacks: string[] = []; + const runExecution = vi.fn(async (a: { failureFeedback: string }) => { + feedbacks.push(a.failureFeedback); + }); + const out = await executeTask(task([]), { runExecution, verify }); + expect(out.status).toBe("done"); + expect(out.attempts).toBe(3); + // The 2nd execution received the 1st attempt's concrete failure as context. 
+ expect(feedbacks[1]).toContain("err A"); + }); + + it("ESCALATES (blocked: no_progress) when the same failure repeats", async () => { + const verify = vi.fn(async () => failReport("same error")); + const runExecution = vi.fn(async () => {}); + const out = await executeTask(task([]), { + runExecution, + verify, + noProgressLimit: 2, + }); + expect(out.status).toBe("blocked"); + if (out.status === "blocked") expect(out.reason).toBe("no_progress"); + }); + + it("ESCALATES (blocked: max_attempts) if it never goes green but keeps changing", async () => { + let n = 0; + const verify = vi.fn(async () => failReport(`err ${n++}`)); // always different + const runExecution = vi.fn(async () => {}); + const out = await executeTask(task([]), { + runExecution, + verify, + maxAttempts: 3, + }); + expect(out.status).toBe("blocked"); + if (out.status === "blocked") { + expect(out.reason).toBe("max_attempts"); + expect(out.attempts).toBe(3); + } + }); + + it("persists progress on every attempt (resume support)", async () => { + const verify = vi + .fn() + .mockResolvedValueOnce(failReport("e1")) + .mockResolvedValueOnce(passReport()); + const persisted: number[] = []; + const t = task([]); + await executeTask(t, { + runExecution: async () => {}, + verify, + persist: (tk) => { + persisted.push(tk.attempts); + }, + }); + expect(persisted).toContain(1); // persisted the failing attempt + expect(t.status).toBe("done"); + }); +});