feat(verification): acceptance-check layer + executor fix-loop; hide phase-checkpoint walls; guaranteed turn-end summary. Verification gated behind VIBN_VERIFICATION_ENABLED.
This commit is contained in:
@@ -28,6 +28,14 @@ import {
|
||||
detectKnownError,
|
||||
formatRecoveryMessage,
|
||||
} from "@/lib/ai/error-recovery";
|
||||
import {
|
||||
executeTask,
|
||||
runVerificationContract,
|
||||
type ExecCtx,
|
||||
type ExecuteTaskOutcome,
|
||||
type ToolExecutor,
|
||||
type VerificationTask,
|
||||
} from "@/lib/ai/verification";
|
||||
|
||||
// --- Agent Orchestration Types & Constants ---
|
||||
type TurnIntent =
|
||||
@@ -668,6 +676,35 @@ function buildHealthStatus(opts: {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Find the dev-server preview URL announced by a tool result, newest message
 * first, so the verification layer can run console/route checks against the
 * running app.
 *
 * Only string-valued `tool` messages that mention "preview" are inspected.
 * For each one the lookup order is: a top-level `previewUrl` JSON field, a
 * `previewUrl` inside a JSON-encoded `stdout` field, then any
 * `https://<slug>.preview.vibnai.com` URL found in the raw text.
 *
 * @param messages Conversation history to scan.
 * @returns The first preview URL found, or `undefined` when there is none.
 */
function extractPreviewUrl(messages: ChatMessage[]): string | undefined {
  const previewHost = /https:\/\/[a-z0-9-]+\.preview\.vibnai\.com/i;

  // Reads `previewUrl` from a JSON payload, following a JSON-encoded `stdout`
  // field at most `unwrapsLeft` levels deep. Any parse/shape error → undefined.
  const fromJson = (text: string, unwrapsLeft: number): string | undefined => {
    try {
      const parsed = JSON.parse(text) as Record<string, unknown>;
      if (typeof parsed.previewUrl === "string") return parsed.previewUrl;
      if (unwrapsLeft > 0 && typeof parsed.stdout === "string") {
        return fromJson(parsed.stdout, unwrapsLeft - 1);
      }
    } catch {
      /* not JSON */
    }
    return undefined;
  };

  for (const msg of [...messages].reverse()) {
    const body = msg.role === "tool" ? msg.content : undefined;
    if (typeof body !== "string" || !body.includes("preview")) continue;

    const declared = fromJson(body, 1);
    if (declared !== undefined) return declared;

    // Fall back to spotting a hosted preview URL anywhere in the raw text.
    const hit = body.match(previewHost);
    if (hit) return hit[0];
  }
  return undefined;
}
|
||||
|
||||
export async function POST(request: Request) {
|
||||
await ensureChatTables();
|
||||
|
||||
@@ -1067,6 +1104,11 @@ export async function POST(request: Request) {
|
||||
let phase: AgentPhase = "recon";
|
||||
let checkpointEmitted = false;
|
||||
let verificationPassed = false;
|
||||
// When C-08 forces a "Phase Checkpoint" before a mutation, the model's
|
||||
// next reply is that internal planning block. We route it to the
|
||||
// (hidden) thinking channel instead of showing the user a wall of
|
||||
// Goal/Findings/Suspected-Cause text.
|
||||
let suppressNextTextAsCheckpoint = false;
|
||||
|
||||
// ── Server-side conversational guard (C-03 enforcement) ───────────
|
||||
// If the user's message looks conversational we withhold tools for
|
||||
@@ -1100,6 +1142,62 @@ export async function POST(request: Request) {
|
||||
let fileHashes = new Map<string, string>();
|
||||
let stallRounds = 0;
|
||||
|
||||
// Corrective executor for the verification fix-loop. Runs at most `n` model
// rounds (tools enabled only when an MCP token is active) so the model can
// repair whatever verification flagged. Each round appends the assistant
// reply and every tool result to `messages`, streams them to the client via
// `emit`, and records text/tool calls in the turn accumulators. The loop ends
// early when the turn is aborted or the model stops requesting tools.
async function runFixRounds(n: number) {
  let round = 0;
  while (round < n && !aborted) {
    round += 1;

    const tools = activeMcpToken
      ? filterToolsForPhase(VIBN_TOOL_DEFINITIONS, "execute", turnIntent)
      : [];
    const reply = await callVibnChat({
      systemPrompt,
      messages,
      tools,
      temperature: 0.4,
      includeThoughts: true,
    });

    // Surface any narrative text from this round to the user.
    if (reply.text) {
      assistantText += (assistantText ? "\n\n" : "") + reply.text;
      assistantTextSegments.push(reply.text);
      emit({ type: "text", text: reply.text });
    }

    const wantsTools = reply.toolCalls.length > 0;
    messages.push({
      role: "assistant",
      content: reply.text,
      toolCalls: wantsTools ? reply.toolCalls : undefined,
    });
    // No tool calls means the model considers the fix finished.
    if (!wantsTools) return;

    for (const call of reply.toolCalls) {
      if (aborted) break;
      assistantToolCalls.push(call);
      emit({ type: "tool_start", name: call.name, args: call.args });

      let output;
      if (activeMcpToken) {
        output = await executeMcpTool(
          call.name,
          call.args,
          activeMcpToken,
          baseUrl,
          activeProject?.id,
        );
      } else {
        output = JSON.stringify({ error: "No MCP token" });
      }

      // The client only gets a short preview; the model sees the full result.
      emit({
        type: "tool_result",
        name: call.name,
        result: output.slice(0, 500),
      });
      messages.push({
        role: "tool",
        content: output,
        toolCallId: call.id,
        toolName: call.name,
        thoughtSignature: call.thoughtSignature,
      });
    }
  }
}
|
||||
|
||||
emit({ type: "phase", phase, label: "Investigating & Planning" });
|
||||
|
||||
try {
|
||||
@@ -1182,6 +1280,7 @@ export async function POST(request: Request) {
|
||||
findings: "Evaluating...",
|
||||
});
|
||||
checkpointEmitted = true;
|
||||
suppressNextTextAsCheckpoint = true;
|
||||
phase = "execute";
|
||||
emit({ type: "phase", phase, label: "Executing Code Edits" });
|
||||
continue; // Skip tool execution and re-prompt
|
||||
@@ -1202,8 +1301,14 @@ export async function POST(request: Request) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Stream user-facing text to client
|
||||
if (resp.text) {
|
||||
// Stream user-facing text to client.
|
||||
// If this round's text is the forced Phase Checkpoint, route it to
|
||||
// the hidden thinking channel and DON'T add it to the user-facing
|
||||
// message (so it never shows live or in the persisted thread).
|
||||
if (resp.text && suppressNextTextAsCheckpoint) {
|
||||
emit({ type: "thinking", text: resp.text });
|
||||
suppressNextTextAsCheckpoint = false;
|
||||
} else if (resp.text) {
|
||||
assistantText += (assistantText ? "\n\n" : "") + resp.text;
|
||||
assistantTextSegments.push(resp.text);
|
||||
emit({ type: "text", text: resp.text });
|
||||
@@ -1420,6 +1525,82 @@ export async function POST(request: Request) {
|
||||
emit({ type: "aborted" });
|
||||
}
|
||||
|
||||
// ── Acceptance verification + corrective fix-loop (flag-gated) ──
|
||||
// After a turn that mutated code, run the verification contract
|
||||
// (baseline: build + server_up + console_clean). If it fails, feed the
|
||||
// concrete failures back and let the model fix — iterating until green,
|
||||
// stuck, or out of attempts. Off by default; enable per-environment
|
||||
// with VIBN_VERIFICATION_ENABLED=1 for the live smoke test.
|
||||
let verificationOutcome: ExecuteTaskOutcome | null = null;
|
||||
const MUTATION_TOOLS = [
|
||||
"fs_write",
|
||||
"fs_edit",
|
||||
"fs_delete",
|
||||
"apps_deploy",
|
||||
"ship",
|
||||
];
|
||||
const mutated = assistantToolCalls.some((tc) =>
|
||||
MUTATION_TOOLS.includes(tc.name),
|
||||
);
|
||||
if (
|
||||
process.env.VIBN_VERIFICATION_ENABLED === "1" &&
|
||||
!aborted &&
|
||||
mutated &&
|
||||
activeProject?.id &&
|
||||
activeMcpToken
|
||||
) {
|
||||
emit({ type: "phase", phase: "verify", label: "Verifying & fixing" });
|
||||
const previewUrl = extractPreviewUrl(messages);
|
||||
const verifyExec: ToolExecutor = async (name, args) =>
|
||||
executeMcpTool(
|
||||
name,
|
||||
args,
|
||||
activeMcpToken,
|
||||
baseUrl,
|
||||
activeProject!.id,
|
||||
);
|
||||
const vTask: VerificationTask = {
|
||||
id: thread_id,
|
||||
title: message,
|
||||
status: "in_progress",
|
||||
acceptanceChecks: [],
|
||||
attempts: 0,
|
||||
};
|
||||
const verifyCtx: ExecCtx = {
|
||||
projectId: activeProject.id,
|
||||
previewUrl,
|
||||
exec: verifyExec,
|
||||
};
|
||||
try {
|
||||
verificationOutcome = await executeTask(vTask, {
|
||||
maxAttempts: 3,
|
||||
runExecution: async ({ failureFeedback, attempt }) => {
|
||||
// Attempt 1 = verify what the main loop already produced.
|
||||
if (attempt === 1 && !failureFeedback) return;
|
||||
if (failureFeedback)
|
||||
messages.push({ role: "user", content: failureFeedback });
|
||||
await runFixRounds(2);
|
||||
},
|
||||
verify: async () => runVerificationContract(vTask, verifyCtx),
|
||||
});
|
||||
} catch (e) {
|
||||
console.error("[Verification] errored:", e);
|
||||
}
|
||||
// If verification couldn't reach green, surface the specific failing
|
||||
// checks as an honest status (and let the summary reflect reality).
|
||||
if (verificationOutcome?.status === "blocked") {
|
||||
const checkLines = verificationOutcome.failures
|
||||
.map((f) => `- ${f.check.description}: ${f.evidence}`)
|
||||
.join("\n");
|
||||
const note =
|
||||
`I made the changes but verification didn't fully pass:\n${checkLines}\n` +
|
||||
`That's the honest state — want me to keep working these specific issues?`;
|
||||
assistantText += (assistantText ? "\n\n" : "") + note;
|
||||
assistantTextSegments.push(note);
|
||||
emit({ type: "text", text: note });
|
||||
}
|
||||
}
|
||||
|
||||
// If the loop ended with the user staring at a tool tray and no
|
||||
// narrative — whether because we hit MAX_TOOL_ROUNDS, broke a
|
||||
// detected loop, or the model voluntarily stopped emitting tools
|
||||
@@ -1492,6 +1673,36 @@ export async function POST(request: Request) {
|
||||
assistantTextSegments.push(fallback);
|
||||
emit({ type: "text", text: fallback });
|
||||
}
|
||||
} else if (!aborted && anyToolsExecuted) {
|
||||
// Successful tool-using turn — guarantee it ENDS with a clean,
|
||||
// human summary. We only force one when the model didn't already
|
||||
// close with a substantive sentence, so we never pay for a
|
||||
// redundant double-summary.
|
||||
const lastSeg = (
|
||||
assistantTextSegments[assistantTextSegments.length - 1] || ""
|
||||
).trim();
|
||||
const alreadySummarized =
|
||||
lastSeg.length >= 40 && /[.!?)\]]$/.test(lastSeg);
|
||||
if (!alreadySummarized) {
|
||||
try {
|
||||
const finalSummary = await callVibnChat({
|
||||
systemPrompt:
|
||||
systemPrompt +
|
||||
`\n\n[FINAL SUMMARY] The work for this turn is finished. In 1–3 short, plain sentences, tell the user: (a) what you changed or accomplished, (b) the specific result they can see right now (a preview URL, a file, a value), and (c) the single best next step. No headings, no bullet lists, no internal jargon, and do NOT call any tools.`,
|
||||
messages,
|
||||
tools: [],
|
||||
temperature: 0.3,
|
||||
});
|
||||
if (finalSummary.text && finalSummary.text.trim()) {
|
||||
assistantText +=
|
||||
(assistantText ? "\n\n" : "") + finalSummary.text;
|
||||
assistantTextSegments.push(finalSummary.text);
|
||||
emit({ type: "text", text: finalSummary.text });
|
||||
}
|
||||
} catch {
|
||||
// Best-effort: the model's own final text remains as the ending.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Last-resort guard: the model produced NO user-facing text and NO
|
||||
|
||||
@@ -551,9 +551,17 @@ function ThinkingBubble({ thoughts }: { thoughts: string }) {
|
||||
|
||||
/**
 * Remove internal agent bookkeeping from text before it is shown to the user.
 *
 * Two things are stripped:
 *  1. every `[tools executed this turn: …]` log, together with the newlines
 *     immediately preceding it;
 *  2. as a safety net, the internal "Phase Checkpoint" planning block
 *     (Goal / Current Findings / Suspected Cause / Verification Plan). This is
 *     loop-control machinery, not something the end user should read.
 *
 * @param text Raw assistant text; falsy input is returned unchanged.
 * @returns The cleaned text with surrounding whitespace trimmed.
 */
function stripRawToolLogs(text: string): string {
  if (!text) return text;

  const withoutToolLogs = text.replace(
    /(?:\r?\n)*\[tools executed this turn:[\s\S]*?\]/g,
    "",
  );

  // The checkpoint heading may carry up to three leading `#`. Everything from
  // that heading to the END of the text is dropped — the pattern does not try
  // to find where the block stops, so any text after it is removed as well.
  return withoutToolLogs
    .replace(/(?:^|\n)\s*#{0,3}\s*Phase Checkpoint[\s\S]*$/i, "")
    .trim();
}
|
||||
|
||||
const MessageBubble = React.memo(function MessageBubble({
|
||||
@@ -748,32 +756,8 @@ function Timeline({ entries }: { entries: TimelineEntry[] }) {
|
||||
);
|
||||
}
|
||||
if (item.kind === "checkpoint") {
|
||||
return (
|
||||
<div
|
||||
key={i}
|
||||
style={{
|
||||
margin: "6px 0 12px",
|
||||
padding: "12px 14px",
|
||||
background: "oklch(0.20 0.04 35 / 0.15)",
|
||||
border: "1px dashed var(--accent)",
|
||||
borderRadius: 8,
|
||||
fontSize: "0.75rem",
|
||||
color: "var(--fg-mute)",
|
||||
fontFamily: "var(--font-mono), monospace",
|
||||
}}
|
||||
>
|
||||
<div
|
||||
style={{
|
||||
color: "var(--accent)",
|
||||
fontWeight: "bold",
|
||||
marginBottom: 4,
|
||||
}}
|
||||
>
|
||||
[Checkpoint Logged]
|
||||
</div>
|
||||
<div style={{ opacity: 0.8 }}>{item.goal}</div>
|
||||
</div>
|
||||
);
|
||||
// Internal loop-control machinery — never shown to the user.
|
||||
return null;
|
||||
}
|
||||
return (
|
||||
<TimelineToolGroup
|
||||
|
||||
127
lib/ai/verification/executor.ts
Normal file
127
lib/ai/verification/executor.ts
Normal file
@@ -0,0 +1,127 @@
|
||||
/**
|
||||
* Task executor — the iterate-to-green loop.
|
||||
*
|
||||
* EXECUTE (model edits toward the goal, with prior failures as context)
|
||||
* → TEST (run the verification contract)
|
||||
* → pass? → FINALIZE (task done)
|
||||
* → fail? → KEEP FIXING (feed concrete failures back)
|
||||
* → stuck? → ESCALATE (re-plan or honest blocker to the user)
|
||||
*
|
||||
* This module is pure orchestration over injected dependencies, so the
|
||||
* finalize / keep-fixing / escalate decisions are fully unit-testable without
|
||||
* a live dev container.
|
||||
*/
|
||||
|
||||
import type { CheckResult, VerificationReport, VerificationTask } from "./types";
|
||||
import { failureSignature, formatFailureFeedback } from "./generation";
|
||||
|
||||
/** Collaborators and limits injected into {@link executeTask}. */
export interface ExecuteTaskDeps {
  /**
   * Performs a single execution pass in which the model edits toward the
   * task's goal. `failureFeedback` carries the formatted failures of the
   * previous attempt and is the empty string when the task has none recorded;
   * `attempt` is the task's attempt counter for this pass.
   */
  runExecution: (args: {
    task: VerificationTask;
    failureFeedback: string;
    attempt: number;
  }) => Promise<void>;

  /** Runs the verification contract for the task and resolves to its report. */
  verify: (task: VerificationTask) => Promise<VerificationReport>;

  /**
   * Optional hook that stores task progress (attempts + lastFailures) so a
   * later turn can resume. May be synchronous or asynchronous.
   */
  persist?: (task: VerificationTask) => void | Promise<void>;

  /** Upper bound on execute→verify cycles before escalating. Defaults to 5. */
  maxAttempts?: number;

  /** Consecutive no-progress attempts tolerated before stopping. Defaults to 2. */
  noProgressLimit?: number;
}
|
||||
|
||||
/**
 * Result of {@link executeTask}, discriminated on `status`.
 *
 * - `"done"`: verification passed; `report` is the passing report.
 * - `"blocked"`: the loop gave up; `reason` says why, `failures` lists the
 *   failing checks, and `report` is the last report (or `null` if none ran).
 */
export type ExecuteTaskOutcome =
  | {
      status: "done";
      report: VerificationReport;
      attempts: number;
    }
  | {
      status: "blocked";
      report: VerificationReport | null;
      attempts: number;
      reason: string;
      failures: CheckResult[];
    };
|
||||
|
||||
/**
 * Drive a task through the execute → verify loop until it passes, stalls, or
 * runs out of attempts.
 *
 * Each cycle increments `task.attempts`, calls `deps.runExecution` with the
 * formatted failures of the previous cycle (empty string when there are none),
 * then calls `deps.verify`. The task object is mutated in place (`status`,
 * `attempts`, `lastFailures`) and `deps.persist`, when provided, is awaited
 * after every state change.
 *
 * @param task Task to work on; its existing `attempts` / `lastFailures` are
 *   honoured, so a persisted task continues where it stopped.
 * @param deps Injected collaborators and limits (`maxAttempts` defaults to 5,
 *   `noProgressLimit` to 2).
 * @returns `done` with the passing report, or `blocked` with reason
 *   `"no_progress"` (same failure signature repeated `noProgressLimit` times
 *   in a row) or `"max_attempts"`.
 */
export async function executeTask(
  task: VerificationTask,
  deps: ExecuteTaskDeps,
): Promise<ExecuteTaskOutcome> {
  const maxAttempts = deps.maxAttempts ?? 5;
  const noProgressLimit = deps.noProgressLimit ?? 2;

  task.status = "in_progress";
  let prevSig: string | null = null;
  let noProgressStreak = 0;
  let lastReport: VerificationReport | null = null;

  while (task.attempts < maxAttempts) {
    task.attempts++;

    // EXECUTE — with the prior failures fed back as concrete instructions.
    const failureFeedback = task.lastFailures?.length
      ? formatFailureFeedback(task.lastFailures)
      : "";
    await deps.runExecution({
      task,
      failureFeedback,
      attempt: task.attempts,
    });

    // TEST
    const report = await deps.verify(task);
    lastReport = report;

    if (report.passed) {
      // FINALIZE
      task.status = "done";
      task.lastFailures = [];
      await deps.persist?.(task);
      return { status: "done", report, attempts: task.attempts };
    }

    // KEEP FIXING — persist the concrete failures so the next attempt (even in
    // a later HTTP turn) resumes with full context.
    task.lastFailures = report.failures;
    await deps.persist?.(task);

    // Detect no progress: the same failures with the same evidence as the
    // previous cycle. The first failing cycle only establishes the baseline.
    const sig = failureSignature(report.failures);
    if (prevSig !== null && sig === prevSig) {
      noProgressStreak++;
    } else {
      noProgressStreak = 0;
    }
    prevSig = sig;

    if (noProgressStreak >= noProgressLimit) {
      task.status = "blocked";
      await deps.persist?.(task);
      return {
        status: "blocked",
        report,
        attempts: task.attempts,
        reason: "no_progress",
        failures: report.failures,
      };
    }
  }

  // Hit the attempt ceiling without going green.
  task.status = "blocked";
  await deps.persist?.(task);
  return {
    status: "blocked",
    report: lastReport,
    attempts: task.attempts,
    reason: "max_attempts",
    // A resumed task that is already at the ceiling never enters the loop, so
    // no report exists; fall back to its persisted failures rather than
    // reporting "blocked" with nothing to show for it.
    failures: lastReport?.failures ?? task.lastFailures ?? [],
  };
}
|
||||
108
lib/ai/verification/generation.ts
Normal file
108
lib/ai/verification/generation.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
/**
|
||||
* Acceptance-check generation + feedback formatting.
|
||||
*
|
||||
* - The Planner emits `acceptanceChecks` per task using a strict schema; we
|
||||
* validate/normalize that output here (models are not trustworthy emitters).
|
||||
* - On a failed verification we format the failures into concrete, structured
|
||||
* feedback that the next execution round consumes — this is what makes the
|
||||
* model FIX rather than guess.
|
||||
*/
|
||||
|
||||
import type { AcceptanceCheck, CheckKind, CheckResult } from "./types";
|
||||
|
||||
// Every check kind accepted from the model; anything else is discarded
// during normalization.
const VALID_KINDS: CheckKind[] = [
  // shell-exit checks
  "build", "typecheck", "test",
  // running-app checks
  "server_up", "route_ok", "console_clean", "content", "flow",
  // remaining kinds
  "visual", "data",
];

// Kinds that default to advisory (`hard: false`) and so never block "done"
// unless a check explicitly sets `hard`.
const SOFT_KINDS = new Set<CheckKind>(["visual"]);
|
||||
|
||||
/**
 * Validate and normalize a raw `acceptanceChecks` array from the model.
 *
 * - Non-array input yields `[]`; non-object items and unknown kinds are dropped.
 * - `spec` must be a plain object; anything else (including an array) becomes `{}`.
 * - `hard` defaults to `true` except for soft-by-default kinds.
 * - `description` is trimmed and falls back to the kind name when blank.
 * - At most 3 checks are kept, in input order.
 *
 * @param raw Untrusted value emitted by the model.
 * @returns The cleaned list of checks (possibly empty).
 */
export function normalizeAcceptanceChecks(raw: unknown): AcceptanceCheck[] {
  if (!Array.isArray(raw)) return [];
  const out: AcceptanceCheck[] = [];
  for (const item of raw) {
    if (!item || typeof item !== "object") continue;
    const o = item as Record<string, unknown>;
    const kind = o.kind as CheckKind;
    if (!VALID_KINDS.includes(kind)) continue;
    // Arrays are `typeof "object"` too; letting one through would hand the
    // runners a non-record spec and break kind+spec de-duplication against the
    // baseline (it serializes as "[]" rather than "{}").
    const spec =
      o.spec && typeof o.spec === "object" && !Array.isArray(o.spec)
        ? (o.spec as Record<string, unknown>)
        : {};
    const hard =
      typeof o.hard === "boolean" ? o.hard : !SOFT_KINDS.has(kind);
    const description =
      typeof o.description === "string" && o.description.trim()
        ? o.description.trim()
        : kind;
    out.push({ kind, hard, description, spec });
    if (out.length >= 3) break; // keep contracts tight (1–3 checks)
  }
  return out;
}
|
||||
|
||||
/**
|
||||
* Instruction appended to the Planner's system prompt so each task it creates
|
||||
* carries a checkable contract.
|
||||
*/
|
||||
export const CHECK_GENERATION_PROMPT = `
|
||||
[ACCEPTANCE CHECKS] For every task you create, attach \`acceptanceChecks\`: a JSON
|
||||
array of 1–3 checks that objectively prove THIS task is done.
|
||||
Each check: { "kind": <kind>, "hard": <bool>, "description": <string>, "spec": { ... } }
|
||||
Allowed kinds and their spec:
|
||||
- build spec: {} (compiles)
|
||||
- typecheck spec: {} (no type errors)
|
||||
- test spec: { command?: string } (tests pass)
|
||||
- server_up spec: { port?: number } (app boots, 200)
|
||||
- route_ok spec: { url: string, expectedStatus?: number }
|
||||
- console_clean spec: { url?: string } (no JS errors)
|
||||
- content spec: { url: string, contains: string } (text present)
|
||||
- flow spec: { startUrl: string, expectContains: string }
|
||||
- visual spec: { targetPath: string, minScore?: number } (soft)
|
||||
- data spec: { command: string } (records exist)
|
||||
Rules:
|
||||
- build + server_up + console_clean are added AUTOMATICALLY. Do NOT repeat them.
|
||||
- Add only checks that prove THIS task's specific behavior.
|
||||
- Prefer the cheapest proof: route_ok/content over flow, flow over visual.
|
||||
- If a task is not objectively verifiable (e.g. "make the copy friendlier"),
|
||||
return an empty acceptanceChecks array and set "requiresHumanConfirm": true.
|
||||
Do NOT fabricate a check you cannot actually verify.
|
||||
`.trim();
|
||||
|
||||
/**
 * Render failed checks as a concrete instruction block for the next execution
 * round: one bullet per failure naming the check kind, its description and the
 * captured evidence, followed by a directive to fix exactly those failures.
 *
 * @param failures Failed check results to report.
 * @returns The feedback message, or `""` when `failures` is empty.
 */
export function formatFailureFeedback(failures: CheckResult[]): string {
  if (failures.length === 0) return "";

  const bullets: string[] = [];
  for (const failure of failures) {
    const { kind, description } = failure.check;
    bullets.push(`- ${kind} (${description}): FAILED — ${failure.evidence}`);
  }

  const header =
    "[VERIFICATION FAILED] Your last changes did not pass these checks:\n";
  const directive =
    "\nFix these specific failures. Do not claim success until every check passes. " +
    "Address the exact errors above — read the relevant files first if needed.";
  return header + bullets.join("\n") + directive;
}
|
||||
|
||||
/**
 * Build an order-independent fingerprint of a set of failures, used to detect
 * that two verification runs failed in exactly the same way. Each failure
 * contributes `kind:evidence`; entries are sorted and joined with `;;`.
 */
export function failureSignature(failures: CheckResult[]): string {
  const entries: string[] = [];
  for (const failure of failures) {
    entries.push(`${failure.check.kind}:${failure.evidence}`);
  }
  entries.sort();
  return entries.join(";;");
}
|
||||
92
lib/ai/verification/harness.ts
Normal file
92
lib/ai/verification/harness.ts
Normal file
@@ -0,0 +1,92 @@
|
||||
/**
|
||||
* Verification harness — runs a task's contract and returns a structured
|
||||
* pass/fail report. This is the single source of truth for "is the task done".
|
||||
*/
|
||||
|
||||
import type {
|
||||
AcceptanceCheck,
|
||||
CheckResult,
|
||||
ExecCtx,
|
||||
VerificationReport,
|
||||
VerificationTask,
|
||||
} from "./types";
|
||||
import { runCheck } from "./runners";
|
||||
|
||||
/**
 * Baseline contract auto-attached to every code task: even when the Planner
 * supplies no checks, a task cannot be "done" while the project fails to
 * build or the dev server does not come up.
 *
 * @param previewUrl Preview URL of the running app, if known. The
 *   `console_clean` check needs a URL, so it is only included when this is
 *   set — otherwise the contract would fail on a check that cannot run.
 * @returns `build` and `server_up` (port 3000), plus `console_clean` when a
 *   preview URL is available. All baseline checks are hard.
 */
export function baselineChecks(previewUrl?: string): AcceptanceCheck[] {
  const build: AcceptanceCheck = {
    kind: "build",
    hard: true,
    description: "Project builds without errors",
    spec: {},
  };
  const serverUp: AcceptanceCheck = {
    kind: "server_up",
    hard: true,
    description: "Dev server boots and responds 200",
    spec: { port: 3000 },
  };
  if (!previewUrl) return [build, serverUp];

  const consoleClean: AcceptanceCheck = {
    kind: "console_clean",
    hard: true,
    description: "Preview has no runtime console errors",
    spec: { url: previewUrl },
  };
  return [build, serverUp, consoleClean];
}
|
||||
|
||||
// Identity of a check for de-duplication: its kind plus its serialized spec.
const KEY = (c: AcceptanceCheck) => {
  const specJson = JSON.stringify(c.spec ?? {});
  return `${c.kind}:${specJson}`;
};

/**
 * Combine a task's own checks with the baseline contract.
 *
 * A baseline check is skipped when the task already declares one with the same
 * kind and spec. The result is ordered hard checks first so a short-circuiting
 * run hits the blocking checks before any advisory one.
 *
 * @param checks Checks declared on the task (not mutated).
 * @param previewUrl Preview URL forwarded to the baseline builder.
 */
export function withBaseline(
  checks: AcceptanceCheck[],
  previewUrl?: string,
): AcceptanceCheck[] {
  const declared = new Set(checks.map(KEY));
  const missingBaseline = baselineChecks(previewUrl).filter(
    (candidate) => !declared.has(KEY(candidate)),
  );
  const combined = [...checks, ...missingBaseline];
  combined.sort((a, b) => Number(b.hard) - Number(a.hard));
  return combined;
}
|
||||
|
||||
/** Tuning flags for {@link runVerificationContract}. */
export interface RunContractOptions {
  /**
   * When true, only the task's own checks run and the automatic baseline is
   * not added (e.g. for a pure data/research task).
   */
  noBaseline?: boolean;

  /** End the run at the first HARD failure (cheaper). Defaults to true. */
  shortCircuit?: boolean;
}
|
||||
|
||||
/**
 * Run a task's verification contract and summarize the outcome.
 *
 * Checks are executed one at a time, hard checks first. Unless `noBaseline` is
 * set, the baseline checks are merged in. With `shortCircuit` (the default)
 * the run stops right after the first failing hard check, so later checks are
 * absent from `results`.
 *
 * @param task Task whose `acceptanceChecks` form the contract.
 * @param ctx Execution context handed to every check runner.
 * @param opts Optional run flags.
 * @returns `results` for every check that ran, `failures` limited to failed
 *   hard checks, and `passed` true when there are no such failures.
 */
export async function runVerificationContract(
  task: VerificationTask,
  ctx: ExecCtx,
  opts: RunContractOptions = {},
): Promise<VerificationReport> {
  const skipBaseline = opts.noBaseline === undefined ? false : opts.noBaseline;
  const stopOnHardFailure =
    opts.shortCircuit === undefined ? true : opts.shortCircuit;

  let contract: AcceptanceCheck[];
  if (skipBaseline) {
    contract = [...task.acceptanceChecks];
    contract.sort((a, b) => Number(b.hard) - Number(a.hard));
  } else {
    contract = withBaseline(task.acceptanceChecks, ctx.previewUrl);
  }

  const results: CheckResult[] = [];
  for (const check of contract) {
    const outcome = await runCheck(check, ctx);
    results.push(outcome);
    const hardFailure = !outcome.pass && check.hard;
    if (stopOnHardFailure && hardFailure) break;
  }

  // Soft (advisory) failures stay in `results` but never fail the contract.
  const failures = results.filter(
    (outcome) => !outcome.pass && outcome.check.hard,
  );
  return { passed: failures.length === 0, results, failures };
}
|
||||
5
lib/ai/verification/index.ts
Normal file
5
lib/ai/verification/index.ts
Normal file
@@ -0,0 +1,5 @@
|
||||
export * from "./types";
|
||||
export * from "./runners";
|
||||
export * from "./harness";
|
||||
export * from "./generation";
|
||||
export * from "./executor";
|
||||
269
lib/ai/verification/runners.ts
Normal file
269
lib/ai/verification/runners.ts
Normal file
@@ -0,0 +1,269 @@
|
||||
/**
|
||||
* Acceptance check runners.
|
||||
*
|
||||
* Each runner maps a single AcceptanceCheck to a deterministic tool invocation
|
||||
* and returns a structured { pass, evidence }. Runners depend only on the
|
||||
* injected ToolExecutor, so they are fully unit-testable with mocked outputs.
|
||||
*/
|
||||
|
||||
import type {
|
||||
AcceptanceCheck,
|
||||
CheckKind,
|
||||
CheckResult,
|
||||
ExecCtx,
|
||||
} from "./types";
|
||||
|
||||
// ── helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Mask credential-looking substrings before text is surfaced as evidence.
 *
 * Applied in order: Postgres connection URLs that embed `user:password@`,
 * three-part `eyJ…` tokens (JWT shape), and finally any run of 40 or more
 * characters drawn from letters, digits, `_` and `-`.
 */
export function redact(s: string): string {
  const rules: Array<[RegExp, string]> = [
    [
      /postgres(?:ql)?:\/\/[^:\s]+:[^@\s]+@[^/\s]+\/[^\s"']+/gi,
      "postgresql://[REDACTED_DB_URL]",
    ],
    [
      /eyJ[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}\.[a-zA-Z0-9_-]{5,}/g,
      "[REDACTED_JWT]",
    ],
    [/\b[A-Za-z0-9_-]{40,}\b/g, "[REDACTED_SECRET]"],
  ];

  let masked = s;
  for (const [pattern, placeholder] of rules) {
    masked = masked.replace(pattern, placeholder);
  }
  return masked;
}
|
||||
|
||||
/**
 * Produce a compact, redacted, single-line excerpt of `s`.
 *
 * Nullish input is treated as empty; whitespace runs collapse to one space and
 * the ends are trimmed before redaction. Text longer than `n` characters is
 * cut to `n` and suffixed with an ellipsis.
 *
 * @param s Text to shorten.
 * @param n Maximum length before truncation (default 400).
 */
export function clip(s: string, n = 400): string {
  const flattened = String(s ?? "").replace(/\s+/g, " ").trim();
  const safe = redact(flattened);
  if (safe.length <= n) return safe;
  return safe.slice(0, n) + "…";
}
|
||||
|
||||
/**
 * Normalized view of a raw tool result. Tool results arrive as a JSON string
 * whose shape varies by tool, so fields are extracted defensively; some tools
 * double-wrap their payload (a `stdout` field that is itself JSON), in which
 * case one level is unwrapped.
 */
export interface ParsedToolResult {
  /** Exit code taken from `code` or `exitCode`; `null` when neither is numeric. */
  code: number | null;
  /** Captured standard output, `""` when absent. */
  stdout: string;
  /** Captured standard error, `""` when absent. */
  stderr: string;
  /** Numeric status, e.g. `healthCheck.status` or a top-level `status`. */
  status: number | null;
  /** The untouched input, coerced to a string. */
  raw: string;
  /** The outermost parsed JSON object, or `null` if the input was not one. */
  obj: Record<string, unknown> | null;
}
|
||||
|
||||
/**
 * Parse a raw tool result string into a {@link ParsedToolResult}.
 *
 * Input that is not a JSON object yields the empty shape with only `raw` set.
 * When the outer object has a string `stdout` but neither `code` nor
 * `healthCheck`, that `stdout` is tried as JSON and, if it is an object, the
 * fields are read from it instead (one level of unwrapping). `exitCode` wins
 * over `code`, and a top-level `status` wins over `healthCheck.status`.
 *
 * @param raw Tool output as returned by the executor.
 */
export function parseToolResult(raw: string): ParsedToolResult {
  const text = String(raw ?? "");

  // JSON text → object, or null for invalid JSON / non-object values.
  const asObject = (json: string): Record<string, unknown> | null => {
    try {
      const value: unknown = JSON.parse(json);
      return value && typeof value === "object"
        ? (value as Record<string, unknown>)
        : null;
    } catch {
      return null;
    }
  };

  const outer = asObject(text);
  if (!outer) {
    return { code: null, stdout: "", stderr: "", status: null, raw: text, obj: null };
  }

  // Some wrappers nest the real payload under `stdout` as a JSON string.
  const looksWrapped =
    typeof outer.stdout === "string" &&
    outer.code === undefined &&
    outer.healthCheck === undefined;
  const payload =
    (looksWrapped ? asObject(outer.stdout as string) : null) ?? outer;

  let code: number | null = null;
  if (typeof payload.exitCode === "number") code = payload.exitCode;
  else if (typeof payload.code === "number") code = payload.code;

  let status: number | null = null;
  const health = payload.healthCheck as { status?: number } | undefined;
  if (typeof payload.status === "number") status = payload.status;
  else if (health && typeof health.status === "number") status = health.status;

  return {
    code,
    stdout: typeof payload.stdout === "string" ? payload.stdout : "",
    stderr: typeof payload.stderr === "string" ? payload.stderr : "",
    status,
    raw: text,
    obj: outer,
  };
}
|
||||
|
||||
/** Build a passing result for `check`; the evidence text is clipped. */
function ok(check: AcceptanceCheck, evidence = "passed"): CheckResult {
  const summary = clip(evidence);
  return { check, pass: true, evidence: summary };
}
|
||||
/** Build a failing result for `check`; the evidence text is clipped. */
function fail(check: AcceptanceCheck, evidence: string): CheckResult {
  const summary = clip(evidence);
  return { check, pass: false, evidence: summary };
}
|
||||
|
||||
/** Read `spec[key]` when it is a string; otherwise return `dflt` (default `""`). */
function str(spec: Record<string, unknown>, key: string, dflt = ""): string {
  const candidate = spec[key];
  if (typeof candidate === "string") return candidate;
  return dflt;
}
|
||||
/** Read `spec[key]` when it is a number; otherwise return `dflt`. */
function num(
  spec: Record<string, unknown>,
  key: string,
  dflt: number,
): number {
  const candidate = spec[key];
  if (typeof candidate === "number") return candidate;
  return dflt;
}
|
||||
|
||||
// ── runners ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run a shell command through the injected executor and judge it by exit code.
 *
 * @param check The check being evaluated (echoed back in the result).
 * @param ctx Execution context; `ctx.exec("shell_exec", …)` runs the command
 *   for `ctx.projectId`.
 * @param command Shell command line to run.
 * @param label Human-readable name used in the evidence text.
 * @returns A pass when the parsed exit code is exactly 0. Otherwise a failure
 *   whose evidence carries stderr, else stdout, else the exit code. A blank
 *   `command` fails immediately without invoking the executor.
 */
async function runShellExit(
  check: AcceptanceCheck,
  ctx: ExecCtx,
  command: string,
  label: string,
): Promise<CheckResult> {
  // A check whose spec lacks a command (e.g. `data`, which has no default)
  // cannot prove anything; fail it explicitly, the same way the URL-based
  // runners reject a missing url, instead of sending an empty command line.
  if (!command.trim()) return fail(check, `${label} is missing a command`);

  const raw = await ctx.exec("shell_exec", {
    projectId: ctx.projectId,
    command,
  });
  const r = parseToolResult(raw);
  if (r.code === 0) return ok(check, `${label} passed`);
  const detail = r.stderr || r.stdout || `exit ${r.code ?? "?"}`;
  return fail(check, `${label} failed (exit ${r.code ?? "?"}): ${detail}`);
}
|
||||
|
||||
/**
 * Quote `value` so a POSIX shell treats it as exactly one literal word.
 *
 * The value is wrapped in single quotes and every embedded single quote is
 * rewritten as `'\''`. Nothing is expanded inside single quotes, whereas the
 * double-quoted form produced by `JSON.stringify` still lets `$(...)`,
 * backticks and `$VAR` run — which matters because check URLs come from the
 * (model-generated) check spec.
 */
function quoteShellArg(value: string): string {
  return `'${value.replace(/'/g, "'\\''")}'`;
}

/**
 * Dispatch table with one runner per `CheckKind`.
 *
 * Every runner receives the check and the execution context, drives the
 * relevant tool through `ctx.exec`, parses the raw tool output with
 * `parseToolResult`, and reports the outcome via `ok` / `fail`.
 */
const RUNNERS: Record<
  CheckKind,
  (check: AcceptanceCheck, ctx: ExecCtx) => Promise<CheckResult>
> = {
  // ── shell-exit checks: the command's exit code decides the result ──────
  build: (c, ctx) =>
    runShellExit(c, ctx, str(c.spec, "command", "npm run build"), "build"),

  typecheck: (c, ctx) =>
    runShellExit(
      c,
      ctx,
      str(c.spec, "command", "npx tsc --noEmit"),
      "typecheck",
    ),

  test: (c, ctx) =>
    runShellExit(c, ctx, str(c.spec, "command", "npm test"), "tests"),

  data: async (c, ctx) => {
    // There is no sensible default query, so a missing command is reported
    // as a contract error instead of handing an empty string to the shell.
    const command = str(c.spec, "command", "");
    if (!command) return fail(c, "data check is missing a command");
    return runShellExit(c, ctx, command, "data check");
  },

  // ── dev server: start it and require a 200 status in the tool result ───
  server_up: async (c, ctx) => {
    const raw = await ctx.exec("dev_server_start", {
      projectId: ctx.projectId,
      command: str(c.spec, "command", "npm run dev"),
      port: num(c.spec, "port", 3000),
    });
    const r = parseToolResult(raw);
    if (r.status === 200) return ok(c, "dev server returned 200");
    return fail(
      c,
      `dev server not healthy (status ${r.status ?? "none"}): ${
        r.stderr || r.stdout || r.raw
      }`,
    );
  },

  // ── route: curl the URL and compare the HTTP status code ───────────────
  route_ok: async (c, ctx) => {
    const url = str(c.spec, "url");
    const expected = num(c.spec, "expectedStatus", 200);
    if (!url) return fail(c, "route_ok check is missing a url");
    const raw = await ctx.exec("shell_exec", {
      projectId: ctx.projectId,
      // -w prints only the status code; the response body goes to /dev/null.
      command: `curl -s -o /dev/null -w "%{http_code}" --max-time 20 ${quoteShellArg(
        url,
      )}`,
    });
    const r = parseToolResult(raw);
    // First three-digit run in the output is taken as the status code.
    const codeStr = (r.stdout || r.raw).trim().match(/\d{3}/)?.[0];
    if (codeStr && Number(codeStr) === expected)
      return ok(c, `${url} → ${codeStr}`);
    return fail(c, `${url} returned ${codeStr ?? "no response"} (expected ${expected})`);
  },

  // ── console: scan the browser_console output for error markers ─────────
  console_clean: async (c, ctx) => {
    const url = str(c.spec, "url", ctx.previewUrl ?? "");
    if (!url) return fail(c, "console_clean check is missing a url");
    const raw = await ctx.exec("browser_console", { url });
    const r = parseToolResult(raw);
    // Lower-cased once so the patterns below can stay lower-case.
    const text = (r.raw || "").toLowerCase();
    // Look for error-level console output or framework error overlays.
    const errorHit =
      /"type"\s*:\s*"error"/.test(text) ||
      /\berror\b[^"]{0,40}(overlay|boundary|uncaught|unhandled)/.test(text) ||
      /failed to compile|module not found|referenceerror|typeerror:/.test(text);
    if (!errorHit) return ok(c, "no console errors");
    return fail(c, `console errors on ${url}: ${clip(r.raw, 240)}`);
  },

  // ── content: fetch the page and look for a literal substring ───────────
  content: async (c, ctx) => {
    const url = str(c.spec, "url", ctx.previewUrl ?? "");
    const needle = str(c.spec, "contains");
    if (!url || !needle)
      return fail(c, "content check requires both `url` and `contains`");
    const raw = await ctx.exec("shell_exec", {
      projectId: ctx.projectId,
      command: `curl -s --max-time 20 ${quoteShellArg(url)}`,
    });
    const r = parseToolResult(raw);
    const body = r.stdout || r.raw;
    if (body.includes(needle)) return ok(c, `found "${needle}"`);
    return fail(c, `"${needle}" not found on ${url}`);
  },

  flow: async (c, ctx) => {
    // A basic journey assertion: navigate to startUrl, then assert the page
    // body contains `expectContains` (or that a follow URL is reachable).
    const startUrl = str(c.spec, "startUrl", ctx.previewUrl ?? "");
    const expectContains = str(c.spec, "expectContains");
    if (!startUrl) return fail(c, "flow check is missing a startUrl");
    const raw = await ctx.exec("browser_navigate", { url: startUrl });
    const r = parseToolResult(raw);
    const body = (r.stdout || r.raw).toString();
    if (expectContains && !body.includes(expectContains))
      return fail(c, `flow on ${startUrl}: did not reach "${expectContains}"`);
    // With no explicit expectation, fall back to a keyword heuristic for
    // error pages.
    if (/error|cannot|failed/i.test(body) && !expectContains)
      return fail(c, `flow on ${startUrl} hit an error page`);
    return ok(c, `flow reached expected state`);
  },

  // ── visual: delegate to request_visual_qa and apply a score threshold ──
  visual: async (c, ctx) => {
    const targetPath = str(c.spec, "targetPath");
    if (!targetPath) return fail(c, "visual check is missing a targetPath");
    const raw = await ctx.exec("request_visual_qa", {
      projectId: ctx.projectId,
      targetPath,
    });
    const r = parseToolResult(raw);
    const obj = r.obj as { score?: number; passed?: boolean } | null;
    const threshold = num(c.spec, "minScore", 7);
    if (obj?.passed === true) return ok(c, "visual QA passed");
    if (typeof obj?.score === "number")
      return obj.score >= threshold
        ? ok(c, `visual QA score ${obj.score}`)
        : fail(c, `visual QA score ${obj.score} < ${threshold}`);
    // An explicit negative verdict without a score is still a failure; it
    // must not be reported as a pass.
    if (obj?.passed === false) return fail(c, "visual QA did not pass");
    // No structured verdict at all — treat as advisory pass (soft checks
    // won't block).
    return ok(c, "visual QA ran (no numeric score)");
  },
};
|
||||
|
||||
/**
 * Run a single acceptance check and always resolve to a `CheckResult`.
 *
 * @param check The check to run; `check.kind` selects the runner.
 * @param ctx   Execution context handed through to the runner.
 * @returns The runner's result, or a failing result when the kind has no
 *          runner or the runner throws. This function itself never rejects.
 */
export async function runCheck(
  check: AcceptanceCheck,
  ctx: ExecCtx,
): Promise<CheckResult> {
  // Own-property lookup only: a malformed kind such as "toString" or
  // "constructor" must not resolve to an inherited Object.prototype member
  // and then be invoked as if it were a runner.
  const runner = Object.prototype.hasOwnProperty.call(RUNNERS, check.kind)
    ? RUNNERS[check.kind]
    : undefined;
  if (typeof runner !== "function")
    return fail(check, `unknown check kind: ${check.kind}`);

  try {
    return await runner(check, ctx);
  } catch (e) {
    // A crashing runner is reported as a failed check rather than propagated.
    return fail(
      check,
      `check runner errored: ${e instanceof Error ? e.message : String(e)}`,
    );
  }
}
|
||||
71
lib/ai/verification/types.ts
Normal file
71
lib/ai/verification/types.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
/**
|
||||
* Acceptance / Verification layer — types.
|
||||
*
|
||||
* A task is NOT "done" because the model stops calling tools; it is done when
|
||||
* its Verification Contract passes. The contract is a small list of
|
||||
* deterministic, machine-runnable checks attached to the task.
|
||||
*/
|
||||
|
||||
/** Every kind of check a verification contract can contain. */
export type CheckKind =
  // Code compiles (npm run build).
  | "build"
  // No type errors (tsc --noEmit).
  | "typecheck"
  // Unit/integration tests pass (npm test).
  | "test"
  // Dev server boots and returns 200.
  | "server_up"
  // A route/endpoint returns the expected status code.
  | "route_ok"
  // No runtime JS console errors on a page.
  | "console_clean"
  // Expected text/element is present on a page.
  | "content"
  // A user journey works (navigate + assert).
  | "flow"
  // UI meets a design rubric (request_visual_qa).
  | "visual"
  // Seed/records exist (a query returns expected rows).
  | "data";
|
||||
|
||||
/** One machine-runnable check attached to a task. */
export interface AcceptanceCheck {
  /** Which kind of check this is. */
  kind: CheckKind;

  /** `true` for hard checks, which gate "done"; soft checks are advisory and never block. */
  hard: boolean;

  /** Human-readable summary, shown in build-health reports. */
  description: string;

  /** Parameters specific to the kind (command, url, expectedStatus, etc.). */
  spec: Record<string, unknown>;
}
|
||||
|
||||
/** Outcome of running one acceptance check. */
export interface CheckResult {
  /** The check this result belongs to. */
  check: AcceptanceCheck;

  /** Whether the check passed. */
  pass: boolean;

  /** Evidence (redacted and truncated) that is fed back to the model on failure. */
  evidence: string;
}
|
||||
|
||||
/** Aggregate outcome of running a verification contract. */
export interface VerificationReport {
  /** `true` only if every HARD check passed. */
  passed: boolean;

  /** The individual check results. */
  results: CheckResult[];

  /** Only the hard failures — the ones the model has to fix. */
  failures: CheckResult[];
}
|
||||
|
||||
/** A task together with its verification contract and fix-loop bookkeeping. */
export interface VerificationTask {
  /** Task identifier. */
  id: string;

  /** Short title of the task. */
  title: string;

  /** Current lifecycle state. */
  status: "open" | "in_progress" | "done" | "blocked";

  /** The checks that make up this task's verification contract. */
  acceptanceChecks: AcceptanceCheck[];

  /** Attempt counter for this task. */
  attempts: number;

  /** Failing results recorded for the task, when present. */
  lastFailures?: CheckResult[];

  /** Set for tasks that cannot be verified objectively (e.g. "make copy friendlier"). */
  requiresHumanConfirm?: boolean;
}
|
||||
|
||||
/**
 * The agent's tool execution, abstracted as a function: given a tool name and
 * its arguments, it resolves to the raw tool result string (usually JSON).
 * Because it is injected, every runner can be unit-tested against a fake.
 */
export type ToolExecutor = (
  name: string,
  args: Record<string, unknown>,
) => Promise<string>;
|
||||
|
||||
/** Execution context handed to every check runner. */
export interface ExecCtx {
  /** Project the checks run against. */
  projectId: string;

  /** Preview URL of the running dev server, if it is known. */
  previewUrl?: string;

  /** Tool executor used to run the underlying tools. */
  exec: ToolExecutor;
}
|
||||
365
lib/ai/verification/verification.test.ts
Normal file
365
lib/ai/verification/verification.test.ts
Normal file
@@ -0,0 +1,365 @@
|
||||
import { describe, it, expect, vi } from "vitest";
|
||||
import { parseToolResult, runCheck, clip, redact } from "./runners";
|
||||
import { withBaseline, runVerificationContract } from "./harness";
|
||||
import {
|
||||
normalizeAcceptanceChecks,
|
||||
formatFailureFeedback,
|
||||
failureSignature,
|
||||
} from "./generation";
|
||||
import { executeTask } from "./executor";
|
||||
import type {
|
||||
AcceptanceCheck,
|
||||
ExecCtx,
|
||||
ToolExecutor,
|
||||
VerificationReport,
|
||||
VerificationTask,
|
||||
} from "./types";
|
||||
|
||||
/**
 * Build a fake tool executor backed by a lookup table: the tool name selects
 * a canned raw result string, and tools missing from the table yield "{}".
 */
function mockExec(map: Record<string, string>): ToolExecutor {
  return async function cannedExec(name: string): Promise<string> {
    const canned = map[name];
    return canned ?? "{}";
  };
}
|
||||
/**
 * Build an execution context for project "p1" whose tools are answered from
 * `map`, optionally carrying a preview URL.
 */
function ctx(map: Record<string, string>, previewUrl?: string): ExecCtx {
  const exec = mockExec(map);
  return { projectId: "p1", previewUrl, exec };
}
|
||||
/**
 * Build an acceptance check of the given kind. The description mirrors the
 * kind, the spec defaults to an empty object and the check is hard unless
 * `hard` is passed as false.
 */
function check(
  kind: AcceptanceCheck["kind"],
  spec: Record<string, unknown> = {},
  hard = true,
): AcceptanceCheck {
  return { kind, hard, description: kind, spec };
}
|
||||
|
||||
/** Build a fresh, open task ("t1") with zero attempts and the given checks. */
function task(checks: AcceptanceCheck[]): VerificationTask {
  return {
    id: "t1",
    title: "Test task",
    status: "open",
    acceptanceChecks: checks,
    attempts: 0,
  };
}
|
||||
|
||||
// ── parsing ──────────────────────────────────────────────────────────────
|
||||
|
||||
describe("parseToolResult", () => {
  it("extracts code from a shell result", () => {
    const payload = JSON.stringify({ code: 1, stdout: "", stderr: "boom" });
    const parsed = parseToolResult(payload);
    expect(parsed.code).toBe(1);
    expect(parsed.stderr).toBe("boom");
  });

  it("unwraps a double-nested stdout JSON payload", () => {
    // The outer result's stdout is itself a JSON-encoded shell result.
    const inner = JSON.stringify({ code: 0, stdout: "ok" });
    const parsed = parseToolResult(JSON.stringify({ stdout: inner }));
    expect(parsed.code).toBe(0);
    expect(parsed.stdout).toBe("ok");
  });

  it("reads healthCheck.status for server checks", () => {
    const payload = JSON.stringify({
      previewUrl: "x",
      healthCheck: { status: 200 },
    });
    const parsed = parseToolResult(payload);
    expect(parsed.status).toBe(200);
  });

  it("survives non-JSON", () => {
    const parsed = parseToolResult("not json");
    expect(parsed.code).toBeNull();
    expect(parsed.raw).toBe("not json");
  });
});
|
||||
|
||||
describe("redaction", () => {
  it("redacts db urls and jwts and long secrets", () => {
    // One line carrying a database URL, a JWT-shaped token and a long key.
    const scrubbed = redact(
      "db postgresql://u:p4ssword@host:5432/mydb token eyJhbGciOiJIUzI1.eyJzdWIiOjEy.SflKxwRJSMeKKF secret sk_live_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789xyz",
    );
    for (const marker of [
      "[REDACTED_DB_URL]",
      "[REDACTED_JWT]",
      "[REDACTED_SECRET]",
    ]) {
      expect(scrubbed).toContain(marker);
    }
  });

  it("clip truncates and trims long non-secret text", () => {
    const long = "the quick brown fox jumps over the lazy dog. ".repeat(20);
    const clipped = clip(long);
    expect(clipped.endsWith("…")).toBe(true);
    expect(clipped.length).toBeLessThanOrEqual(401);
  });
});
|
||||
|
||||
// ── runners ──────────────────────────────────────────────────────────────
|
||||
|
||||
describe("runners", () => {
  it("build passes on exit 0, fails on non-zero with stderr", async () => {
    const greenCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "done" }),
    });
    const green = await runCheck(check("build"), greenCtx);
    expect(green.pass).toBe(true);

    const redCtx = ctx({
      shell_exec: JSON.stringify({
        code: 1,
        stderr: "Type error on auth.ts:14",
      }),
    });
    const red = await runCheck(check("build"), redCtx);
    expect(red.pass).toBe(false);
    expect(red.evidence).toContain("auth.ts:14");
  });

  it("server_up passes on 200, fails otherwise", async () => {
    // Canned dev_server_start result reporting the given health status.
    const serverCtx = (status: number) =>
      ctx({ dev_server_start: JSON.stringify({ healthCheck: { status } }) });

    const healthy = await runCheck(check("server_up"), serverCtx(200));
    expect(healthy.pass).toBe(true);

    const unhealthy = await runCheck(check("server_up"), serverCtx(502));
    expect(unhealthy.pass).toBe(false);
  });

  it("route_ok matches the expected status code", async () => {
    const okCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "200" }),
    });
    const matched = await runCheck(
      check("route_ok", { url: "http://x/dashboard", expectedStatus: 200 }),
      okCtx,
    );
    expect(matched.pass).toBe(true);

    // No expectedStatus in the spec here; the route answers 404.
    const notFoundCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "404" }),
    });
    const mismatched = await runCheck(
      check("route_ok", { url: "http://x/dashboard" }),
      notFoundCtx,
    );
    expect(mismatched.pass).toBe(false);
    expect(mismatched.evidence).toContain("404");
  });

  it("console_clean fails when an error is present", async () => {
    const noisyCtx = ctx({
      browser_console: JSON.stringify([{ type: "error", text: "boom" }]),
    });
    const noisy = await runCheck(
      check("console_clean", { url: "http://x" }),
      noisyCtx,
    );
    expect(noisy.pass).toBe(false);

    const quietCtx = ctx({
      browser_console: JSON.stringify([{ type: "log", text: "ok" }]),
    });
    const quiet = await runCheck(
      check("console_clean", { url: "http://x" }),
      quietCtx,
    );
    expect(quiet.pass).toBe(true);
  });

  it("content checks for a substring", async () => {
    const pageCtx = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "<h1>GetAcquired</h1>" }),
    });
    const found = await runCheck(
      check("content", { url: "http://x", contains: "GetAcquired" }),
      pageCtx,
    );
    expect(found.pass).toBe(true);
  });
});
|
||||
|
||||
// ── harness ──────────────────────────────────────────────────────────────
|
||||
|
||||
describe("harness", () => {
  it("auto-attaches the baseline contract", () => {
    const merged = withBaseline([], "http://preview");
    const kinds = merged.map((c) => c.kind).sort();
    for (const expected of ["build", "server_up", "console_clean"]) {
      expect(kinds).toContain(expected);
    }
  });

  it("does not duplicate a baseline check the planner already specified", () => {
    const merged = withBaseline([check("build")]);
    const buildChecks = merged.filter((c) => c.kind === "build");
    expect(buildChecks.length).toBe(1);
  });

  it("reports passed only when all hard checks pass", async () => {
    // Every tool the run may touch answers with a healthy result.
    const allGreen = ctx({
      shell_exec: JSON.stringify({ code: 0, stdout: "200" }),
      dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }),
      browser_console: JSON.stringify([]),
    });
    const report = await runVerificationContract(
      task([check("route_ok", { url: "http://x/d" })]),
      allGreen,
      { shortCircuit: false },
    );
    expect(report.passed).toBe(true);
    expect(report.failures.length).toBe(0);
  });

  it("short-circuits on the first hard failure", async () => {
    // shell_exec always fails; every other tool returns an empty object.
    const exec = vi.fn(async (name: string) => {
      if (name !== "shell_exec") return "{}";
      return JSON.stringify({ code: 1, stderr: "build broke" });
    });
    const report = await runVerificationContract(
      task([]),
      { projectId: "p1", exec },
      { shortCircuit: true },
    );
    expect(report.passed).toBe(false);
    // build is the first hard check; we should NOT have called dev_server_start.
    expect(exec).toHaveBeenCalledWith("shell_exec", expect.anything());
    expect(exec).not.toHaveBeenCalledWith(
      "dev_server_start",
      expect.anything(),
    );
  });

  it("soft check failure does NOT block done", async () => {
    const softVisual = check("visual", { targetPath: "x" }, false);
    const toolResults = ctx({
      shell_exec: JSON.stringify({ code: 0 }),
      dev_server_start: JSON.stringify({ healthCheck: { status: 200 } }),
      browser_console: JSON.stringify([]),
      request_visual_qa: JSON.stringify({ score: 2 }),
    });
    const report = await runVerificationContract(
      task([softVisual]),
      toolResults,
      { shortCircuit: false },
    );
    // visual scored 2 (would fail) but it's soft → does not block.
    expect(report.passed).toBe(true);
  });
});
|
||||
|
||||
// ── generation ───────────────────────────────────────────────────────────
|
||||
|
||||
describe("generation", () => {
  it("normalizes and caps acceptance checks, dropping unknown kinds", () => {
    // Five raw entries, one of which has a kind that does not exist.
    const rawChecks = [
      { kind: "route_ok", spec: { url: "x" } },
      { kind: "bogus" },
      { kind: "content", spec: { url: "x", contains: "y" } },
      { kind: "build" },
      { kind: "data", spec: { command: "q" } },
    ];
    const out = normalizeAcceptanceChecks(rawChecks);
    expect(out.length).toBe(3); // capped
    expect(out.find((c) => c.kind === ("bogus" as never))).toBeUndefined();
  });

  it("defaults visual to a soft check", () => {
    const [first] = normalizeAcceptanceChecks([{ kind: "visual", spec: {} }]);
    expect(first.hard).toBe(false);
  });

  it("formats actionable failure feedback", () => {
    const buildFailure = {
      check: check("build"),
      pass: false,
      evidence: "Cannot find name foo (auth.ts:14)",
    };
    const fb = formatFailureFeedback([buildFailure]);
    expect(fb).toContain("[VERIFICATION FAILED]");
    expect(fb).toContain("auth.ts:14");
    expect(fb).toContain("Do not claim success");
  });

  it("failure signatures are stable and order-independent", () => {
    // The same two failures, listed in opposite order.
    const forward = failureSignature([
      { check: check("build"), pass: false, evidence: "x" },
      { check: check("route_ok"), pass: false, evidence: "y" },
    ]);
    const reversed = failureSignature([
      { check: check("route_ok"), pass: false, evidence: "y" },
      { check: check("build"), pass: false, evidence: "x" },
    ]);
    expect(forward).toBe(reversed);
  });
});
|
||||
|
||||
// ── executor fix-loop ────────────────────────────────────────────────────
|
||||
|
||||
/** Build a fresh passing report with no results and no failures. */
function passReport(): VerificationReport {
  return { passed: true, results: [], failures: [] };
}
|
||||
/**
 * Build a failing report whose only failure is a hard build check carrying
 * the given evidence; `results` is left empty.
 */
function failReport(evidence: string): VerificationReport {
  const buildFailure = { check: check("build"), pass: false, evidence };
  return { passed: false, results: [], failures: [buildFailure] };
}
|
||||
|
||||
describe("executeTask fix-loop", () => {
  it("FINALIZES immediately when the first verify passes", async () => {
    const runExecution = vi.fn(async () => {});
    const verify = vi.fn(async () => passReport());

    const out = await executeTask(task([]), { runExecution, verify });

    expect(out.status).toBe("done");
    expect(out.attempts).toBe(1);
    expect(runExecution).toHaveBeenCalledTimes(1);
  });

  it("KEEPS FIXING then finalizes when a later attempt passes", async () => {
    const verify = vi
      .fn()
      .mockResolvedValueOnce(failReport("err A"))
      .mockResolvedValueOnce(failReport("err B")) // different evidence = progress
      .mockResolvedValueOnce(passReport());
    // Records the feedback string handed to each execution, in call order.
    const feedbacks: string[] = [];
    const runExecution = vi.fn(async (a: { failureFeedback: string }) => {
      feedbacks.push(a.failureFeedback);
    });

    const out = await executeTask(task([]), { runExecution, verify });

    expect(out.status).toBe("done");
    expect(out.attempts).toBe(3);
    // The 2nd execution received the 1st attempt's concrete failure as context.
    expect(feedbacks[1]).toContain("err A");
  });

  it("ESCALATES (blocked: no_progress) when the same failure repeats", async () => {
    const runExecution = vi.fn(async () => {});
    const verify = vi.fn(async () => failReport("same error"));

    const out = await executeTask(task([]), {
      runExecution,
      verify,
      noProgressLimit: 2,
    });

    expect(out.status).toBe("blocked");
    if (out.status === "blocked") expect(out.reason).toBe("no_progress");
  });

  it("ESCALATES (blocked: max_attempts) if it never goes green but keeps changing", async () => {
    let n = 0;
    const runExecution = vi.fn(async () => {});
    const verify = vi.fn(async () => failReport(`err ${n++}`)); // always different

    const out = await executeTask(task([]), {
      runExecution,
      verify,
      maxAttempts: 3,
    });

    expect(out.status).toBe("blocked");
    if (out.status === "blocked") {
      expect(out.reason).toBe("max_attempts");
      expect(out.attempts).toBe(3);
    }
  });

  it("persists progress on every attempt (resume support)", async () => {
    const verify = vi
      .fn()
      .mockResolvedValueOnce(failReport("e1"))
      .mockResolvedValueOnce(passReport());
    // Attempt counter observed at each persist call.
    const persisted: number[] = [];
    const t = task([]);

    await executeTask(t, {
      runExecution: async () => {},
      verify,
      persist: (tk) => {
        persisted.push(tk.attempts);
      },
    });

    expect(persisted).toContain(1); // persisted the failing attempt
    expect(t.status).toBe("done");
  });
});
|
||||
Reference in New Issue
Block a user