93 lines
2.9 KiB
TypeScript
93 lines
2.9 KiB
TypeScript
/**
 * Verification harness — runs a task's contract and returns a structured
 * pass/fail report. This is the single source of truth for "is the task done".
 */
|
|
|
|
import type {
|
|
AcceptanceCheck,
|
|
CheckResult,
|
|
ExecCtx,
|
|
VerificationReport,
|
|
VerificationTask,
|
|
} from "./types";
|
|
import { runCheck } from "./runners";
|
|
|
|
/**
 * Builds the baseline contract that is auto-attached to every code task.
 *
 * Even when the Planner specifies no checks, a task can never be "done" while
 * the app fails to build or the page throws — this is the floor that kills
 * false-completion ("I scaffolded everything ✓" when nothing compiles).
 *
 * The result always contains a hard `build` check followed by a hard
 * `server_up` check. A hard `console_clean` check is appended only when a
 * preview URL is supplied.
 *
 * @param previewUrl URL of the running preview. When missing (or an empty
 *   string) the `console_clean` check is omitted, because it has nothing to
 *   inspect and would otherwise fail the whole contract on an un-runnable
 *   check. (When run inside the agent, the URL comes from dev_server_start.)
 * @param port Port placed in the `server_up` check's spec. Defaults to 3000,
 *   the value that was previously hard-coded.
 * @returns A freshly allocated array of baseline checks; callers may mutate it.
 */
export function baselineChecks(
  previewUrl?: string,
  port = 3000,
): AcceptanceCheck[] {
  const build: AcceptanceCheck = {
    kind: "build",
    hard: true,
    description: "Project builds without errors",
    spec: {},
  };
  const serverUp: AcceptanceCheck = {
    kind: "server_up",
    hard: true,
    description: "Dev server boots and responds 200",
    spec: { port },
  };

  // Without a URL there is nothing for console_clean to look at.
  if (!previewUrl) return [build, serverUp];

  const consoleClean: AcceptanceCheck = {
    kind: "console_clean",
    hard: true,
    description: "Preview has no runtime console errors",
    spec: { url: previewUrl },
  };
  return [build, serverUp, consoleClean];
}
|
|
|
|
/**
 * Serialises a value to JSON with object keys emitted in sorted order at every
 * nesting level, so structurally equal objects produce equal strings no matter
 * in which order their properties were inserted. Array element order is
 * significant and is preserved.
 */
const canonicalJson = (value: unknown): string =>
  JSON.stringify(value, (_key, v: unknown) => {
    if (v === null || typeof v !== "object" || Array.isArray(v)) return v;
    const source = v as Record<string, unknown>;
    // Null prototype so that a literal "__proto__" key is stored as plain data.
    const ordered: Record<string, unknown> = Object.create(null);
    for (const k of Object.keys(source).sort()) ordered[k] = source[k];
    return ordered;
  });

/**
 * Identity key used to de-duplicate checks: `<kind>:<canonical JSON of spec>`.
 * A missing spec is treated as `{}`. The spec is serialised with sorted keys,
 * so `{ a: 1, b: 2 }` and `{ b: 2, a: 1 }` map to the same key.
 */
const KEY = (c: AcceptanceCheck): string =>
  `${c.kind}:${canonicalJson(c.spec ?? {})}`;
|
|
|
|
/**
 * Combines a task's own checks with the baseline contract.
 *
 * Every task check is kept. A baseline check is appended only when no task
 * check shares its kind+spec key, so a task-supplied check takes the place of
 * the matching baseline entry. Duplicates inside `checks` itself are left
 * untouched.
 *
 * @param checks Checks declared by the task. The array is not mutated.
 * @param previewUrl Forwarded to `baselineChecks` to decide whether the
 *   console check is part of the baseline.
 * @returns A new array with hard checks ordered ahead of the others.
 */
export function withBaseline(
  checks: AcceptanceCheck[],
  previewUrl?: string,
): AcceptanceCheck[] {
  const taskKeys = new Set(checks.map(KEY));
  const missingFromTask = baselineChecks(previewUrl).filter(
    (candidate) => !taskKeys.has(KEY(candidate)),
  );

  const combined = [...checks, ...missingFromTask];
  // Run hard checks first so we short-circuit on the cheapest objective failure.
  combined.sort((left, right) => Number(right.hard) - Number(left.hard));
  return combined;
}
|
|
|
|
/** Optional switches accepted by `runVerificationContract`. */
export interface RunContractOptions {
  /**
   * Stop running checks as soon as one HARD check fails, which is cheaper
   * than running the remainder. Default true.
   */
  shortCircuit?: boolean;
  /**
   * Run only the task's own checks, without the auto-attached baseline
   * (e.g. for a pure data/research task).
   */
  noBaseline?: boolean;
}
|
|
|
|
/**
 * Runs a task's acceptance contract and summarises the outcome.
 *
 * The contract is the task's checks merged with the baseline (via
 * `withBaseline`, using `ctx.previewUrl`), or just a copy of the task's checks
 * when `noBaseline` is set. Either way hard checks are ordered first. Checks
 * are executed one at a time through `runCheck`.
 *
 * @param task Task whose `acceptanceChecks` define the contract.
 * @param ctx Execution context handed to every check.
 * @param opts See `RunContractOptions`; `noBaseline` defaults to false and
 *   `shortCircuit` defaults to true.
 * @returns A report where `results` holds one entry per executed check,
 *   `failures` holds the failed results whose check is hard, and `passed` is
 *   true exactly when `failures` is empty. With short-circuiting enabled,
 *   checks after the first hard failure are never run and so do not appear
 *   in `results`.
 */
export async function runVerificationContract(
  task: VerificationTask,
  ctx: ExecCtx,
  opts: RunContractOptions = {},
): Promise<VerificationReport> {
  const {
    noBaseline: skipBaseline = false,
    shortCircuit: stopOnHardFailure = true,
  } = opts;

  let contract: AcceptanceCheck[];
  if (skipBaseline) {
    // Copy before sorting so the task's own array is left alone.
    contract = [...task.acceptanceChecks];
    contract.sort((left, right) => Number(right.hard) - Number(left.hard));
  } else {
    contract = withBaseline(task.acceptanceChecks, ctx.previewUrl);
  }

  const results: CheckResult[] = [];
  for (const check of contract) {
    const outcome = await runCheck(check, ctx);
    results.push(outcome);

    const hardFailure = check.hard && !outcome.pass;
    if (hardFailure && stopOnHardFailure) break;
  }

  // Only hard failures decide the verdict; failed soft checks stay in
  // `results` but do not fail the contract.
  const failures = results.filter(({ pass, check }) => !pass && check.hard);
  return { passed: failures.length === 0, results, failures };
}
|