Files

93 lines
2.9 KiB
TypeScript

/**
* Verification harness — runs a task's contract and returns a structured
* pass/fail report. This is the single source of truth for "is the task done".
*/
import type {
AcceptanceCheck,
CheckResult,
ExecCtx,
VerificationReport,
VerificationTask,
} from "./types";
import { runCheck } from "./runners";
/**
 * Builds the floor contract that is attached to every code task.
 *
 * Regardless of what the Planner asks for, a task must not be reported as
 * "done" while the project fails to build or the dev server does not answer.
 * This guards against false completion ("I scaffolded everything ✓" when
 * nothing compiles).
 *
 * @param previewUrl URL of the running preview, if known. When it is missing
 *   (or an empty string) the `console_clean` check is left out, because that
 *   check has nothing to inspect without a URL and would otherwise fail the
 *   whole contract. (Per the original note: inside the agent this URL comes
 *   from dev_server_start.)
 * @returns A freshly allocated list of hard checks, in the order
 *   `build`, `server_up`, and — only with a preview URL — `console_clean`.
 */
export function baselineChecks(previewUrl?: string): AcceptanceCheck[] {
  const buildCheck: AcceptanceCheck = {
    kind: "build",
    hard: true,
    description: "Project builds without errors",
    spec: {},
  };
  const serverCheck: AcceptanceCheck = {
    kind: "server_up",
    hard: true,
    description: "Dev server boots and responds 200",
    spec: { port: 3000 },
  };

  // Without a preview URL the console check is un-runnable, so stop here.
  if (!previewUrl) {
    return [buildCheck, serverCheck];
  }

  const consoleCheck: AcceptanceCheck = {
    kind: "console_clean",
    hard: true,
    description: "Preview has no runtime console errors",
    spec: { url: previewUrl },
  };
  return [buildCheck, serverCheck, consoleCheck];
}
/**
 * Identity key for a check: its kind, a colon, then the JSON form of its spec
 * (a missing spec is treated as `{}`). Because the spec is serialized with
 * JSON.stringify, two specs holding the same entries in a different property
 * order produce different keys.
 */
const KEY = (c: AcceptanceCheck): string => {
  const { kind, spec } = c;
  return `${kind}:${JSON.stringify(spec ?? {})}`;
};
/**
 * Combines a task's own checks with the baseline contract.
 *
 * A baseline check is appended only when no task check shares its KEY
 * (kind + serialized spec). The input array is never mutated; a new array is
 * returned, ordered so that hard checks precede soft ones, which lets a
 * caller that short-circuits stop on an objective failure early.
 *
 * NOTE(review): de-duplication ignores `hard`, so a task check with the same
 * kind and spec as a baseline check but `hard: false` suppresses the hard
 * baseline version — confirm this is intended.
 * NOTE(review): the ordering assumes `hard` is always a boolean; an undefined
 * value would make the comparator produce NaN — verify against the type.
 *
 * @param checks Checks declared by the task.
 * @param previewUrl Forwarded to baselineChecks to decide which baseline
 *   checks apply.
 * @returns Task checks followed by any missing baseline checks, hard first.
 */
export function withBaseline(
  checks: AcceptanceCheck[],
  previewUrl?: string,
): AcceptanceCheck[] {
  // Keys already claimed by the task's own checks.
  const taskKeys = new Set<string>();
  for (const own of checks) {
    taskKeys.add(KEY(own));
  }

  const missingBaseline = baselineChecks(previewUrl).filter(
    (candidate) => !taskKeys.has(KEY(candidate)),
  );

  // Sort a fresh copy so the caller's array keeps its order.
  const combined = [...checks, ...missingBaseline];
  combined.sort((left, right) => Number(right.hard) - Number(left.hard));
  return combined;
}
/** Tuning knobs accepted by runVerificationContract; every field is optional. */
export interface RunContractOptions {
  /**
   * When true, the run ends as soon as a HARD check fails, which saves the
   * cost of the remaining checks. Treated as true when omitted.
   */
  shortCircuit?: boolean;
  /**
   * When true, the automatic baseline checks are not added (useful for a
   * pure data/research task, for example). Treated as false when omitted.
   */
  noBaseline?: boolean;
}
/**
 * Runs a task's verification contract and reports the outcome.
 *
 * Checks run one at a time, hard checks first. Unless `noBaseline` is set,
 * the task's checks are merged with the baseline contract via withBaseline,
 * using `ctx.previewUrl`. The report passes only when no executed hard check
 * failed; failing soft checks appear in `results` but never in `failures`.
 *
 * @param task Task whose `acceptanceChecks` are verified (not mutated).
 * @param ctx Execution context handed to every runCheck call.
 * @param opts See RunContractOptions; defaults are `noBaseline: false` and
 *   `shortCircuit: true`.
 * @returns `results` for every check that actually ran (later checks are
 *   absent after a short-circuit), `failures` holding the failed hard ones,
 *   and `passed` set to whether `failures` is empty.
 */
export async function runVerificationContract(
  task: VerificationTask,
  ctx: ExecCtx,
  opts: RunContractOptions = {},
): Promise<VerificationReport> {
  const {
    noBaseline: skipBaseline = false,
    shortCircuit: stopOnHardFailure = true,
  } = opts;

  // Build the execution plan; both branches put hard checks first.
  let plan: AcceptanceCheck[];
  if (skipBaseline) {
    plan = [...task.acceptanceChecks];
    plan.sort((left, right) => Number(right.hard) - Number(left.hard));
  } else {
    plan = withBaseline(task.acceptanceChecks, ctx.previewUrl);
  }

  const results: CheckResult[] = [];
  for (const check of plan) {
    const outcome = await runCheck(check, ctx);
    results.push(outcome);

    if (!stopOnHardFailure) continue;
    if (outcome.pass) continue;
    // A failed hard check ends the run; a failed soft check does not.
    if (check.hard) break;
  }

  // Only hard failures decide the verdict.
  const failures: CheckResult[] = [];
  for (const outcome of results) {
    if (!outcome.pass && outcome.check.hard) {
      failures.push(outcome);
    }
  }

  return { passed: failures.length === 0, results, failures };
}