vibn-agent-runner/vibn-frontend/lib/ai/verification/harness.ts

/**
 * Verification harness — runs a task's contract and returns a structured
 * pass/fail report. This is the single source of truth for "is the task done".
 */

import type {
  AcceptanceCheck,
  CheckResult,
  ExecCtx,
  VerificationReport,
  VerificationTask,
} from "./types";
import { runCheck } from "./runners";

/**
 * Builds the floor contract that is auto-attached to every code task.
 *
 * Even when the Planner supplies no checks of its own, a task must not be
 * reported as "done" while the app fails to build or the page throws. These
 * checks are what stop false completion ("I scaffolded everything ✓" when
 * nothing compiles).
 *
 * @param previewUrl - URL of the running preview, if known. When omitted (or
 *   empty) the `console_clean` check is left out, because it has nothing to
 *   inspect.
 * @returns A new array on every call containing, in order: a `build` check, a
 *   `server_up` check for port 3000 and — only when `previewUrl` is truthy — a
 *   `console_clean` check for that URL. Every returned check is hard.
 */
export function baselineChecks(previewUrl?: string): AcceptanceCheck[] {
  const buildCheck: AcceptanceCheck = {
    kind: "build",
    hard: true,
    description: "Project builds without errors",
    spec: {},
  };

  const serverCheck: AcceptanceCheck = {
    kind: "server_up",
    hard: true,
    description: "Dev server boots and responds 200",
    spec: { port: 3000 },
  };

  // Without a preview URL the console check cannot run at all, and including
  // it would fail the whole contract on an un-runnable check. (When run inside
  // the agent, the URL comes from dev_server_start.)
  if (!previewUrl) {
    return [buildCheck, serverCheck];
  }

  const consoleCheck: AcceptanceCheck = {
    kind: "console_clean",
    hard: true,
    description: "Preview has no runtime console errors",
    spec: { url: previewUrl },
  };

  return [buildCheck, serverCheck, consoleCheck];
}

/**
 * `JSON.stringify` replacer that rebuilds every plain object with its keys in
 * sorted order, so serialisation no longer depends on property insertion
 * order. Arrays and primitives pass through untouched (array order is
 * significant and must be kept).
 */
const sortObjectKeys = (_name: string, value: unknown): unknown => {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return value;
  }
  return Object.fromEntries(
    Object.entries(value).sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0)),
  );
};

/**
 * Identity key used to de-duplicate checks: the check's `kind` followed by a
 * canonical JSON rendering of its `spec` (a missing spec is treated as `{}`).
 *
 * Object keys are sorted at every nesting level before serialising, so two
 * specs that differ only in property order — e.g. `{ a: 1, b: 2 }` and
 * `{ b: 2, a: 1 }` — yield the same key instead of being treated as distinct
 * checks.
 */
const KEY = (c: AcceptanceCheck) =>
  `${c.kind}:${JSON.stringify(c.spec ?? {}, sortObjectKeys)}`;

/**
 * Combines a task's own checks with the baseline contract.
 *
 * A baseline check is appended only when no task check shares its kind+spec
 * key; the task's own checks are always kept as given. The input array is not
 * modified.
 *
 * @param checks - Checks declared on the task.
 * @param previewUrl - Forwarded to `baselineChecks` to decide whether the
 *   console check is part of the baseline.
 * @returns A new array with hard checks ordered ahead of the rest.
 */
export function withBaseline(
  checks: AcceptanceCheck[],
  previewUrl?: string,
): AcceptanceCheck[] {
  const taskKeys = new Set<string>();
  for (const own of checks) {
    taskKeys.add(KEY(own));
  }

  const missingBaseline = baselineChecks(previewUrl).filter(
    (candidate) => !taskKeys.has(KEY(candidate)),
  );

  const combined = [...checks, ...missingBaseline];
  // Hard checks go to the front so a short-circuiting run hits the cheapest
  // objective failure first.
  combined.sort((left, right) => Number(right.hard) - Number(left.hard));
  return combined;
}

/** Optional switches accepted by `runVerificationContract`. */
export interface RunContractOptions {
  /**
   * Skip the auto-baseline (e.g. for a pure data/research task), so only the
   * task's own acceptance checks are run. Defaults to `false`.
   */
  noBaseline?: boolean;
  /**
   * Stop after the first HARD failure (cheaper). Defaults to `true`. When the
   * run stops early, the remaining checks are never executed and therefore do
   * not appear in the report's results.
   */
  shortCircuit?: boolean;
}

/**
 * Runs a task's acceptance contract and summarises the outcome.
 *
 * Checks are executed one at a time, hard checks first. Unless `noBaseline`
 * is set, the baseline contract is merged into the task's own checks before
 * running.
 *
 * @param task - Task whose `acceptanceChecks` make up the contract.
 * @param ctx - Execution context handed to each check; its `previewUrl` also
 *   decides whether the baseline includes the console check.
 * @param opts - See `RunContractOptions`; both switches are optional.
 * @returns A report where `results` holds every check that actually ran,
 *   `failures` holds the failed hard checks among them, and `passed` is true
 *   when there are no such failures. Failed soft checks show up in `results`
 *   only and never affect `passed`.
 */
export async function runVerificationContract(
  task: VerificationTask,
  ctx: ExecCtx,
  opts: RunContractOptions = {},
): Promise<VerificationReport> {
  const { noBaseline: skipBaseline = false, shortCircuit: stopOnHardFailure = true } = opts;

  let ordered = task.acceptanceChecks;
  if (skipBaseline) {
    // Sort a copy so the task's own array is left untouched.
    ordered = [...ordered].sort((left, right) => Number(right.hard) - Number(left.hard));
  } else {
    ordered = withBaseline(ordered, ctx.previewUrl);
  }

  const results: CheckResult[] = [];
  for (const check of ordered) {
    const outcome = await runCheck(check, ctx);
    results.push(outcome);
    if (!stopOnHardFailure) continue;
    // A failed hard check already decides the verdict; skip the rest.
    if (!outcome.pass && check.hard) break;
  }

  const failures: CheckResult[] = [];
  for (const outcome of results) {
    if (!outcome.pass && outcome.check.hard) failures.push(outcome);
  }

  return { passed: failures.length === 0, results, failures };
}