feat(verification): acceptance-check layer + executor fix-loop; hide phase-checkpoint walls; guaranteed turn-end summary. Verification gated behind VIBN_VERIFICATION_ENABLED.

This commit is contained in:
2026-06-10 19:43:36 -07:00
parent 46291becd3
commit 39cb9194a5
9 changed files with 1263 additions and 31 deletions

View File

@@ -28,6 +28,14 @@ import {
detectKnownError,
formatRecoveryMessage,
} from "@/lib/ai/error-recovery";
import {
executeTask,
runVerificationContract,
type ExecCtx,
type ExecuteTaskOutcome,
type ToolExecutor,
type VerificationTask,
} from "@/lib/ai/verification";
// --- Agent Orchestration Types & Constants ---
type TurnIntent =
@@ -668,6 +676,35 @@ function buildHealthStatus(opts: {
);
}
/**
 * Finds the most recent dev-server preview URL mentioned in tool results so
 * the verification layer can run console/route checks against the running app.
 *
 * Messages are scanned newest-first. For each string-content `tool` message
 * the lookup order is:
 *   1. a top-level `previewUrl` field of the JSON payload,
 *   2. a `previewUrl` field inside a JSON-encoded `stdout` field,
 *   3. the first `https://<name>.preview.vibnai.com` host found in the raw text.
 *
 * Blank `previewUrl` values are ignored so an empty field on a recent message
 * cannot mask a usable URL in the same or an older message.
 *
 * @param messages Conversation transcript, oldest first.
 * @returns The preview URL, or `undefined` when no tool result contains one.
 */
function extractPreviewUrl(messages: ChatMessage[]): string | undefined {
  const previewHost = /https:\/\/[a-z0-9-]+\.preview\.vibnai\.com/i;

  // Accept only non-blank strings as a URL candidate.
  const usableUrl = (value: unknown): string | undefined =>
    typeof value === "string" && value.trim() !== "" ? value : undefined;

  // Parse JSON and hand back the value only when it is a non-null object;
  // malformed JSON and JSON primitives both yield `undefined`.
  const parseRecord = (raw: string): Record<string, unknown> | undefined => {
    try {
      const parsed: unknown = JSON.parse(raw);
      return typeof parsed === "object" && parsed !== null
        ? (parsed as Record<string, unknown>)
        : undefined;
    } catch {
      return undefined;
    }
  };

  for (let i = messages.length - 1; i >= 0; i--) {
    const { role, content } = messages[i];
    if (role !== "tool" || typeof content !== "string") continue;
    // Cheap pre-filter; case-insensitive so it agrees with the host regex.
    if (!/preview/i.test(content)) continue;

    const payload = parseRecord(content);
    if (payload) {
      const direct = usableUrl(payload.previewUrl);
      if (direct) return direct;
      if (typeof payload.stdout === "string") {
        const nested = usableUrl(parseRecord(payload.stdout)?.previewUrl);
        if (nested) return nested;
      }
    }

    // Fall back to spotting the preview host anywhere in the raw text.
    const hostMatch = content.match(previewHost);
    if (hostMatch) return hostMatch[0];
  }
  return undefined;
}
export async function POST(request: Request) {
await ensureChatTables();
@@ -1067,6 +1104,11 @@ export async function POST(request: Request) {
let phase: AgentPhase = "recon";
let checkpointEmitted = false;
let verificationPassed = false;
// When C-08 forces a "Phase Checkpoint" before a mutation, the model's
// next reply is that internal planning block. We route it to the
// (hidden) thinking channel instead of showing the user a wall of
// Goal/Findings/Suspected-Cause text.
let suppressNextTextAsCheckpoint = false;
// ── Server-side conversational guard (C-03 enforcement) ───────────
// If the user's message looks conversational we withhold tools for
@@ -1100,6 +1142,62 @@ export async function POST(request: Request) {
let fileHashes = new Map<string, string>();
let stallRounds = 0;
/**
 * Compact corrective executor used by the verification fix-loop.
 *
 * Runs up to `n` model rounds (with the "execute"-phase tools when an MCP
 * token is available) to fix whatever verification flagged. Each round:
 *   1. calls the model with the shared `messages` transcript,
 *   2. streams any reply text to the client and records it as user-facing text,
 *   3. appends the assistant turn to `messages`,
 *   4. executes every requested tool call via `executeMcpTool` and appends
 *      one `tool` message per call.
 *
 * The loop ends early when the turn is aborted or when the model requests no
 * tools.
 *
 * A tool call that throws is converted into a JSON `{ error }` result instead
 * of propagating, so every assistant tool call pushed to `messages` still gets
 * a matching `tool` message and every `tool_start` event gets a `tool_result`.
 *
 * @param n Maximum number of model rounds to run.
 */
async function runFixRounds(n: number): Promise<void> {
  for (let round = 0; round < n; round++) {
    if (aborted) break;

    // Without an MCP token the model is called with no tools at all.
    const fixTools = activeMcpToken
      ? filterToolsForPhase(VIBN_TOOL_DEFINITIONS, "execute", turnIntent)
      : [];
    const reply = await callVibnChat({
      systemPrompt,
      messages,
      tools: fixTools,
      temperature: 0.4,
      includeThoughts: true,
    });

    if (reply.text) {
      assistantText += (assistantText ? "\n\n" : "") + reply.text;
      assistantTextSegments.push(reply.text);
      emit({ type: "text", text: reply.text });
    }
    messages.push({
      role: "assistant",
      content: reply.text,
      toolCalls: reply.toolCalls.length ? reply.toolCalls : undefined,
    });

    // No tool requests means the model considers the fix finished.
    if (!reply.toolCalls.length) break;

    for (const call of reply.toolCalls) {
      if (aborted) break;
      assistantToolCalls.push(call);
      emit({ type: "tool_start", name: call.name, args: call.args });

      let result: string;
      if (!activeMcpToken) {
        result = JSON.stringify({ error: "No MCP token" });
      } else {
        try {
          result = await executeMcpTool(
            call.name,
            call.args,
            activeMcpToken,
            baseUrl,
            activeProject?.id,
          );
        } catch (err: unknown) {
          // Report the failure to the model as a tool result rather than
          // leaving this tool call unanswered in the transcript.
          const reason = err instanceof Error ? err.message : String(err);
          console.error(
            `[Verification] fix-loop tool "${call.name}" threw:`,
            err,
          );
          result = JSON.stringify({
            error: `Tool execution failed: ${reason}`,
          });
        }
      }

      // The client only receives a truncated preview; the model sees it all.
      emit({
        type: "tool_result",
        name: call.name,
        result: result.slice(0, 500),
      });
      messages.push({
        role: "tool",
        content: result,
        toolCallId: call.id,
        toolName: call.name,
        thoughtSignature: call.thoughtSignature,
      });
    }
  }
}
emit({ type: "phase", phase, label: "Investigating & Planning" });
try {
@@ -1182,6 +1280,7 @@ export async function POST(request: Request) {
findings: "Evaluating...",
});
checkpointEmitted = true;
suppressNextTextAsCheckpoint = true;
phase = "execute";
emit({ type: "phase", phase, label: "Executing Code Edits" });
continue; // Skip tool execution and re-prompt
@@ -1202,8 +1301,14 @@ export async function POST(request: Request) {
return;
}
// Stream user-facing text to client
if (resp.text) {
// Stream user-facing text to client.
// If this round's text is the forced Phase Checkpoint, route it to
// the hidden thinking channel and DON'T add it to the user-facing
// message (so it never shows live or in the persisted thread).
if (resp.text && suppressNextTextAsCheckpoint) {
emit({ type: "thinking", text: resp.text });
suppressNextTextAsCheckpoint = false;
} else if (resp.text) {
assistantText += (assistantText ? "\n\n" : "") + resp.text;
assistantTextSegments.push(resp.text);
emit({ type: "text", text: resp.text });
@@ -1420,6 +1525,82 @@ export async function POST(request: Request) {
emit({ type: "aborted" });
}
// ── Acceptance verification + corrective fix-loop (flag-gated) ──
// After a turn that mutated code, run the verification contract
// (baseline: build + server_up + console_clean). If it fails, feed the
// concrete failures back and let the model fix — iterating until green,
// stuck, or out of attempts. Off by default; enable per-environment
// with VIBN_VERIFICATION_ENABLED=1 for the live smoke test.
// Stays null when verification is skipped or when executeTask throws.
let verificationOutcome: ExecuteTaskOutcome | null = null;
// Tool names that count as a mutation; a turn that called none of these is
// never verified.
const MUTATION_TOOLS = [
"fs_write",
"fs_edit",
"fs_delete",
"apps_deploy",
"ship",
];
const mutated = assistantToolCalls.some((tc) =>
MUTATION_TOOLS.includes(tc.name),
);
// Verify only when the flag is exactly "1", the turn was not aborted, at
// least one mutation tool was called, and both a project id and an MCP token
// are available.
if (
process.env.VIBN_VERIFICATION_ENABLED === "1" &&
!aborted &&
mutated &&
activeProject?.id &&
activeMcpToken
) {
emit({ type: "phase", phase: "verify", label: "Verifying & fixing" });
// May be undefined when no tool result mentioned a preview URL.
const previewUrl = extractPreviewUrl(messages);
// Adapter that lets the verification layer run MCP tools against this
// project with the current token.
const verifyExec: ToolExecutor = async (name, args) =>
executeMcpTool(
name,
args,
activeMcpToken,
baseUrl,
activeProject!.id,
);
// No task-specific acceptance checks are supplied (empty list).
// NOTE(review): presumably runVerificationContract then applies the baseline
// checks named above — confirm in @/lib/ai/verification.
const vTask: VerificationTask = {
id: thread_id,
title: message,
status: "in_progress",
acceptanceChecks: [],
attempts: 0,
};
const verifyCtx: ExecCtx = {
projectId: activeProject.id,
previewUrl,
exec: verifyExec,
};
try {
verificationOutcome = await executeTask(vTask, {
maxAttempts: 3,
runExecution: async ({ failureFeedback, attempt }) => {
// Attempt 1 = verify what the main loop already produced.
if (attempt === 1 && !failureFeedback) return;
// Failure feedback is appended as a user message so the next model round
// sees it; each attempt then gets up to two corrective rounds.
if (failureFeedback)
messages.push({ role: "user", content: failureFeedback });
await runFixRounds(2);
},
verify: async () => runVerificationContract(vTask, verifyCtx),
});
} catch (e) {
// A thrown error is only logged; verificationOutcome keeps its initial null,
// so the "blocked" note below is not emitted.
console.error("[Verification] errored:", e);
}
// If verification couldn't reach green, surface the specific failing
// checks as an honest status (and let the summary reflect reality).
if (verificationOutcome?.status === "blocked") {
// One bullet per failing check: "<description>: <evidence>".
const checkLines = verificationOutcome.failures
.map((f) => `- ${f.check.description}: ${f.evidence}`)
.join("\n");
const note =
`I made the changes but verification didn't fully pass:\n${checkLines}\n` +
`That's the honest state — want me to keep working these specific issues?`;
// The note is recorded as user-facing text and streamed to the client.
assistantText += (assistantText ? "\n\n" : "") + note;
assistantTextSegments.push(note);
emit({ type: "text", text: note });
}
}
// If the loop ended with the user staring at a tool tray and no
// narrative — whether because we hit MAX_TOOL_ROUNDS, broke a
// detected loop, or the model voluntarily stopped emitting tools
@@ -1492,6 +1673,36 @@ export async function POST(request: Request) {
assistantTextSegments.push(fallback);
emit({ type: "text", text: fallback });
}
} else if (!aborted && anyToolsExecuted) {
// Successful tool-using turn — guarantee it ENDS with a clean, human
// summary. We only force one when the model didn't already close with a
// substantive sentence, so we never pay for a redundant double-summary.
const lastSeg = (
  assistantTextSegments[assistantTextSegments.length - 1] || ""
).trim();
// Heuristic for "already closed with a sentence": the last text segment is
// at least 40 characters long and ends in sentence/closing punctuation.
const alreadySummarized =
  lastSeg.length >= 40 && /[.!?)\]]$/.test(lastSeg);
if (!alreadySummarized) {
  try {
    // One extra tool-less model call whose only job is the closing summary.
    const finalSummary = await callVibnChat({
      systemPrompt:
        systemPrompt +
        `\n\n[FINAL SUMMARY] The work for this turn is finished. In 1-3 short, plain sentences, tell the user: (a) what you changed or accomplished, (b) the specific result they can see right now (a preview URL, a file, a value), and (c) the single best next step. No headings, no bullet lists, no internal jargon, and do NOT call any tools.`,
      messages,
      tools: [],
      temperature: 0.3,
    });
    const summaryText = finalSummary.text;
    // Whitespace-only replies are dropped rather than shown to the user.
    if (summaryText && summaryText.trim()) {
      assistantText += (assistantText ? "\n\n" : "") + summaryText;
      assistantTextSegments.push(summaryText);
      emit({ type: "text", text: summaryText });
    }
  } catch (err: unknown) {
    // Best-effort: the model's own final text remains as the ending. Log the
    // failure so a broken summary call is visible instead of vanishing.
    console.error("[FinalSummary] errored:", err);
  }
}
}
// Last-resort guard: the model produced NO user-facing text and NO