feat(verification): acceptance-check layer + executor fix-loop; hide phase-checkpoint walls; guaranteed turn-end summary. Verification gated behind VIBN_VERIFICATION_ENABLED.
This commit is contained in:
@@ -28,6 +28,14 @@ import {
|
||||
detectKnownError,
|
||||
formatRecoveryMessage,
|
||||
} from "@/lib/ai/error-recovery";
|
||||
import {
|
||||
executeTask,
|
||||
runVerificationContract,
|
||||
type ExecCtx,
|
||||
type ExecuteTaskOutcome,
|
||||
type ToolExecutor,
|
||||
type VerificationTask,
|
||||
} from "@/lib/ai/verification";
|
||||
|
||||
// --- Agent Orchestration Types & Constants ---
|
||||
type TurnIntent =
|
||||
@@ -668,6 +676,35 @@ function buildHealthStatus(opts: {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Finds the most recent dev-server preview URL mentioned in tool results so
 * the verification layer can run console/route checks against the running app.
 *
 * Messages are scanned newest-first. Only messages with `role === "tool"`
 * whose `content` is a string containing "preview" are inspected. For each
 * such message the lookup order is:
 *   1. `content` parsed as JSON: a top-level string `previewUrl`;
 *   2. a string `stdout` field parsed as JSON: its string `previewUrl`;
 *   3. the first `https://<label>.preview.vibnai.com` match in the raw text.
 *
 * A blank (empty or whitespace-only) `previewUrl` is treated as absent, so it
 * cannot hide a usable URL elsewhere in the same message or in an older one.
 *
 * @param messages Conversation history to scan.
 * @returns The first URL found by the rules above, or `undefined` if none.
 */
function extractPreviewUrl(messages: ChatMessage[]): string | undefined {
  // Hoisted so the pattern is not rebuilt for every message.
  const previewHostPattern = /https:\/\/[a-z0-9-]+\.preview\.vibnai\.com/i;

  // Parses `raw` as JSON and returns it only when it is a non-null object;
  // malformed JSON and JSON primitives (string, number, null, ...) yield undefined.
  const parseJsonObject = (
    raw: string,
  ): Record<string, unknown> | undefined => {
    try {
      const parsed: unknown = JSON.parse(raw);
      return typeof parsed === "object" && parsed !== null
        ? (parsed as Record<string, unknown>)
        : undefined;
    } catch {
      return undefined;
    }
  };

  // Accepts only strings with visible characters; the value is returned as-is.
  const nonBlankString = (value: unknown): string | undefined =>
    typeof value === "string" && value.trim() !== "" ? value : undefined;

  for (let i = messages.length - 1; i >= 0; i--) {
    const m = messages[i];
    if (!m || m.role !== "tool" || typeof m.content !== "string") continue;
    const content = m.content;
    // Cheap pre-filter: both "previewUrl" keys and preview hostnames contain it.
    if (!content.includes("preview")) continue;

    const outer = parseJsonObject(content);
    if (outer) {
      const direct = nonBlankString(outer.previewUrl);
      if (direct !== undefined) return direct;

      // Some tool results wrap a JSON payload inside a `stdout` string.
      if (typeof outer.stdout === "string") {
        const nested = nonBlankString(
          parseJsonObject(outer.stdout)?.previewUrl,
        );
        if (nested !== undefined) return nested;
      }
    }

    // Fall back to spotting a preview hostname anywhere in the raw text.
    const match = previewHostPattern.exec(content);
    if (match) return match[0];
  }
  return undefined;
}
|
||||
|
||||
export async function POST(request: Request) {
|
||||
await ensureChatTables();
|
||||
|
||||
@@ -1067,6 +1104,11 @@ export async function POST(request: Request) {
|
||||
let phase: AgentPhase = "recon";
|
||||
let checkpointEmitted = false;
|
||||
let verificationPassed = false;
|
||||
// When C-08 forces a "Phase Checkpoint" before a mutation, the model's
|
||||
// next reply is that internal planning block. We route it to the
|
||||
// (hidden) thinking channel instead of showing the user a wall of
|
||||
// Goal/Findings/Suspected-Cause text.
|
||||
let suppressNextTextAsCheckpoint = false;
|
||||
|
||||
// ── Server-side conversational guard (C-03 enforcement) ───────────
|
||||
// If the user's message looks conversational we withhold tools for
|
||||
@@ -1100,6 +1142,62 @@ export async function POST(request: Request) {
|
||||
let fileHashes = new Map<string, string>();
|
||||
let stallRounds = 0;
|
||||
|
||||
// Corrective executor for the verification fix-loop. Performs at most `n`
// model rounds so the model can address whatever verification flagged,
// reusing the same tool-execution path as the main loop.
//
// Each round: call the model, surface any text to the client and the
// accumulated assistant text, record the assistant message, then execute the
// requested tool calls in order and append their results to `messages`.
// The loop stops early when the turn is aborted or when a round requests no
// tool calls.
async function runFixRounds(n: number) {
  for (let round = 0; round < n && !aborted; round++) {
    // Tools are only offered when an MCP token exists to execute them with.
    const toolsForRound = !activeMcpToken
      ? []
      : filterToolsForPhase(VIBN_TOOL_DEFINITIONS, "execute", turnIntent);

    const reply = await callVibnChat({
      systemPrompt,
      messages,
      tools: toolsForRound,
      temperature: 0.4,
      includeThoughts: true,
    });

    if (reply.text) {
      // Segments are joined with a blank line in the accumulated text.
      const separator = assistantText ? "\n\n" : "";
      assistantText = assistantText + separator + reply.text;
      assistantTextSegments.push(reply.text);
      emit({ type: "text", text: reply.text });
    }

    const requestedCalls = reply.toolCalls;
    const hasCalls = requestedCalls.length > 0;
    messages.push({
      role: "assistant",
      content: reply.text,
      toolCalls: hasCalls ? requestedCalls : undefined,
    });

    // No tool calls requested: nothing left for this fix pass to execute.
    if (!hasCalls) break;

    for (const call of requestedCalls) {
      if (aborted) break;

      assistantToolCalls.push(call);
      emit({ type: "tool_start", name: call.name, args: call.args });

      const output = !activeMcpToken
        ? JSON.stringify({ error: "No MCP token" })
        : await executeMcpTool(
            call.name,
            call.args,
            activeMcpToken,
            baseUrl,
            activeProject?.id,
          );

      // The client only receives a 500-character preview of the result; the
      // full output goes into the conversation for the model.
      emit({
        type: "tool_result",
        name: call.name,
        result: output.slice(0, 500),
      });
      messages.push({
        role: "tool",
        content: output,
        toolCallId: call.id,
        toolName: call.name,
        thoughtSignature: call.thoughtSignature,
      });
    }
  }
}
|
||||
|
||||
emit({ type: "phase", phase, label: "Investigating & Planning" });
|
||||
|
||||
try {
|
||||
@@ -1182,6 +1280,7 @@ export async function POST(request: Request) {
|
||||
findings: "Evaluating...",
|
||||
});
|
||||
checkpointEmitted = true;
|
||||
suppressNextTextAsCheckpoint = true;
|
||||
phase = "execute";
|
||||
emit({ type: "phase", phase, label: "Executing Code Edits" });
|
||||
continue; // Skip tool execution and re-prompt
|
||||
@@ -1202,8 +1301,14 @@ export async function POST(request: Request) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Stream user-facing text to client
|
||||
if (resp.text) {
|
||||
// Stream user-facing text to client.
|
||||
// If this round's text is the forced Phase Checkpoint, route it to
|
||||
// the hidden thinking channel and DON'T add it to the user-facing
|
||||
// message (so it never shows live or in the persisted thread).
|
||||
if (resp.text && suppressNextTextAsCheckpoint) {
|
||||
emit({ type: "thinking", text: resp.text });
|
||||
suppressNextTextAsCheckpoint = false;
|
||||
} else if (resp.text) {
|
||||
assistantText += (assistantText ? "\n\n" : "") + resp.text;
|
||||
assistantTextSegments.push(resp.text);
|
||||
emit({ type: "text", text: resp.text });
|
||||
@@ -1420,6 +1525,82 @@ export async function POST(request: Request) {
|
||||
emit({ type: "aborted" });
|
||||
}
|
||||
|
||||
// ── Acceptance verification + corrective fix-loop (flag-gated) ──
|
||||
// After a turn that mutated code, run the verification contract
|
||||
// (baseline: build + server_up + console_clean). If it fails, feed the
|
||||
// concrete failures back and let the model fix — iterating until green,
|
||||
// stuck, or out of attempts. Off by default; enable per-environment
|
||||
// with VIBN_VERIFICATION_ENABLED=1 for the live smoke test.
|
||||
let verificationOutcome: ExecuteTaskOutcome | null = null;
|
||||
const MUTATION_TOOLS = [
|
||||
"fs_write",
|
||||
"fs_edit",
|
||||
"fs_delete",
|
||||
"apps_deploy",
|
||||
"ship",
|
||||
];
|
||||
const mutated = assistantToolCalls.some((tc) =>
|
||||
MUTATION_TOOLS.includes(tc.name),
|
||||
);
|
||||
if (
|
||||
process.env.VIBN_VERIFICATION_ENABLED === "1" &&
|
||||
!aborted &&
|
||||
mutated &&
|
||||
activeProject?.id &&
|
||||
activeMcpToken
|
||||
) {
|
||||
emit({ type: "phase", phase: "verify", label: "Verifying & fixing" });
|
||||
const previewUrl = extractPreviewUrl(messages);
|
||||
const verifyExec: ToolExecutor = async (name, args) =>
|
||||
executeMcpTool(
|
||||
name,
|
||||
args,
|
||||
activeMcpToken,
|
||||
baseUrl,
|
||||
activeProject!.id,
|
||||
);
|
||||
const vTask: VerificationTask = {
|
||||
id: thread_id,
|
||||
title: message,
|
||||
status: "in_progress",
|
||||
acceptanceChecks: [],
|
||||
attempts: 0,
|
||||
};
|
||||
const verifyCtx: ExecCtx = {
|
||||
projectId: activeProject.id,
|
||||
previewUrl,
|
||||
exec: verifyExec,
|
||||
};
|
||||
try {
|
||||
verificationOutcome = await executeTask(vTask, {
|
||||
maxAttempts: 3,
|
||||
runExecution: async ({ failureFeedback, attempt }) => {
|
||||
// Attempt 1 = verify what the main loop already produced.
|
||||
if (attempt === 1 && !failureFeedback) return;
|
||||
if (failureFeedback)
|
||||
messages.push({ role: "user", content: failureFeedback });
|
||||
await runFixRounds(2);
|
||||
},
|
||||
verify: async () => runVerificationContract(vTask, verifyCtx),
|
||||
});
|
||||
} catch (e) {
|
||||
console.error("[Verification] errored:", e);
|
||||
}
|
||||
// If verification couldn't reach green, surface the specific failing
|
||||
// checks as an honest status (and let the summary reflect reality).
|
||||
if (verificationOutcome?.status === "blocked") {
|
||||
const checkLines = verificationOutcome.failures
|
||||
.map((f) => `- ${f.check.description}: ${f.evidence}`)
|
||||
.join("\n");
|
||||
const note =
|
||||
`I made the changes but verification didn't fully pass:\n${checkLines}\n` +
|
||||
`That's the honest state — want me to keep working these specific issues?`;
|
||||
assistantText += (assistantText ? "\n\n" : "") + note;
|
||||
assistantTextSegments.push(note);
|
||||
emit({ type: "text", text: note });
|
||||
}
|
||||
}
|
||||
|
||||
// If the loop ended with the user staring at a tool tray and no
|
||||
// narrative — whether because we hit MAX_TOOL_ROUNDS, broke a
|
||||
// detected loop, or the model voluntarily stopped emitting tools
|
||||
@@ -1492,6 +1673,36 @@ export async function POST(request: Request) {
|
||||
assistantTextSegments.push(fallback);
|
||||
emit({ type: "text", text: fallback });
|
||||
}
|
||||
} else if (!aborted && anyToolsExecuted) {
|
||||
// Successful tool-using turn — guarantee it ENDS with a clean,
|
||||
// human summary. We only force one when the model didn't already
|
||||
// close with a substantive sentence, so we never pay for a
|
||||
// redundant double-summary.
|
||||
const lastSeg = (
|
||||
assistantTextSegments[assistantTextSegments.length - 1] || ""
|
||||
).trim();
|
||||
const alreadySummarized =
|
||||
lastSeg.length >= 40 && /[.!?)\]]$/.test(lastSeg);
|
||||
if (!alreadySummarized) {
|
||||
try {
|
||||
const finalSummary = await callVibnChat({
|
||||
systemPrompt:
|
||||
systemPrompt +
|
||||
`\n\n[FINAL SUMMARY] The work for this turn is finished. In 1–3 short, plain sentences, tell the user: (a) what you changed or accomplished, (b) the specific result they can see right now (a preview URL, a file, a value), and (c) the single best next step. No headings, no bullet lists, no internal jargon, and do NOT call any tools.`,
|
||||
messages,
|
||||
tools: [],
|
||||
temperature: 0.3,
|
||||
});
|
||||
if (finalSummary.text && finalSummary.text.trim()) {
|
||||
assistantText +=
|
||||
(assistantText ? "\n\n" : "") + finalSummary.text;
|
||||
assistantTextSegments.push(finalSummary.text);
|
||||
emit({ type: "text", text: finalSummary.text });
|
||||
}
|
||||
} catch {
|
||||
// Best-effort: the model's own final text remains as the ending.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Last-resort guard: the model produced NO user-facing text and NO
|
||||
|
||||
Reference in New Issue
Block a user