chore(telemetry): refactor stall detector to track real state progress and persist non-null verify signatures across edit rounds

This commit is contained in:
2026-06-09 13:36:30 -07:00
parent 6ec312f716
commit de1209afe4

View File

@@ -792,50 +792,10 @@ export async function POST(request: Request) {
// detection. The model has a strong tendency to grind through a
// dozen+ tool calls in total silence (the user just sees ✓ pills
// pile up); both safeguards below break that pattern.
const toolFingerprints: string[] = [];
let roundsSinceText = 0;
let toolCallsSinceText = 0;
let loopBreakReason: string | null = null;
/**
 * Collapses a tool call into a coarse fingerprint string so that repeated
 * calls can be counted even when their arguments differ in minor ways.
 *
 * Fingerprint shapes:
 * - `shell_exec`                      → `shell_exec:<verb>` (first non-`cd` command word)
 * - `fs_write` / `fs_edit` / `fs_read` → `<name>:<path>`
 * - `dev_server_*`                    → `dev_server:<port>` (all four tools share one bucket)
 * - `apps_get|logs|deploy|unstick`    → `<name>:<uuid>`
 * - anything else                     → `<name>:<first 80 chars of JSON args>`
 *
 * @param tc The tool call to fingerprint; only `name` and `args` are read.
 * @returns The fingerprint string.
 */
function fingerprintToolCall(tc: ToolCall): string {
  const args = tc.args;

  switch (tc.name) {
    case "shell_exec": {
      const command = String(args?.command ?? "").trim();
      // First non-empty, non-cd segment of an `a && b && c` chain
      // (pkill, npm, curl, etc.).
      const firstRealSegment = command
        .split("&&")
        .map((segment: string) => segment.trim())
        .find(
          (segment: string) => segment.length > 0 && !segment.startsWith("cd "),
        );
      // `||` rather than `??`: an empty verb must also fall back to "shell",
      // otherwise an empty command or a trailing `&&` yields `shell_exec:`.
      const verb = firstRealSegment?.split(/\s+/)[0] || "shell";
      return `shell_exec:${verb}`;
    }

    case "fs_write":
    case "fs_edit":
    case "fs_read":
      return `${tc.name}:${args?.path ?? ""}`;

    // All dev-server tools for one port share a fingerprint, so a
    // start → logs → stop cycle registers as repetition.
    case "dev_server_start":
    case "dev_server_stop":
    case "dev_server_logs":
    case "dev_server_list":
      return `dev_server:${args?.port ?? "?"}`;

    case "apps_get":
    case "apps_logs":
    case "apps_deploy":
    case "apps_unstick":
      return `${tc.name}:${args?.uuid ?? ""}`;

    default: {
      // Unknown tool: use a bounded prefix of the serialized arguments.
      const argSig = JSON.stringify(args ?? {}).slice(0, 80);
      return `${tc.name}:${argSig}`;
    }
  }
}
// ── Server-side conversational guard (C-03 enforcement) ───────────
// If the user's message looks conversational we withhold tools for
// round 1. The model MUST respond in text first. If its reply then
@@ -861,7 +821,7 @@ export async function POST(request: Request) {
isConversational(message.trim());
let lastVerifySig: string | null = null;
let lastRoundToolSig: string | null = null;
let fileHashes = new Map<string, string>();
let stallRounds = 0;
try {
@@ -952,39 +912,6 @@ export async function POST(request: Request) {
if (!resp.toolCalls.length) break;
if (aborted) break;
// Loop detection. If the model fires the same tool with the
// same first-key arg 3+ times in this turn, the user is
// watching it spin. Bail out, hand control back to the user
// with the last tool result as context. The classic case:
// dev_server.start → logs → stop → start → logs → stop → ...
for (const tc of resp.toolCalls) {
toolFingerprints.push(fingerprintToolCall(tc));
}
// Sliding window of 10 (was 8)
const window = toolFingerprints.slice(-10);
const counts = new Map<string, number>();
for (const fp of window) counts.set(fp, (counts.get(fp) ?? 0) + 1);
// Find highest repeating tool call
let maxRepeats = 0;
let repeatedCmd = "";
for (const [fp, n] of counts.entries()) {
if (n > maxRepeats) {
maxRepeats = n;
repeatedCmd = fp.split("|")[0];
}
}
// Warn once at 4 identical fingerprints; hard-break at 6
if (maxRepeats === 4) {
extraSystem += `\n\n[WARNING] You have called ${repeatedCmd} four times recently. Try a different approach or surface what's blocking you to the user.`;
}
if (maxRepeats >= 6) {
loopBreakReason = `Repeated ${repeatedCmd} ${maxRepeats}× in last 10 calls`;
}
// Removed consecutive tool call hard-break logic because it interrupts valid long tool chains.
// Execute tool calls and add results. OpenAI-compatible APIs
// (DeepSeek, etc.) require every tool_call_id to be answered with
// a tool message before any user/assistant message — so recovery
@@ -1069,24 +996,38 @@ export async function POST(request: Request) {
// 1. Compute verify signature
const verifySig = getRoundVerifySignature(currentRoundResults);
// 2. Check for stall/progress by comparing tool call signatures (names + inputs)
const currentRoundToolSig = resp.toolCalls
.map((tc) => `${tc.name}:${JSON.stringify(tc.args || {})}`)
.sort()
.join(";;");
// 2. Check for actual state progress (did files change, did a plan update, did a mutating tool succeed, or did the error set change?)
const { progressed, nextHashes } = checkRoundProgress(
currentRoundResults,
fileHashes,
verifySig,
lastVerifySig,
);
fileHashes = nextHashes;
const progressed = !lastVerifySig || verifySig !== lastVerifySig;
const ranVerification = currentRoundResults.some((r) =>
[
"browser_console",
"shell_exec",
"dev_server_start",
"browser.console",
"dev.server.start",
].includes(r.toolName),
);
if (
verifySig &&
lastVerifySig &&
verifySig === lastVerifySig &&
!progressed
) {
loopBreakReason = `Blocked on persistent error: ${verifySig.split(";;")[0]}`;
if (ranVerification) {
if (verifySig) {
if (lastVerifySig && verifySig === lastVerifySig && !progressed) {
loopBreakReason = `Blocked on persistent error: ${verifySig.split(";;")[0]}`;
}
lastVerifySig = verifySig;
} else {
// Successfully compiled cleanly! Clear the active error memory
lastVerifySig = null;
}
}
if (lastRoundToolSig && currentRoundToolSig === lastRoundToolSig) {
if (!progressed) {
stallRounds++;
} else {
stallRounds = 0;
@@ -1094,12 +1035,9 @@ export async function POST(request: Request) {
if (stallRounds >= 2) {
loopBreakReason =
"Stalled (Repeated the exact same tool calls twice without advancing)";
"Stalled (No file state progress or diagnostic advancement made for 2 rounds)";
}
lastVerifySig = verifySig;
lastRoundToolSig = currentRoundToolSig;
if (loopBreakReason) break;
}
@@ -1454,35 +1392,34 @@ function getRoundVerifySignature(roundResults: any[]): string | null {
tr.toolName === "browser.console"
) {
if (
parsed.errors &&
Array.isArray(parsed.errors) &&
parsed.errors.length > 0
result.errors &&
Array.isArray(result.errors) &&
result.errors.length > 0
) {
// Normalize: replace preview subdomain hashes and port numbers to keep signature stable
const cleanErrors = parsed.errors.map((e: string) =>
e
.replace(/preview-\d+-\w+-\d+/g, "preview-X-url")
.replace(/localhost:\d+/g, "localhost:PORT")
.replace(/\d+/g, "N"),
const cleanErrors = result.errors.map((e: string) =>
normalizeError(e),
);
errors.push(`browser_console_errors:${cleanErrors.join("|")}`);
}
if (parsed.ok === false && parsed.error) {
errors.push(`browser_console_fail:${parsed.error}`);
if (result.ok === false && result.error) {
errors.push(`browser_console_fail:${normalizeError(result.error)}`);
}
}
// 2. Check shell_exec failures
if (tr.toolName === "shell_exec") {
if (parsed.code !== 0 && parsed.code !== undefined) {
const stderrLine = (parsed.stderr || parsed.stdout || "error")
if (result.code !== 0 && result.code !== undefined) {
const stderrLine = (result.stderr || result.stdout || "error")
.split("\n")[0]
.trim()
.substring(0, 100);
errors.push(`shell_exec_fail:${parsed.code}:${stderrLine}`);
errors.push(
`shell_exec_fail:${result.code}:${normalizeError(stderrLine)}`,
);
}
if (parsed.ok === false && parsed.error) {
errors.push(`shell_exec_error:${parsed.error}`);
if (result.ok === false && result.error) {
errors.push(`shell_exec_error:${normalizeError(result.error)}`);
}
}
@@ -1491,11 +1428,11 @@ function getRoundVerifySignature(roundResults: any[]): string | null {
tr.toolName === "dev_server_start" ||
tr.toolName === "dev.server.start"
) {
if (parsed.healthCheck && parsed.healthCheck.status >= 400) {
errors.push(`dev_server_unhealthy:${parsed.healthCheck.status}`);
if (result.healthCheck && result.healthCheck.status >= 400) {
errors.push(`dev_server_unhealthy:${result.healthCheck.status}`);
}
if (parsed.ok === false && parsed.error) {
errors.push(`dev_server_fail:${parsed.error}`);
if (result.ok === false && result.error) {
errors.push(`dev_server_fail:${normalizeError(result.error)}`);
}
}
@@ -1506,9 +1443,9 @@ function getRoundVerifySignature(roundResults: any[]): string | null {
tr.toolName === "fs.edit" ||
tr.toolName === "fs.write"
) {
if (parsed.ok === false || parsed.error) {
if (result.ok === false || result.error) {
errors.push(
`file_op_failed:${tr.toolName}:${parsed.error || parsed.stderr || "error"}`,
`file_op_failed:${tr.toolName}:${normalizeError(result.error || result.stderr || "error")}`,
);
}
}
@@ -1520,3 +1457,64 @@ function getRoundVerifySignature(roundResults: any[]): string | null {
if (errors.length === 0) return null;
return errors.sort().join(";;");
}
/**
 * Strips run-specific noise from an error message so that the same
 * underlying failure produces the same text across rounds.
 *
 * Replacements performed:
 * - `preview-<digits>-<word>-<hex>` host labels → `preview-X`
 * - `localhost:<port>`                          → `localhost:PORT`
 * - ISO-8601 UTC timestamps ending in `Z`       → `TIMESTAMP`
 *
 * Surrounding whitespace is trimmed. Other digits (status codes, line
 * numbers) are left untouched.
 *
 * @param error The error to normalize. Usually a string; non-string values
 *   (tool results come from parsed JSON and may carry objects or numbers)
 *   are converted to text first instead of throwing on `.replace`.
 * @returns The normalized message.
 */
function normalizeError(error: unknown): string {
  const text =
    typeof error === "string"
      ? error
      : error instanceof Error
        ? error.message
        : // JSON.stringify returns undefined for `undefined`/functions.
          (JSON.stringify(error) ?? String(error));

  return text
    .replace(/preview-\d+-\w+-[0-9a-f]+/g, "preview-X")
    .replace(/localhost:\d+/g, "localhost:PORT")
    .replace(/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z/g, "TIMESTAMP")
    .trim();
}
/**
 * Decides whether a round of tool calls advanced the session's state.
 *
 * A round counts as progress when any of the following holds:
 *  A. the verify signature differs from the previous one;
 *  B. a successful result reports a `path` + `sha256` whose hash differs
 *     from the hash recorded for that path in `lastHashes`;
 *  C. a successful result comes from a tool that is not on the read-only
 *     list and does not carry a file hash.
 *
 * Results carrying a file hash are judged by B alone: rewriting a file to
 * the exact content it already had is not progress, even though the tool
 * call itself succeeded.
 *
 * NOTE(review): for check A a `null` signature compared with a retained
 * non-null `lastVerifySig` is also a "difference" — confirm callers intend
 * rounds without verification to count as progress.
 *
 * @param roundResults Tool results for the round; each entry is read for
 *   `toolName` and a JSON string `content`. Entries whose content is empty
 *   or not valid JSON are ignored.
 * @param lastHashes Path → sha256 map from earlier rounds. Not mutated.
 * @param verifySig Error signature computed for this round, or `null`.
 * @param lastVerifySig Error signature carried over from earlier rounds.
 * @returns `progressed`, plus `nextHashes`: a copy of `lastHashes` updated
 *   with the latest hash reported for every path seen this round.
 */
function checkRoundProgress(
  roundResults: any[],
  lastHashes: Map<string, string>,
  verifySig: string | null,
  lastVerifySig: string | null,
): { progressed: boolean; nextHashes: Map<string, string> } {
  // Tools that only inspect state. Names are compared after mapping the
  // dotted spelling ("fs.read") onto the underscore one ("fs_read").
  const readOnlyTools = new Set([
    "fs_read",
    "fs_list",
    "fs_tree",
    "fs_glob",
    "fs_grep",
    "dev_server_list",
    "browser_console",
  ]);

  // A. Progress check: did the error signature change?
  let progressed = verifySig !== lastVerifySig;
  const nextHashes = new Map(lastHashes);

  for (const tr of roundResults) {
    if (!tr?.content) continue;

    let result: any;
    try {
      const parsed = JSON.parse(tr.content);
      // Some tools wrap their payload in `{ result: ... }`.
      result = parsed?.result || parsed;
    } catch {
      // Content is not JSON; it carries no structured progress signal.
      continue;
    }
    if (!result || !result.ok) continue;

    // B. Progress check: did a file operation produce a new/changed sha256?
    if (result.sha256 && result.path) {
      if (lastHashes.get(result.path) !== result.sha256) {
        progressed = true;
      }
      // Always record the latest hash, so a change that is reverted within
      // the same round does not leave a stale intermediate hash behind.
      nextHashes.set(result.path, result.sha256);
      continue;
    }

    // C. Progress check: did any mutating/deploying tool succeed?
    const toolName = String(tr.toolName ?? "").replace(/\./g, "_");
    if (!readOnlyTools.has(toolName)) {
      progressed = true;
    }
  }

  return { progressed, nextHashes };
}