diff --git a/suggested-fixes.md b/suggested-fixes.md new file mode 100644 index 0000000..f021e22 --- /dev/null +++ b/suggested-fixes.md @@ -0,0 +1,718 @@ +# Vibn AI Chat Route — Fix Instructions + +This document specifies fixes to the `POST /api/chat` route and the tools it +orchestrates. The goal is to eliminate the "hallucinated tool calls" failure +mode where the assistant confidently claims to have edited files, restarted +servers, and applied design changes — but in reality wrote to a ghost path, +left a zombie process, or never invoked the tool at all. + +Each fix is self-contained, numbered, and ends with verification steps. Apply +in order. Fixes 1 and 2 are the root causes; the rest are guardrails that +only become reliable after 1 and 2 land. + +--- + +## Fix 1 — Make `dev_server_start` and `fs_*` tools project-path-aware + +### Problem + +The system prompt instructs the model to `cd &&` before every dev +server command and to write files under `/workspace//`. Nothing +enforces this. When the model forgets — or hallucinates that it +remembered — `fs_write` silently creates a file at `/workspace/src/app/page.tsx` +while the running app lives at `/workspace//src/app/page.tsx`. The +file exists; it just has zero relationship to the server. + +### Required changes + +In the tool implementations (likely in `@/lib/ai/vibn-tools` or wherever +`executeMcpTool` dispatches): + +1. **Add a `projectSlug` parameter to the tool execution context.** Plumb + `activeProject.slug` from the chat route into `executeMcpTool`. It is + already passed `activeProject?.id`; add slug alongside. + +2. **For `fs_read`, `fs_write`, `fs_edit`, `fs_glob`, `fs_grep`, `fs_list`, + `fs_delete`:** when `projectSlug` is set on the execution context, resolve + the `path` argument as follows: + - If `path` starts with `/workspace//`, accept as-is. + - If `path` is relative (no leading slash), resolve relative to + `/workspace//`. + - If `path` starts with `/workspace/` but **not** `/workspace//`, + **reject the call** with an error: + `PATH_OUTSIDE_PROJECT: refusing to write to '' because it is + outside the active project at '/workspace//'. Did you mean + '/workspace//'?` + - If `path` starts with any other absolute path, reject the same way. + +3. **For `dev_server_start`:** when `projectSlug` is set on the execution + context, automatically prepend `cd /workspace// && ` to the command + if the command does not already start with `cd /workspace/` or + `cd `. The model's prompt-level instruction to `cd` first becomes + belt-and-suspenders; the tool guarantees the working directory. + +4. **For `shell_exec`:** do NOT auto-prepend `cd` (the model legitimately runs + commands at `/workspace` root for git ops on multiple repos). Instead, + when `projectSlug` is set and the command does not contain `cd ` or + `cd /workspace/` anywhere, append a one-time warning to the tool + result: `[NOTE] You ran a shell command without cd'ing into + /workspace/. If you intended to operate on the active project, the + next call should start with 'cd && '.` Do not block — just inform. + +### Verification + +- Call `fs_write { path: '/workspace/src/app/page.tsx', content: 'x' }` from + a project-scoped thread. Expect `PATH_OUTSIDE_PROJECT` error. +- Call `fs_write { path: 'src/app/page.tsx', content: 'x' }` from a + project-scoped thread. Expect success at `/workspace//src/app/page.tsx`. +- Call `dev_server_start { command: 'npm run dev', port: 3000 }` from a + project-scoped thread. Expect the underlying exec to run from + `/workspace//`, not `/workspace/`. + +--- + +## Fix 2 — Add verification data to mutation tool results + +### Problem + +The model claims "I just rewrote `page.tsx`" with no way for itself or the +harness to verify it. The next turn, the model sees its own confident claim +in history and treats it as fact. Hallucination compounds across turns. + +### Required changes + +Edit the return shape of these tools (likely in `@/lib/ai/vibn-tools`): + +1. **`fs_write` and `fs_edit`** must return a JSON object including: + ```json + { + "ok": true, + "path": "/workspace//src/app/page.tsx", + "bytes": 4821, + "sha256": "a3f5...", + "mtime": "2026-05-16T18:42:11Z", + "preview": "first 200 chars of the new file content" + } + ``` + On failure, return `{ "ok": false, "error": "..." }` with no `sha256`. + +2. **`dev_server_start`** must, on successful launch, perform a server-side + `curl -sS -o - -w '\n[status=%{http_code}]' http://localhost:/` + with a 5s timeout, and include the result in the tool response: + ```json + { + "ok": true, + "previewUrl": "https://...", + "port": 3000, + "healthCheck": { + "status": 200, + "bodyBytes": 18432, + "bodyHash": "b7c2...", + "bodySnippet": " { + const tr = rawResults.find((r) => r.name === tc.name); + let resultSig = "(no result captured)"; + if (tr) { + try { + const parsed = typeof tr.result === "string" + ? JSON.parse(tr.result) + : tr.result; + if (parsed && typeof parsed === "object") { + if (parsed.ok === false) { + resultSig = `ERROR: ${parsed.error ?? "unknown"}`; + } else if (parsed.sha256) { + resultSig = `ok bytes=${parsed.bytes} sha=${parsed.sha256.slice(0, 8)}`; + } else if (parsed.previewUrl) { + resultSig = `ok previewUrl=${parsed.previewUrl} health=${parsed.healthCheck?.status ?? "?"}`; + } else if (parsed.uuid) { + resultSig = `ok uuid=${parsed.uuid}`; + } else { + resultSig = "ok"; + } + } + } catch { + resultSig = String(tr.result).slice(0, 80); + } + } + const argSig = JSON.stringify(tc.args ?? {}).slice(0, 100); + return ` - ${tc.name}(${argSig}) → ${resultSig}`; + }).join("\n"); + const suffix = `\n\n[tools executed this turn:\n${summary}\n]`; + msg.content = (msg.content ?? "") + suffix; + msg.toolCalls = undefined; + } + ``` + +2. This requires `_rawToolResults` to be persisted on every assistant + message — the code already does this (see the `_rawToolResults` block + near the end of the stream handler). No schema change needed. + +### Verification + +- Send a message, let the model edit a file, then send a follow-up. In + the second turn's history (log it server-side or inspect via the model's + reasoning), the assistant's previous message must end with a + `[tools executed this turn: ...]` block listing the actual tool calls + and their `sha256` / `previewUrl` / error result. +- Confirm DeepSeek and Gemini both accept the modified history without + the orphan-tool_calls error. + +--- + +## Fix 4 — Broaden loop detection to semantic patterns + +### Problem + +Current detection fingerprints `tool_name|args` and bails on 3 identical +calls. In real loops the args differ slightly: `shell_exec pkill -f next` +→ `shell_exec npm run dev` → `shell_exec curl localhost:3000` → repeat. +Different `command` strings, no fingerprint collision, loop runs to the +30-round cap. + +### Required changes + +In `/api/chat`, replace the fingerprint construction: + +```js +function fingerprintToolCall(tc) { + if (tc.name === "shell_exec") { + const cmd = String(tc.args?.command ?? "").trim(); + // First non-cd verb (pkill, npm, curl, etc.) + const verb = cmd + .split("&&") + .map((s) => s.trim()) + .find((s) => !s.startsWith("cd ")) + ?.split(/\s+/)[0] ?? "shell"; + return `shell_exec:${verb}`; + } + if (tc.name === "fs_write" || tc.name === "fs_edit" || tc.name === "fs_read") { + return `${tc.name}:${tc.args?.path ?? ""}`; + } + if (tc.name === "dev_server_start" || tc.name === "dev_server_stop" || + tc.name === "dev_server_logs" || tc.name === "dev_server_list") { + return `dev_server:${tc.args?.port ?? "?"}`; + } + if (tc.name === "apps_get" || tc.name === "apps_logs" || + tc.name === "apps_deploy" || tc.name === "apps_unstick") { + return `${tc.name}:${tc.args?.uuid ?? ""}`; + } + const argSig = JSON.stringify(tc.args ?? {}).slice(0, 80); + return `${tc.name}:${argSig}`; +} +``` + +Then in the loop body: + +```js +for (const tc of resp.toolCalls) { + toolFingerprints.push(fingerprintToolCall(tc)); +} +// Sliding window of 10 (was 8); threshold 3 stays the same +const window = toolFingerprints.slice(-10); +const counts = new Map(); +for (const fp of window) counts.set(fp, (counts.get(fp) ?? 0) + 1); +const repeated = [...counts.entries()].find(([, n]) => n >= 3); +if (repeated) { + loopBreakReason = `Repeated ${repeated[0]} ${repeated[1]}× in last 10 calls`; +} +``` + +### Verification + +- Simulate the log's failure pattern: `shell_exec pkill -f next` → + `shell_exec npm run dev` → `shell_exec curl localhost:3000` → repeat + three times. The new fingerprinter should report + `Repeated shell_exec:pkill 3× in last 10 calls` and trigger + `loopBreakReason`. +- Confirm a legitimate fan-out (e.g. 5 different `fs_write` to 5 + different files) does NOT trigger because each fingerprint is unique. + +--- + +## Fix 5 — Lower silent-stretch nudge threshold to 2 rounds OR 5 tool calls + +### Problem + +Current `roundsSinceText >= 4` is too lenient. By round 4 the user has +already typed "are you there?" — exactly what happened at messages +12–22 of the failing session. Also, `roundsSinceText` counts rounds, +not tool calls: one round with 8 parallel `fs_write` calls still reads +as a single round. + +### Required changes + +In `/api/chat`, track both round-count and tool-count since last text: + +```js +let roundsSinceText = 0; +let toolCallsSinceText = 0; +``` + +After each model response: + +```js +if (resp.text) { + // ... existing emit ... + roundsSinceText = 0; + toolCallsSinceText = 0; +} else if (resp.toolCalls.length) { + roundsSinceText++; + toolCallsSinceText += resp.toolCalls.length; +} +``` + +And update the nudge condition: + +```js +const isSilent = roundsSinceText >= 2 || toolCallsSinceText >= 5; +let extraSystem = isSilent + ? "\n\n[STATUS NUDGE] You have run " + + `${toolCallsSinceText} tool call(s) over ${roundsSinceText} round(s) ` + + "without sending the user any text. Before any more tool calls, " + + "send ONE short sentence describing what you are currently working " + + "on and why. The user is staring at silent tool pills." + : ""; +``` + +### Verification + +- Trigger a turn where the model fires 6 parallel tool calls in one round + with no text. The next round must receive the `[STATUS NUDGE]` system + injection. +- Trigger a turn where the model fires 1 tool call per round across 3 + rounds with no text. The third round must receive the nudge. + +--- + +## Fix 6 — Broaden recovery-summary trigger beyond `assistantText.length === 0` + +### Problem + +The post-loop recovery summary only fires when the model produced **no +text at all**. In the failing log, the model produced lots of confident +text — it just happened to be wrong about what its tools did. The +recovery path never engaged. + +### Required changes + +In `/api/chat`, replace the `needsRecovery` condition: + +```js +// Inspect the last 3 tool results for any explicit failure (ok: false, +// exit code != 0, http status >= 400). If the model wrote text but its +// final operations failed, we still want a forced honest summary. +function lastToolResultsHadFailure(messages, lookback = 3) { + const toolMsgs = messages.filter((m) => m.role === "tool").slice(-lookback); + for (const tm of toolMsgs) { + const raw = typeof tm.content === "string" ? tm.content : ""; + try { + const parsed = JSON.parse(raw); + if (parsed.ok === false) return true; + if (typeof parsed.exitCode === "number" && parsed.exitCode !== 0) return true; + if (parsed.healthCheck?.status && parsed.healthCheck.status >= 400) return true; + if (typeof parsed.error === "string" && parsed.error.length > 0) return true; + } catch { + // non-JSON result, skip + } + } + return false; +} + +const lastTurnHadTools = + messages.length > 0 && messages[messages.length - 1].role === "tool"; +const needsRecovery = + !aborted && + lastTurnHadTools && + (round >= MAX_TOOL_ROUNDS || + !!loopBreakReason || + assistantText.trim().length === 0 || + lastToolResultsHadFailure(messages)); +``` + +When the recovery summary fires due to `lastToolResultsHadFailure`, the +injected `[RECOVERY]` system message should also note the failure: + +```js +const failureNote = lastToolResultsHadFailure(messages) + ? "Your last tool calls returned failures or non-2xx health checks. " + + "Do NOT claim those operations succeeded. " + : ""; +const reason = loopBreakReason + ? `LOOP DETECTED: ${loopBreakReason}. Stop trying that approach. ` + : round >= MAX_TOOL_ROUNDS + ? "You hit the tool-round cap. " + : ""; +// ... use `${reason}${failureNote}` in the [RECOVERY] prompt ... +``` + +### Verification + +- Stub `dev_server_start` to return `{ ok: false, error: "EADDRINUSE" }`. + Send a request that causes the model to call it. Confirm a recovery + summary fires even if the model already wrote upbeat text. +- Confirm the recovery prompt includes the failure-note text in that case. + +--- + +## Fix 7 — Lower `MAX_TOOL_ROUNDS` from 30 to 15 + +### Problem + +30 rounds is too generous. In the failing log, none of the painful turns +that went past round 15 produced anything useful — they were just the +model trying to recover from its own hallucinations. The cap should +encourage handoff to the user, not extended thrashing. + +### Required changes + +In `/api/chat`: + +```js +const MAX_TOOL_ROUNDS = 15; +``` + +The "[WARNING] You only have N tool calls left" injection at +`MAX_TOOL_ROUNDS - round <= 3` will naturally fire at round 12, which +is still plenty of headroom for the legitimate Path-B scaffold workflows +(scaffold → install → configure → start = ~7 tool calls). + +### Verification + +- Confirm a normal "build me a Next.js app and start the dev server" + request completes in well under 15 rounds. +- Confirm the warning prompt fires at round 13 (cap minus 3 minus 1 + due to comparison being `<= 3` with round already incremented). + +--- + +## Fix 8 — Add a hard-rule prompt clause forbidding unverified mutation claims + +### Problem + +The current "Never fake success" prompt clause is vibes, not a checkable +rule. The model has no instruction telling it *how* to know whether it +faked success. + +### Required changes + +In `buildSystemPrompt`, in the "Hard rules (non-negotiable)" section, +add the following clauses: + +``` +- **Cite the tool result, don't claim from memory.** Before stating + "I edited X" or "the server is running," you must point to a tool + result from THIS turn. If you can't, say "I have not yet made that + change — running the tool now" and then run it. A claim without a + citable tool result is a hallucination. +- **Trust the `ok` field.** Tool results carry an explicit `ok: + true|false`. If `ok` is false (or absent, or `exitCode` is non-zero, + or `healthCheck.status` is >= 400), the operation FAILED. Do not + describe a failed operation as successful. Report the error + verbatim. +- **`fs_write` and `fs_edit` results carry a `sha256` and `bytes` + field on success.** When you tell the user a file was changed, + include the byte count or the first 6 chars of the sha as evidence: + "Updated `page.tsx` (4.8kb, sha=a3f5c2…)." This protects both of + you from drift. +- **`dev_server_start` results carry a `healthCheck` field on + success.** Before telling the user "the preview is ready," confirm + `healthCheck.status === 200`. If it's 502 or empty, the server + isn't actually serving — report that, don't paper over it. +``` + +### Verification + +- Inspect a sample of model outputs after this fix lands. They should + now include phrases like "Updated `page.tsx` (sha=a3f5c2…)" or + "Preview is up (health 200)" rather than bare "I just rewrote + page.tsx." +- Send a request that causes `dev_server_start` to return + `healthCheck.status: 502`. The model must NOT say "preview is + ready" — it must surface the 502. + +--- + +## Fix 9 — Make `autoExtractPlanUpdates` non-authoritative until user-confirmed + +### Problem + +`autoExtractPlanUpdates` is fire-and-forget after every turn and writes +directly into `plan.decisions`, which the next turn's system prompt then +treats as a hard "DO NOT re-litigate" instruction. A single misfire (e.g. +the model says "I logged this as our Global Design System" → extraction +agrees → next turn locks in a decision that was never user-confirmed) +poisons the project state durably. + +### Required changes + +1. In the plan schema, add a `confidence` and `source` field to each + decision/task/idea written by auto-extraction: + ```ts + { title, choice, why, confidence: "auto" | "user", source: "extracted" | "explicit" } + ``` + +2. In `autoExtractPlanUpdates`, write new items with `confidence: "auto"`, + `source: "extracted"`. + +3. In `buildSystemPrompt`, in `decisionsBlock`, split decisions into two + lists: + ``` + **Decisions confirmed by the user (do not re-litigate):** + - + + **Decisions inferred from prior chat (treat as tentative — confirm + with the user before relying on them):** + - + ``` + +4. When the user explicitly confirms (the model calls `plan_decision_log` + with the user's exact words, or the user clicks a confirm UI in the + Plan tab), upgrade `confidence` to `"user"`. + +### Verification + +- Trigger an auto-extraction that writes a decision. Confirm it appears + in the "tentative" list in the next turn's system prompt, not the + authoritative list. +- Have the user confirm the decision. Confirm `confidence` flips to + `"user"` and it moves to the authoritative list. + +--- + +## Fix 10 — Surface auto-commit result in the user reply when something shipped + +### Problem + +`commitAndPushIfDirty` runs fire-and-forget AFTER the assistant message is +persisted. The model never sees the commit SHA or success state in-context, +so it cannot honestly report "committed as abc123." It just hopes. + +### Required changes + +Move the commit step to run BEFORE the final `emit({ type: "done" })`, +and wait for its result with a tight timeout (e.g. 8s). Stream the result +to the client as a structured event: + +```js +if (activeProject?.id && activeProject?.slug && typeof activeProject?.giteaCloneUrl === "string") { + try { + await ensureProjectRepoCloned({ /* ... */ }).catch(() => null); + const firstSentence = (assistantText || "") + .split(/(?<=[.!?])\s+/)[0]?.trim()?.slice(0, 180); + const commitMessage = firstSentence || "AI checkpoint"; + + const commitPromise = commitAndPushIfDirty({ + projectId: activeProject.id, + projectSlug: activeProject.slug, + message: commitMessage, + }); + const timeoutPromise = new Promise((resolve) => + setTimeout(() => resolve({ committed: false, reason: "timeout" }), 8000) + ); + const result = await Promise.race([commitPromise, timeoutPromise]); + + if (result.committed) { + emit({ type: "commit", sha: result.sha, pushed: result.pushed }); + } else if (result.reason && result.reason !== "clean" && result.reason !== "no_repo") { + emit({ type: "commit_failed", reason: result.reason }); + } + } catch (err) { + emit({ type: "commit_failed", reason: String(err) }); + } +} +``` + +The client should render `commit` events as a small inline badge ("✓ +abc123f"). The model itself can be made aware on the NEXT turn via the +tool-history rewrite from Fix 3 — include the commit SHA in the +synthetic `[tools executed this turn: ...]` block. + +### Verification + +- Make a file edit. Confirm the chat reply ends with a visible commit SHA + badge in the UI. +- On a clean turn (no file changes), confirm no commit event fires. +- On a commit timeout (simulate by stubbing `commitAndPushIfDirty` to + hang), confirm the response still completes within ~8s with a + `commit_failed: reason=timeout` event. + +--- + +## Fix 11 — Populate and front-load `buildCodebaseSummary` in the system prompt + +### Problem + +Evidence from the failing session: the agent ran `fs_tree` 5 times and +`fs_glob "**/page.tsx"` 9+ times across one session, repeatedly +re-discovering the same file location. Path tallies across the session +showed FIVE different path conventions used for the homepage: +`getacquired-2-0/src/app/page.tsx` (70x), `src/app/page.tsx` (30x), +empty string `""` (19x), `/workspace/getacquired-2-0/src/app/page.tsx` +(10x), and `app/page.tsx` (1x). The agent never learned which one the +tools expected because the codebase summary that should have told it +was missing or empty. + +Separately, project plan context (vision, audience, brief) is technically +included in the system prompt, but the agent only called `plan_get` ONCE +in 65 turns — and only after the user explicitly asked "can you look at +the plan please to understand the market we are serving." For 16 turns +before that, the agent designed a Wall Street aesthetic for a marketplace +that was actually selling HVAC and plumbing businesses, because the +audience field was buried in a long `activeBlock` after operational +instructions. + +### Required changes + +1. **Make `buildCodebaseSummary` actually return content.** Inspect + `@/lib/ai/project-context/codebase-summary` and verify it: + - Detects the project root by finding `package.json` under + `/workspace//`. + - Reads framework signals (Next.js vs Vite, React vs Vue) from + `package.json` dependencies. + - Lists 10–20 most-important file paths grouped by purpose + (app/, components/, lib/, public/, api/, schema/). + - Returns a deterministic block under ~1500 chars, e.g.: + ``` + [CODEBASE — /workspace//] + Framework: Next.js 16.2 (App Router) + Tailwind + Sentry + Package manager: npm (lockfile: package-lock.json) + Entry points: + - src/app/page.tsx (homepage) + - src/app/layout.tsx (root layout) + - src/app/api/auth/[...nextauth]/route.ts + UI components: + - src/shared/components/layout/Navbar.tsx + - src/shared/components/layout/Footer.tsx + Styles: src/app/globals.css, tailwind.config.ts + Path convention for fs_*: paths are relative to /workspace//. + 'src/app/page.tsx' is correct. Do NOT prefix with the slug. + ``` + - On a fresh project before scaffold, return a short placeholder: + `[CODEBASE — /workspace//] Empty repo; ready for scaffold.` + +2. **Cache the codebase summary aggressively** — recompute only when + `package.json` or directory structure changes (track mtime). The + current implementation likely awaits a fresh computation on every + turn, which is both slow and a source of "summary unavailable" stub + returns when the dev container isn't fully clone-ready yet. + +3. **Front-load the audience/vision/brief.** In `buildSystemPrompt`, + move the project-identity block (name, audience, vision, brief + snippet) to the TOP of the system prompt, BEFORE the long + "How Vibn is structured / Common questions → tools / How to deploy" + section. The current ordering buries the most-decision-relevant + context after ~3000 chars of operational scaffolding. + +4. **Add an explicit "if you don't know the codebase, read this first" + reminder** at the top of the codebase block: + ``` + [CODEBASE] This block tells you the repo layout. You should NOT + need fs_tree or fs_glob "**/page.tsx" to find common files — they're + listed below. Use fs_tree only when you suspect the layout has + changed or you need to explore a subdirectory not listed here. + ``` + +### Verification + +- After this fix, replay a "modify the homepage" request against a + populated project. The agent should call `fs_read` on `src/app/page.tsx` + directly with the correct relative path, with no preceding `fs_tree` + or `fs_glob` discovery calls. +- Inspect the rendered system prompt for a fresh project-scoped thread. + Confirm the `[CODEBASE — ...]` block is present, lists at least the + app entry points, and explicitly states the path convention. +- Replay the failing session's first user message ("apply your design + skills to the home page"). The agent should reference the project's + audience (SMB / Main Street buyers) in its first response, without + needing a `plan_get` call, because the brief is now front-loaded. + +--- + +## Rollout order + +1. **Fix 11 (codebase summary populated and front-loaded)** — landed + first because it eliminates the 10+ redundant discovery calls per + turn and gives the agent ground truth about path conventions before + Fix 1's enforcement starts rejecting wrong-path writes. +2. Fix 1 (path scoping enforcement) — eliminates the ghost-file class + of bug entirely. Should land alongside Fix 11 so the agent knows + what the rules are before they start rejecting writes. +3. Fix 2 (verification data in tool results) — lands the schema the rest + of the fixes depend on. +4. Fix 3 (history rewrite with tool summaries) — only valuable AFTER Fix + 2 is producing sha256/healthCheck fields. +5. Fix 8 (hard-rule prompt clauses citing sha256/healthCheck) — depends + on Fixes 2 and 3. +6. Fix 4 (semantic loop detection) — independent, can ship anytime. +7. Fix 5 (silent-stretch nudge threshold) — independent. +8. Fix 6 (recovery on tool failure) — depends on Fix 2 for `ok` and + `healthCheck` fields. +9. Fix 7 (lower MAX_TOOL_ROUNDS to 15) — independent, ship last after + confirming legitimate workflows still fit. +10. Fix 9 (auto-extraction confidence tiers) — independent. +11. Fix 10 (sync commit with streamed result) — independent. + +After all eleven land, replay the failing session's prompts against the +new harness and confirm: +- The agent references the project's audience and brief in its first + response, without a `plan_get` call (Fix 11). +- No redundant `fs_tree` / `fs_glob "**/page.tsx"` discovery sequences + (Fix 11). +- No "ghost file" path scoping errors (Fix 1). +- No "I just rewrote page.tsx" claim without a citable sha256 (Fixes 2, + 3, 8). +- No 20+ round runaway loops (Fixes 4, 7). +- No silent stretches >2 rounds (Fix 5). +- Failed `dev_server_start` calls produce honest recovery prose, not + "preview is ready" hallucinations (Fixes 6, 8). \ No newline at end of file diff --git a/vibn-frontend/app/api/chat/route.ts b/vibn-frontend/app/api/chat/route.ts index f73e0d8..d73129f 100644 --- a/vibn-frontend/app/api/chat/route.ts +++ b/vibn-frontend/app/api/chat/route.ts @@ -37,7 +37,7 @@ import type { ChatMessage, ToolCall } from "@/lib/ai/gemini-chat"; // gives enough headroom for complex workflows (scaffold → install → // configure → start) while still capping runaway loops. When the cap // IS hit, we emit a recovery summary instead of silent tool pills. -const MAX_TOOL_ROUNDS = 30; +const MAX_TOOL_ROUNDS = 15; let chatTablesReady = false; async function ensureChatTables() { @@ -327,6 +327,40 @@ export async function POST(request: Request) { const history: ChatMessage[] = rows.reverse().map((r: any) => { const msg = r.data; if (msg.role === "assistant" && msg.toolCalls?.length) { + const rawResults = msg._rawToolResults ?? []; + const summary = msg.toolCalls + .map((tc: any) => { + const tr = rawResults.find((r: any) => r.name === tc.name); + let resultSig = "(no result captured)"; + if (tr) { + try { + const parsed = + typeof tr.result === "string" + ? JSON.parse(tr.result) + : tr.result; + if (parsed && typeof parsed === "object") { + if (parsed.ok === false) { + resultSig = `ERROR: ${parsed.error ?? "unknown"}`; + } else if (parsed.sha256) { + resultSig = `ok bytes=${parsed.bytes} sha=${parsed.sha256.slice(0, 8)}`; + } else if (parsed.previewUrl) { + resultSig = `ok previewUrl=${parsed.previewUrl} health=${parsed.healthCheck?.status ?? "?"}`; + } else if (parsed.uuid) { + resultSig = `ok uuid=${parsed.uuid}`; + } else { + resultSig = "ok"; + } + } + } catch { + resultSig = String(tr.result).slice(0, 80); + } + } + const argSig = JSON.stringify(tc.args ?? {}).slice(0, 100); + return ` - ${tc.name}(${argSig}) → ${resultSig}`; + }) + .join("\n"); + const suffix = `\n\n[tools executed this turn:\n${summary}\n]`; + msg.content = (msg.content ?? "") + suffix; msg.toolCalls = undefined; } if (typeof msg.content === "string") {