diff --git a/fix_mcp_fs.js b/fix_mcp_fs.js new file mode 100644 index 0000000..ea9eaac --- /dev/null +++ b/fix_mcp_fs.js @@ -0,0 +1,48 @@ +const fs = require('fs'); + +const file = 'vibn-frontend/app/api/mcp/route.ts'; +let code = fs.readFileSync(file, 'utf8'); + +// 1. Fix toolFsWrite +const oldFsWriteReturn = ` return NextResponse.json({ + result: { path, bytesWritten: Buffer.byteLength(content, "utf8") }, + });`; + +const newFsWriteReturn = ` const { createHash } = require('crypto'); + const bytes = Buffer.byteLength(content, "utf8"); + const sha256 = createHash("sha256").update(content, "utf8").digest("hex"); + return NextResponse.json({ + result: { ok: true, path, bytes, sha256 }, + });`; + +code = code.replace(oldFsWriteReturn, newFsWriteReturn); + + +// 2. Fix toolFsEdit +const oldFsEditCmd = `const cmd = \`python3 -c "$(printf %s \${shq(pyB64)} | base64 -d)" <<< "$(printf %s \${shq(b64)} | base64 -d)"\`;`; +const newFsEditCmd = `const cmd = \`python3 -c "$(printf %s \${shq(pyB64)} | base64 -d)" <<< "$(printf %s \${shq(b64)} | base64 -d)" && echo "---" && sha256sum \${shq(path)} | cut -d' ' -f1 && wc -c < \${shq(path)}\`;`; + +code = code.replace(oldFsEditCmd, newFsEditCmd); + +const oldFsEditReturn = ` return NextResponse.json({ + result: { path, replacements: parseInt(r.stdout.trim() || "0", 10) }, + });`; + +const newFsEditReturn = ` const stdoutParts = r.stdout.split('---'); + const replacementsStr = stdoutParts[0].trim(); + const hashAndSize = stdoutParts[1] ? stdoutParts[1].trim().split('\\n') : []; + + return NextResponse.json({ + result: { + ok: true, + path, + replacements: parseInt(replacementsStr || "0", 10), + sha256: hashAndSize[0] ? hashAndSize[0].trim() : undefined, + bytes: hashAndSize[1] ? parseInt(hashAndSize[1].trim(), 10) : undefined + }, + });`; + +code = code.replace(oldFsEditReturn, newFsEditReturn); + +fs.writeFileSync(file, code); +console.log("Patched toolFsWrite and toolFsEdit"); diff --git a/fix_mcp_fs_normalize.js b/fix_mcp_fs_normalize.js new file mode 100644 index 0000000..b3ab3ce --- /dev/null +++ b/fix_mcp_fs_normalize.js @@ -0,0 +1,82 @@ +const fs = require('fs'); + +const file = 'vibn-frontend/app/api/mcp/route.ts'; +let code = fs.readFileSync(file, 'utf8'); + +const oldNormalize = `function normalizeFsPath(p: string): string | NextResponse { + if (!p || typeof p !== "string") { + return NextResponse.json( + { error: 'Param "path" is required' }, + { status: 400 }, + ); + } + let abs: string; + if (p.startsWith("/")) { + abs = p; + } else { + abs = \`\${FS_ROOT}/\${p}\`.replace(/\\/+/g, "/"); + } + // Disallow .. traversal that escapes /workspace. + const norm = abs.replace(/\\/[^/]+\\/\\.\\.(?=\\/|$)/g, "").replace(/\\/+/g, "/"); + if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) { + return NextResponse.json( + { + error: \`Path "\${p}" is outside \${FS_ROOT}; use shell.exec for system paths.\`, + }, + { status: 400 }, + ); + } + return norm; +}`; + +const newNormalize = `function normalizeFsPath( + p: string, + projectSlug?: string, +): string | NextResponse { + if (!p || typeof p !== "string") { + return NextResponse.json( + { error: 'Param "path" is required' }, + { status: 400 }, + ); + } + const projectRoot = projectSlug ? \`\${FS_ROOT}/\${projectSlug}\` : FS_ROOT; + let abs: string; + if (p.startsWith("/")) { + abs = p; + } else { + abs = \`\${projectRoot}/\${p}\`.replace(/\\/+/g, "/"); + } + const norm = abs.replace(/\\/[^/]+\\/\\.\\.(?=\\/|$)/g, "").replace(/\\/+/g, "/"); + + // When projectSlug is set, REJECT paths outside the project root. + if (projectSlug) { + if (!norm.startsWith(projectRoot) && norm !== projectRoot) { + return NextResponse.json( + { + ok: false, + error: \`PATH_OUTSIDE_PROJECT: path "\${p}" resolves to "\${norm}" which is outside the active project at "\${projectRoot}". Did you mean "\${projectRoot}/\${p.replace(/^\\/+/, "")}"?\`, + }, + { status: 400 }, + ); + } + } else { + // Workspace-level fallback (legacy behaviour) + if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) { + return NextResponse.json( + { error: \`Path "\${p}" is outside \${FS_ROOT}; use shell.exec for system paths.\` }, + { status: 400 }, + ); + } + } + return norm; +}`; + +code = code.replace(oldNormalize, newNormalize); + +code = code.replace(/const path = normalizeFsPath\(String\(params\.path \?\? ""\)\);/g, 'const path = normalizeFsPath(String(params.path ?? ""), project.slug);'); +code = code.replace(/const path = normalizeFsPath\(String\(params\.path \?\? "\/workspace"\)\);/g, 'const path = normalizeFsPath(String(params.path ?? "/workspace"), project.slug);'); +code = code.replace(/const cwd = normalizeFsPath\(String\(params\.cwd \?\? "\/workspace"\)\);/g, 'const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"), project.slug);'); +code = code.replace(/const targetPath = normalizeFsPath\(String\(params\.targetPath \?\? ""\)\);/g, 'const targetPath = normalizeFsPath(String(params.targetPath ?? ""), project.slug);'); + +fs.writeFileSync(file, code); +console.log("Patched normalizeFsPath with projectSlug scoping"); diff --git a/may-19-ai-review.md b/may-19-ai-review.md new file mode 100644 index 0000000..ade4149 --- /dev/null +++ b/may-19-ai-review.md @@ -0,0 +1,601 @@ +# Vibn Chat Harness — Fix Checklist + +Work through items in order. Each fix has a clear **What**, **Where**, +**How**, and **Verify** section. Don't skip the verify step — many of +these fixes interact with each other and silent failures will +compound. + +Mark `[x]` as you complete each item. If you can't complete an item, +add a short note under it explaining why and move on. + +--- + +## Phase 1 — Backend fixes (highest leverage; do these first) + +These three fix the failure modes the prompt currently promises but +the backend doesn't deliver. Until they're done, the prompt's hard +rules are partly fiction. + +### [ ] 1. Add `sha256` and `bytes` to `fs.write` and `fs.edit` responses + +**What:** The prompt's hard rules tell the model to cite `sha256` and +`bytes` as evidence of file changes. The tools don't return those +fields today, so the model is looking for evidence that doesn't exist. + +**Where:** `app/api/mcp/route.ts` — functions `toolFsWrite` and +`toolFsEdit`. + +**How:** +- In `toolFsWrite`, after the `runFsCmd` success branch, before + returning, compute the sha256 of `content` and return it alongside + `bytesWritten` renamed to `bytes`: + ```ts + import { createHash } from "crypto"; + // ... + const bytes = Buffer.byteLength(content, "utf8"); + const sha256 = createHash("sha256").update(content, "utf8").digest("hex"); + return NextResponse.json({ + result: { ok: true, path, bytes, sha256 }, + }); + ``` +- In `toolFsEdit`, you don't have the final content in memory. Add a + second command that prints the sha + bytes after the edit: + ```ts + const cmd = `python3 -c "$(printf %s ${shq(pyB64)} | base64 -d)" <<< "$(printf %s ${shq(b64)} | base64 -d)" && echo "---" && sha256sum ${shq(path)} | cut -d' ' -f1 && wc -c < ${shq(path)}`; + ``` + Then parse the trailing two lines after `---` for sha and bytes. +- Update the response shape: + ```ts + return NextResponse.json({ + result: { ok: true, path, replacements, bytes, sha256 }, + }); + ``` + +**Verify:** +- [ ] Call `fs_write` with `{ path: "test.txt", content: "hello" }`. + Confirm response contains `sha256` (64 hex chars) and `bytes: 5`. +- [ ] Call `fs_edit` to change the same file. Confirm response + contains a new `sha256` and updated `bytes`. +- [ ] Replay a turn that does `fs_write` followed by `fs_read` of the + same file in chat. The model should now produce text like + "Updated `test.txt` (sha=a3f5c2…, 5b)" instead of a bare claim. + +--- + +### [ ] 2. Add project-slug scoping to `normalizeFsPath` + +**What:** The prompt tells the model to use paths like `src/app/page.tsx` +and claims the tool layer rejects writes outside the project root. +The tool layer does NOT do this today. It resolves all relative paths +under `/workspace` (workspace-level), so `fs_write { path: "src/app/page.tsx" }` +ends up at `/workspace/src/app/page.tsx` — the ghost file from the +failing session. Five different path conventions were used for the +same file in one session because nothing enforces the rule. + +**Where:** `app/api/mcp/route.ts` — function `normalizeFsPath` and +every caller in `toolFsRead`, `toolFsWrite`, `toolFsEdit`, +`toolFsList`, `toolFsDelete`, `toolFsGlob`, `toolFsGrep`, `toolFsTree`, +`toolRequestVisualQA`, `toolGenerateMedia`. + +**How:** + +- Change `normalizeFsPath` to accept an optional `projectSlug`: + ```ts + function normalizeFsPath( + p: string, + projectSlug?: string, + ): string | NextResponse { + if (!p || typeof p !== "string") { + return NextResponse.json( + { error: 'Param "path" is required' }, + { status: 400 }, + ); + } + const projectRoot = projectSlug ? `${FS_ROOT}/${projectSlug}` : FS_ROOT; + let abs: string; + if (p.startsWith("/")) { + abs = p; + } else { + abs = `${projectRoot}/${p}`.replace(/\/+/g, "/"); + } + const norm = abs.replace(/\/[^/]+\/\.\.(?=\/|$)/g, "").replace(/\/+/g, "/"); + + // When projectSlug is set, REJECT paths outside the project root. + if (projectSlug) { + if (!norm.startsWith(projectRoot) && norm !== projectRoot) { + return NextResponse.json( + { + ok: false, + error: `PATH_OUTSIDE_PROJECT: path "${p}" resolves to "${norm}" which is outside the active project at "${projectRoot}". Did you mean "${projectRoot}/${p.replace(/^\/+/, "")}"?`, + }, + { status: 400 }, + ); + } + } else { + // Workspace-level fallback (legacy behaviour) + if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) { + return NextResponse.json( + { error: `Path "${p}" is outside ${FS_ROOT}; use shell.exec for system paths.` }, + { status: 400 }, + ); + } + } + return norm; + } + ``` +- In every fs_* tool that already calls `resolveProjectOr404`, pass + the slug: + ```ts + const path = normalizeFsPath(String(params.path ?? ""), project.slug); + ``` +- `toolFsRead`, `toolFsWrite`, `toolFsEdit`, `toolFsDelete`, + `toolRequestVisualQA`, `toolGenerateMedia` all already have + `project` in scope — pass `project.slug`. +- `toolFsList`, `toolFsGlob`, `toolFsGrep`, `toolFsTree` use + `params.path` or `params.cwd` — same treatment, pass `project.slug`. + +**Verify:** +- [ ] From a project-scoped thread, call `fs_write { path: "/workspace/src/app/page.tsx", content: "x" }`. + Expect `PATH_OUTSIDE_PROJECT` error. +- [ ] From the same thread, call `fs_write { path: "src/app/page.tsx", content: "x" }`. + Expect success at `/workspace//src/app/page.tsx`. +- [ ] Confirm `dev_server_start` with `command: "npm run dev"` runs + from the project root, not `/workspace`. (This is mostly already + true via dev-container logic; just confirm.) + +--- + +### [ ] 3. Fix the broken `plan-extract` block + +**What:** The fire-and-forget `plan-extract` block in +`app/api/chat/route.ts` has a syntax error — the `try` block builds a +transcript and then hits `}` followed by `catch` with no actual call +to `autoExtractPlanUpdates`. The body of the try is missing. Either +the auto-extraction was intentionally removed (in which case the +dead transcript-building code should also be deleted) or it was +accidentally truncated (in which case the call needs to be restored). + +**Where:** `app/api/chat/route.ts`, around the second fire-and-forget +block (after the title/summary block, before `emit({ type: "done" })`). + +**How:** + +- Decide first: do we want auto-extraction to run? If YES, restore + the call: + ```ts + (async () => { + try { + if (!threadProjectId) return; + const allMessages = [...history, finalMsg]; + if (allMessages.length < 2) return; + const transcript = allMessages + .map((m) => { + const text = + typeof m.content === "string" + ? m.content + : JSON.stringify(m.content); + return `${m.role.toUpperCase()}: ${text.slice(0, 1200)}`; + }) + .join("\n\n"); + const result = await autoExtractPlanUpdates( + threadProjectId, + transcript, + ); + if (result) { + console.log( + "[chat] plan-extract:", + `${result.tasks} tasks, ${result.decisions} decisions, vision=${result.vision}`, + ); + } + } catch (err) { + console.warn("[chat] plan-extract failed (non-fatal):", err); + } + })().catch(() => {}); + ``` + And re-add the `import { autoExtractPlanUpdates } from "@/lib/ai/plan-extract";` + at the top of the file. +- If NO (you removed it intentionally), delete the entire IIFE + including the transcript-building so the file compiles cleanly. + +**Verify:** +- [ ] Run `tsc --noEmit` on the file. Confirm no syntax errors. +- [ ] If auto-extraction restored: have a chat that mentions a + decision (e.g. "let's use Postgres"). Confirm a new entry appears + in the project's `plan.decisions` with `confidence: "auto"`. +- [ ] Tail prod logs for `[chat] plan-extract:` — should fire on + every turn with content. + +--- + +## Phase 2 — Prompt fixes (now that the backend matches) + +These bring the prompt into line with what the tools actually do. + +### [ ] 4. Fix the `apps_containers_list` typo in the prompt + +**What:** The troubleshooting section references `apps_containers_list` +but the actual tool is `apps_containers_ps`. The model will call a +tool that doesn't exist. + +**Where:** `app/api/chat/route.ts`, inside `buildSystemPrompt`, in the +"## Troubleshooting" section. + +**How:** +- Find: `apps_logs { uuid } + apps_containers_list { uuid }` +- Replace: `apps_logs { uuid } + apps_containers_ps { uuid }` + +**Verify:** +- [ ] Grep the prompt for `apps_containers_list` — no matches. +- [ ] Grep for `apps_containers_ps` — should appear in + troubleshooting and at least once in the apps section. + +--- + +### [ ] 5. Soften the `ok` field rule + +**What:** The current rule says "If `ok` is false (or absent, or +`exitCode` is non-zero, or `healthCheck.status` is >= 400) the +operation FAILED." The "or absent" clause is wrong — many tools +return data without an `ok` field (e.g. `projects_get`, `apps_list`, +`databases_get`). The model will treat every read as a failure. + +**Where:** `buildSystemPrompt`, "Hard rules" section, the "Trust the +`ok` field" bullet. + +**How:** + +Replace the current rule with: + +``` +- **Read tool results carefully.** A tool FAILED when ANY of these + signals are present: `ok: false`, `error: "..."`, a non-zero + `exitCode`, or a `healthCheck.status` >= 400. If NONE of those + signals are present, look at the actual content of the response + to decide whether the operation succeeded. Many read-only tools + return data directly without an `ok` field — that's not a failure. +``` + +**Verify:** +- [ ] Pick a recent thread where the agent called `projects_get` or + `apps_list`. Confirm the agent didn't treat the response as a + failure (look at its post-tool text — should be a normal summary, + not "the operation failed"). + +--- + +### [ ] 6. Tighten the status-nudge threshold + +**What:** Current thresholds are `roundsSinceText >= 8 || +toolCallsSinceText >= 12`. With `MAX_TOOL_ROUNDS = 8`, the round-based +nudge can never fire (loop ends first). The tool-call threshold of 12 +is also too lenient — users typed "test" / "hello" by round 4-5 of +silence in the failing session. + +**Where:** `app/api/chat/route.ts`, near the top of the main while +loop, the `isSilent` constant. + +**How:** + +```ts +const isSilent = roundsSinceText >= 3 || toolCallsSinceText >= 6; +``` + +**Verify:** +- [ ] Replay a chat that triggers 6+ tool calls without text. Confirm + the `[STATUS NUDGE]` system addendum is injected before the next + round. +- [ ] Confirm the model produces a one-line status sentence in + response to the nudge. + +--- + +### [ ] 7. Update the path-convention guidance in the prompt + +**What:** After Fix 2 ships, the path convention is now enforced. The +prompt should state this plainly without the "cd into your project" +workaround. + +**Where:** `buildSystemPrompt`, inside `activeBlock`, the path +guidance section. Also inside "Dev servers" → "Directory" bullet. + +**How:** + +Replace the "Directory" bullet under "Dev servers": + +``` +- **Directory:** Tool paths are scoped to your project root + automatically. Pass `command: "npm run dev"` directly — no `cd` + prefix needed. The tool rejects any `fs_*` write outside + `/workspace//`. +``` + +And after the "Project repo is auto-cloned" paragraph in +`activeBlock`, add: + +``` +**Path convention for fs_* tools:** Pass paths relative to the +project root — `src/app/page.tsx`, NOT `/workspace//src/app/page.tsx` +and NOT `/src/app/page.tsx`. The tool layer rejects writes +outside the project root with a `PATH_OUTSIDE_PROJECT` error +suggesting the corrected path. +``` + +**Verify:** +- [ ] In a fresh chat, ask the agent to "edit the homepage". Confirm + the first `fs_read` call uses `src/app/page.tsx` (no slug prefix, + no `/workspace/` prefix). + +--- + +## Phase 3 — Polish and safety nets + +These are lower-priority but each removes a small foot-gun. + +### [ ] 8. Add `fs_tree` recommendation to first-turn behavior + +**What:** The agent ran `fs_tree` 5 times and `fs_glob` 9+ times in +the failing session, re-discovering paths it should have learned once. +The tool description already says "ALWAYS call this first" but the +prompt doesn't reinforce it. + +**Where:** `buildSystemPrompt`, in the "Writing code" section. + +**How:** + +Add this near the top of "Writing code — dev container is the default": + +``` +**Orient yourself once.** On the first code-modifying turn of a +chat, call `fs_tree` once to learn the repo layout. Don't re-run it +on every turn — the layout doesn't change between user messages. +``` + +**Verify:** +- [ ] Manual review of the next 5 sessions: confirm `fs_tree` is + called at most once per chat (not per turn). + +--- + +### [ ] 9. Add `browser_navigate` and `browser_console` as verification primitives + +**What:** The backend has `browser_navigate` and `browser_console` +tools that headlessly render a page and capture console errors. The +prompt never mentions them. These are the missing post-deploy +verification step that the `healthCheck` field gestures at. + +**Where:** `buildSystemPrompt`, in the "Dev servers" subsection or as +its own section after "Visual QA". + +**How:** + +Add a new bullet right after "Visual QA" guidance: + +``` +**Verify the page actually renders:** +- After `dev_server_start` returns a `previewUrl` AND `healthCheck.status === 200`, + for any UI-facing turn, call `browser_console { url: previewUrl }` to + capture frontend console errors. Hydration errors, missing assets, + and uncaught exceptions show up here even when the server is + technically "running". +- If `browser_console` returns errors, fix them with `fs_edit` + before declaring done. A green `healthCheck` plus a clean console + is the real "done" signal for UI work. +- Skip this for backend / SQL / config-only changes. +``` + +**Verify:** +- [ ] On the next UI-modifying chat, confirm `browser_console` is + called once after `dev_server_start`. +- [ ] Confirm any errors it returns get acknowledged in the agent's + reply. + +--- + +### [ ] 10. Add the market research stack to the prompt + +**What:** The backend exposes six market research tools +(`market_categories_suggest`, `market_research_run`, `market_seo_analyze`, +`tech_stack_analyze`, `market_competitor_research`, +`market_aggregate_insights`). The prompt never mentions them. For a +non-technical-founder product, these are some of the highest-leverage +tools — they answer "should I build for dentists or summer camps?" +with real TAM counts. + +**Where:** `buildSystemPrompt`, add a new section after "Common +questions → tools" or before "How to deploy". + +**How:** + +Add this section: + +``` +## Helping the user pick what to build + +Vibn has a market-research toolkit for non-technical founders who +need data on their target niche. Use it when the user is undecided, +validating an idea, or comparing markets: + +- **"How big is the market for X in ?"** → `market_categories_suggest { niche }` to + propose Google Business categories, then `market_research_run` after + the user approves. Returns TAM count, sample domains, and review + data. NOTE: `market_research_run` costs real money — always confirm + with the user and pass `user_explicitly_approved: true`. +- **"What are competitors spending on Google Ads?"** → `market_seo_analyze { domain }`. + Returns organic traffic, paid traffic, ad spend, and top paid + keywords. Use to tell the user how aggressive a market is. +- **"What software do these businesses already use?"** → `tech_stack_analyze { urls, software_category_id }`. + Detects WordPress, Shopify, named competitors, and any custom + domains/scripts you pass. Use to find "X businesses use WordPress + but lack Y" market gaps. +- **"What are customers complaining about?"** → `market_aggregate_insights { category, location }`. + Returns top review topics — use the actual words customers use as + marketing copy and value-prop seeds. +- **"Who are the players in this niche?"** → `market_competitor_research { niche }`. + Returns proprietary competitors with pricing AND open-source + alternatives that could be forked. + +These are conversational research tools — they don't build anything. +Use them BEFORE scaffolding when the user is exploring direction; +SKIP them once the user has committed to building. +``` + +**Verify:** +- [ ] In a fresh chat, ask the agent "should I build for dentists or + summer camps?". Confirm it proposes using `market_categories_suggest` + or `market_aggregate_insights` rather than guessing. + +--- + +### [ ] 11. Surface `apps_exec`, `auth_create`, `generate_media`, `storage_*` briefly + +**What:** Several capabilities the agent has are completely absent +from the prompt. The agent doesn't know it can run commands inside +production containers, deploy real auth servers, generate images, or +wire S3 storage. One-line mentions are enough. + +**Where:** `buildSystemPrompt`, scattered across existing sections. + +**How:** + +- Under "Common questions → tools", add: + ``` + - "Run a migration / psql in prod" → `apps_exec { uuid, command }`. + - "Generate a hero image / illustration" → `generate_media { prompt, type, outputPath }`. + - "Wire up file storage / uploads" → `storage_provision` (if not already), then `storage_inject_env { uuid }`. + ``` + +- In the "Decision defaults" section, augment the auth bullet: + ``` + - **Auth:** NextAuth with email magic-link for in-app auth. + Deploy a separate Pocketbase / Authentik / Keycloak service via + `auth_create { provider }` only if the user needs SSO, multi-app + SSO, or admin user management. + ``` + +**Verify:** +- [ ] Ask the agent "can you run a SQL migration on prod?". Confirm + it references `apps_exec`. +- [ ] Ask "I need a hero image for the landing page". Confirm it + references `generate_media`. + +--- + +### [ ] 12. Flip the `fs_edit` preference to match the tool's documentation + +**What:** The tool description says `startLine`/`endLine` is +preferred and `oldString` is the fallback. The prompt currently says +the opposite ("prefer `oldString` for small replacements"). The tool +is authoritative — match it. + +**Where:** `buildSystemPrompt`, in the "Iterate" bullet under "Writing +code", the `fs_edit` guidance. + +**How:** + +Replace the current `fs_edit` guidance with: + +``` +- `fs_read` / `fs_write` / `fs_edit { path, oldString, newString, startLine, endLine }`. + **For `fs_edit`:** prefer `startLine`/`endLine` (deterministic; never + fails on duplicate strings). Use `oldString` only when you cannot + read the file first to get line numbers — and when you do, include + 2-3 lines of surrounding context for uniqueness. If `fs_edit` keeps + failing, do NOT escape to `shell_exec` with patch scripts — read + the file fresh with `fs_read`, get the line numbers, and try again. +``` + +**Verify:** +- [ ] Confirm next 3 `fs_edit` calls in the wild use `startLine`/`endLine`, + not `oldString`. + +--- + +## Phase 4 — Monitoring (after Phases 1-3 land) + +Once the changes are in production, watch these for a week. Tune if +the numbers don't move. + +### [ ] 13. Track the recovery-summary fire rate + +**What:** The `needsRecovery` path runs when a turn ends badly (hit +the round cap, hit a loop, or last tool returned failure). It should +fire on <10% of turns. If it fires more often, the cap is too low or +the model is hitting real bugs. + +**How:** Add a metric. In the `needsRecovery` block, before calling +`callVibnChat`, emit: + +```ts +console.log("[chat] recovery_fired", { + turnId, + reason: loopBreakReason ? "loop" + : round >= MAX_TOOL_ROUNDS ? "round_cap" + : assistantText.trim().length === 0 ? "no_text" + : "tool_failure", + toolCalls: assistantToolCalls.length, +}); +``` + +**Verify:** +- [ ] Aggregate over 1 week. If `round_cap` is the dominant reason + and the turns look legitimate, raise `MAX_TOOL_ROUNDS` to 10 or 12. +- [ ] If `loop` is dominant, the fingerprinter may need tuning. + +--- + +### [ ] 14. Track conversational-guard fire rate + +**What:** The first-turn conversational guard (regex match on user +message → no tools on round 1) is the biggest single behavioral +change in this revision. We want to know how often it fires and +whether it ever causes a problem. + +**How:** Before the loop, after computing `firstMessageIsConversational`: + +```ts +console.log("[chat] turn_start", { + turnId, + firstMessageIsConversational, + messagePreview: message.trim().slice(0, 80), +}); +``` + +**Verify:** +- [ ] Aggregate over 1 week. Should fire on ~30-40% of first messages + in a chat. +- [ ] Spot-check 10 cases where it fired: confirm none were + legitimate "build me X" requests being miscategorized. + +--- + +### [ ] 15. Track `PATH_OUTSIDE_PROJECT` rejections + +**What:** After Fix 2 ships, this error tells you how often the model +was about to write to the wrong path. Should taper toward zero as the +prompt guidance + enforcement settles in. + +**How:** Server-side log in `normalizeFsPath` when the project-scoped +rejection fires. + +**Verify:** +- [ ] Week 1: count rejections per day. +- [ ] Week 2: count should be lower (model has internalized the rule + via the error messages it's seen). + +--- + +## What this checklist deliberately doesn't include + +A few things from earlier reviews that I'm intentionally leaving off: + +- **Phase-aware behavior.** Phases were removed from the product; the + prompt no longer references them. No work needed. +- **Codebase summary auto-generation.** Lower priority once Fix 2 + ships (paths can no longer drift) and Fix 8 lands (one `fs_tree` + per chat instead of nine). +- **Tool history reconstruction for DeepSeek compatibility.** Already + shipped in the current code via the `_rawToolResults` compact + summary. No additional work. + +Don't add work for these unless a clear failure mode appears in +production after Phases 1-3 land. \ No newline at end of file diff --git a/patch_prompts.js b/patch_prompts.js new file mode 100644 index 0000000..02f1132 --- /dev/null +++ b/patch_prompts.js @@ -0,0 +1,73 @@ +const fs = require('fs'); + +const file = 'vibn-frontend/app/api/chat/route.ts'; +let code = fs.readFileSync(file, 'utf8'); + +// Fix 4: apps_containers_list -> apps_containers_ps +code = code.replace(/apps_containers_list \{ uuid \}/g, 'apps_containers_ps { uuid }'); + +// Fix 5: Soften ok field rule +const oldOkRule = `- **Trust the \\\`ok\\\` field.** Every tool result carries \\\`ok: true | false\\\`. If \\\`ok\\\` is false (or \\\`exitCode\\\` is non-zero, or \\\`healthCheck.status\\\` is >= 400), the operation FAILED. Do not describe a failed operation as successful. Surface the error verbatim and propose a next step.`; +const newOkRule = `- **Read tool results carefully.** A tool FAILED when ANY of these signals are present: \\\`ok: false\\\`, \\\`error: "..."\\\`, a non-zero \\\`exitCode\\\`, or a \\\`healthCheck.status\\\` >= 400. If NONE of those signals are present, look at the actual content of the response to decide whether the operation succeeded. Many read-only tools return data directly without an \\\`ok\\\` field — that's not a failure.`; +code = code.replace(oldOkRule, newOkRule); + +// Fix 7: Path conventions +const oldDirRule = `- **Directory:** the tool resolves paths relative to the active project root — you can pass \\\`command: "npm run dev"\\\` directly. (If you need to manually \\\`cd\\\`, use the project slug.)`; +const newDirRule = `- **Directory:** Tool paths are scoped to your project root automatically. Pass \\\`command: "npm run dev"\\\` directly — no \\\`cd\\\` prefix needed. The tool rejects any \\\`fs_*\\\` write outside \\\`/workspace//\\\`.`; +code = code.replace(oldDirRule, newDirRule); + +const oldPathConv = `**fs_* path convention for this project:** Pass paths relative to \\\`/workspace/\${activeProject.slug ?? ""}/\\\` — e.g. \\\`src/app/page.tsx\\\`, not \\\`/workspace/\${activeProject.slug ?? ""}/src/app/page.tsx\\\` and not \\\`getacquired-style/src/app/page.tsx\\\`. The tool layer rejects paths outside the project root.`; +const newPathConv = `**Path convention for fs_* tools:** Pass paths relative to the project root — \\\`src/app/page.tsx\\\`, NOT \\\`/workspace/\${activeProject.slug ?? ""}/src/app/page.tsx\\\` and NOT \\\`\${activeProject.slug ?? ""}/src/app/page.tsx\\\`. The tool layer rejects writes outside the project root with a \\\`PATH_OUTSIDE_PROJECT\\\` error suggesting the corrected path.`; +code = code.replace(oldPathConv, newPathConv); + +// Fix 8: fs_tree recommendation +const devContainerStart = `**Start a coding session:** \\\`devcontainer_ensure { projectId }\\\` (idempotent; first call ~10s, then instant).`; +const devContainerStartNew = `**Start a coding session:** \\\`devcontainer_ensure { projectId }\\\` (idempotent; first call ~10s, then instant). + +**Orient yourself once.** On the first code-modifying turn of a chat, call \\\`fs_tree\\\` once to learn the repo layout. Don't re-run it on every turn — the layout doesn't change between user messages.`; +code = code.replace(devContainerStart, devContainerStartNew); + +// Fix 9: browser_console +const visualQaBlock = `**Visual QA:** \\\`request_visual_qa { targetPath }\\\` critiques a UI file against a 5-dim design rubric. **Call this whenever you modify visual UI code** before returning the \\\`previewUrl\\\`. If it returns actionable issues, fix them with \\\`fs_edit\\\` before ending the turn. Skip for backend / SQL / config / non-visual changes.`; +const visualQaBlockNew = `**Visual QA:** \\\`request_visual_qa { targetPath }\\\` critiques a UI file against a 5-dim design rubric. **Call this whenever you modify visual UI code** before returning the \\\`previewUrl\\\`. If it returns actionable issues, fix them with \\\`fs_edit\\\` before ending the turn. Skip for backend / SQL / config / non-visual changes. + +**Verify the page actually renders:** +- After \\\`dev_server_start\\\` returns a \\\`previewUrl\\\` AND \\\`healthCheck.status === 200\\\`, for any UI-facing turn, call \\\`browser_console { url: previewUrl }\\\` to capture frontend console errors. Hydration errors, missing assets, and uncaught exceptions show up here even when the server is technically "running". +- If \\\`browser_console\\\` returns errors, fix them with \\\`fs_edit\\\` before declaring done. A green \\\`healthCheck\\\` plus a clean console is the real "done" signal for UI work. +- Skip this for backend / SQL / config-only changes.`; +code = code.replace(visualQaBlock, visualQaBlockNew); + +// Fix 10: Market research stack +const commonQuestionsBlock = `## Common questions → tools +- "What is project X?" → \\\`projects_get { projectId }\\\` +- "What's running / has a domain?" → \\\`apps_list { projectId }\\\` (or workspace-wide without projectId) +- "Show logs / containers / env" → \\\`apps_list\\\` to resolve uuid, then \\\`apps_logs\\\` / \\\`apps_containers_list\\\` / \\\`apps_envs_list\\\` +- "Find an OSS X" → \\\`github_search\\\` (include \\\`license:mit\\\` by default), then \\\`github_file\\\` to read README / docker-compose +- "What do the docs say about Y?" → \\\`http_fetch\\\``; + +// Oops, we changed apps_containers_list to apps_containers_ps in Fix 4, so let's match the updated one: +const commonQuestionsBlockMatched = `## Common questions → tools +- "What is project X?" → \\\`projects_get { projectId }\\\` +- "What's running / has a domain?" → \\\`apps_list { projectId }\\\` (or workspace-wide without projectId) +- "Show logs / containers / env" → \\\`apps_list\\\` to resolve uuid, then \\\`apps_logs\\\` / \\\`apps_containers_ps\\\` / \\\`apps_envs_list\\\` +- "Find an OSS X" → \\\`github_search\\\` (include \\\`license:mit\\\` by default), then \\\`github_file\\\` to read README / docker-compose +- "What do the docs say about Y?" → \\\`http_fetch\\\``; + +const marketResearchBlock = ` + +## Helping the user pick what to build + +Vibn has a market-research toolkit for non-technical founders who need data on their target niche. Use it when the user is undecided, validating an idea, or comparing markets: + +- **"How big is the market for X in ?"** → \\\`market_categories_suggest { niche }\\\` to propose Google Business categories, then \\\`market_research_run\\\` after the user approves. Returns TAM count, sample domains, and review data. NOTE: \\\`market_research_run\\\` costs real money — always confirm with the user and pass \\\`user_explicitly_approved: true\\\`. +- **"What are competitors spending on Google Ads?"** → \\\`market_seo_analyze { domain }\\\`. Returns organic traffic, paid traffic, ad spend, and top paid keywords. Use to tell the user how aggressive a market is. +- **"What software do these businesses already use?"** → \\\`tech_stack_analyze { urls, software_category_id }\\\`. Detects WordPress, Shopify, named competitors, and any custom domains/scripts you pass. Use to find "X businesses use WordPress but lack Y" market gaps. +- **"What are customers complaining about?"** → \\\`market_aggregate_insights { category, location }\\\`. Returns top review topics — use the actual words customers use as marketing copy and value-prop seeds. +- **"Who are the players in this niche?"** → \\\`market_competitor_research { niche }\\\`. Returns proprietary competitors with pricing AND open-source alternatives that could be forked. + +These are conversational research tools — they don't build anything. Use them BEFORE scaffolding when the user is exploring direction; SKIP them once the user has committed to building.`; + +code = code.replace(commonQuestionsBlockMatched, commonQuestionsBlockMatched + marketResearchBlock); + +fs.writeFileSync(file, code); +console.log("Applied Phase 2 and 3 prompt fixes"); diff --git a/vibn-frontend/app/api/chat/route.ts b/vibn-frontend/app/api/chat/route.ts index 8df579f..f432246 100644 --- a/vibn-frontend/app/api/chat/route.ts +++ b/vibn-frontend/app/api/chat/route.ts @@ -215,6 +215,8 @@ Each project has a persistent \`vibn-dev\` container. Edit files via \`fs_*\` an **Start a coding session:** \`devcontainer_ensure { projectId }\` (idempotent; first call ~10s, then instant). +**Orient yourself once.** On the first code-modifying turn of a chat, call \`fs_tree\` once to learn the repo layout. Don't re-run it on every turn — the layout doesn't change between user messages. + **Iterate:**\n- \`shell_exec { projectId, command }\` — anything: \`ls\`, \`npm install\`, \`npm test\`, \`npx create-next-app .\`, \`git status\`. Cwd defaults to \`/workspace\`. Node (LTS), Python 3.12, and Go 1.23 are pre-installed — no setup needed.\n- \`fs_read\` / \`fs_write\` / \`fs_edit { path, oldString, newString, startLine, endLine }\`. IMPORTANT: For fs_edit, ALWAYS prefer using \`oldString\` for small replacements if you are confident. If you use \`oldString\`, you MUST include 2-3 lines of surrounding context for uniqueness, otherwise it fails fast. If you are replacing large blocks, use \`startLine\` and \`endLine\` instead.\n- \`fs_glob\` / \`fs_grep\` (ripgrep, respects .gitignore) / \`fs_list\` / \`fs_delete\`.\n **Dev servers (preview URL via \`*.preview.vibnai.com\` wildcard):** @@ -250,7 +252,7 @@ For NEW repos / branches: \`gitea_repos_list\`, \`gitea_repo_get\`, \`gitea_repo ## Troubleshooting - **Dev container stuck provisioning (>120s)**: \`devcontainer_status\` returns \`likelyFailed: true\` and a \`coolifyStatus\` field with Coolify's view. If \`blockedReason\` is set, TELL THE USER the specific reason ("SSH not configured", "Coolify deploy failed: image pull error") instead of continuing to poll. Do NOT loop on \`devcontainer_status\` — a stuck container will NOT self-heal. If the status says "failed" or "error", advise the user to check their Coolify dashboard or regenerate the project. -- "exited (1)" / deploy stuck → \`apps_logs { uuid }\` + \`apps_containers_list { uuid }\`. Usual: missing env, wrong port, image pull fail. +- "exited (1)" / deploy stuck → \`apps_logs { uuid }\` + \`apps_containers_ps { uuid }\`. Usual: missing env, wrong port, image pull fail. - 502 / "no available server" → \`apps_get\`; if \`fqdn\` is empty, attach a domain. - "tenant" / "does not belong to" → uuid not in this workspace. Re-list with \`apps_list\`. - Compose stack weird → \`apps_repair { uuid }\` re-applies Traefik labels + port forwarding. diff --git a/vibn-frontend/app/api/mcp/route.ts b/vibn-frontend/app/api/mcp/route.ts index 417011f..186371a 100644 --- a/vibn-frontend/app/api/mcp/route.ts +++ b/vibn-frontend/app/api/mcp/route.ts @@ -4491,28 +4491,44 @@ function shq(s: string): string { return `'${s.replace(/'/g, `'\\''`)}'`; } -function normalizeFsPath(p: string): string | NextResponse { +function normalizeFsPath( + p: string, + projectSlug?: string, +): string | NextResponse { if (!p || typeof p !== "string") { return NextResponse.json( { error: 'Param "path" is required' }, { status: 400 }, ); } + const projectRoot = projectSlug ? `${FS_ROOT}/${projectSlug}` : FS_ROOT; let abs: string; if (p.startsWith("/")) { abs = p; } else { - abs = `${FS_ROOT}/${p}`.replace(/\/+/g, "/"); + abs = `${projectRoot}/${p}`.replace(/\/+/g, "/"); } - // Disallow .. traversal that escapes /workspace. const norm = abs.replace(/\/[^/]+\/\.\.(?=\/|$)/g, "").replace(/\/+/g, "/"); - if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) { - return NextResponse.json( - { - error: `Path "${p}" is outside ${FS_ROOT}; use shell.exec for system paths.`, - }, - { status: 400 }, - ); + + // When projectSlug is set, REJECT paths outside the project root. + if (projectSlug) { + if (!norm.startsWith(projectRoot) && norm !== projectRoot) { + return NextResponse.json( + { + ok: false, + error: `PATH_OUTSIDE_PROJECT: path "${p}" resolves to "${norm}" which is outside the active project at "${projectRoot}". Did you mean "${projectRoot}/${p.replace(/^\/+/, "")}"?`, + }, + { status: 400 }, + ); + } + } else { + // Workspace-level fallback (legacy behaviour) + if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) { + return NextResponse.json( + { error: `Path "${p}" is outside ${FS_ROOT}; use shell.exec for system paths.` }, + { status: 400 }, + ); + } } return norm; } @@ -4627,7 +4643,7 @@ async function toolFsRead(principal: Principal, params: Record) { if (guard) return guard; const project = await resolveProjectOr404(principal, params); if (project instanceof NextResponse) return project; - const path = normalizeFsPath(String(params.path ?? "")); + const path = normalizeFsPath(String(params.path ?? ""), project.slug); if (path instanceof NextResponse) return path; const offset = Number.isFinite(Number(params.offset)) @@ -4873,7 +4889,7 @@ async function toolFsWrite(principal: Principal, params: Record) { if (guard) return guard; const project = await resolveProjectOr404(principal, params); if (project instanceof NextResponse) return project; - const path = normalizeFsPath(String(params.path ?? "")); + const path = normalizeFsPath(String(params.path ?? ""), project.slug); if (path instanceof NextResponse) return path; const content = typeof params.content === "string" ? params.content : ""; @@ -4890,8 +4906,11 @@ async function toolFsWrite(principal: Principal, params: Record) { { status: 500 }, ); } + const { createHash } = require('crypto'); + const bytes = Buffer.byteLength(content, "utf8"); + const sha256 = createHash("sha256").update(content, "utf8").digest("hex"); return NextResponse.json({ - result: { path, bytesWritten: Buffer.byteLength(content, "utf8") }, + result: { ok: true, path, bytes, sha256 }, }); } @@ -4900,7 +4919,7 @@ async function toolFsEdit(principal: Principal, params: Record) { if (guard) return guard; const project = await resolveProjectOr404(principal, params); if (project instanceof NextResponse) return project; - const path = normalizeFsPath(String(params.path ?? "")); + const path = normalizeFsPath(String(params.path ?? ""), project.slug); if (path instanceof NextResponse) return path; const newString = @@ -4966,7 +4985,7 @@ print(n)`; const b64 = Buffer.from(JSON.stringify(payload), "utf8").toString("base64"); const pyB64 = Buffer.from(py, "utf8").toString("base64"); - const cmd = `python3 -c "$(printf %s ${shq(pyB64)} | base64 -d)" <<< "$(printf %s ${shq(b64)} | base64 -d)"`; + const cmd = `python3 -c "$(printf %s ${shq(pyB64)} | base64 -d)" <<< "$(printf %s ${shq(b64)} | base64 -d)" && echo "---" && sha256sum ${shq(path)} | cut -d' ' -f1 && wc -c < ${shq(path)}`; const r = await runFsCmd(principal, project, cmd); if (r.code !== 0) { @@ -4979,8 +4998,18 @@ print(n)`; { status }, ); } + const stdoutParts = r.stdout.split('---'); + const replacementsStr = stdoutParts[0].trim(); + const hashAndSize = stdoutParts[1] ? stdoutParts[1].trim().split('\n') : []; + return NextResponse.json({ - result: { path, replacements: parseInt(r.stdout.trim() || "0", 10) }, + result: { + ok: true, + path, + replacements: parseInt(replacementsStr || "0", 10), + sha256: hashAndSize[0] ? hashAndSize[0].trim() : undefined, + bytes: hashAndSize[1] ? parseInt(hashAndSize[1].trim(), 10) : undefined + }, }); } @@ -4989,7 +5018,7 @@ async function toolFsList(principal: Principal, params: Record) { if (guard) return guard; const project = await resolveProjectOr404(principal, params); if (project instanceof NextResponse) return project; - const path = normalizeFsPath(String(params.path ?? "/workspace")); + const path = normalizeFsPath(String(params.path ?? "/workspace"), project.slug); if (path instanceof NextResponse) return path; const cmd = `cd ${shq(path)} && ls -lA --time-style=long-iso 2>&1 | head -200`; const r = await runFsCmd(principal, project, cmd); @@ -5003,7 +5032,7 @@ async function toolFsDelete(principal: Principal, params: Record) { if (guard) return guard; const project = await resolveProjectOr404(principal, params); if (project instanceof NextResponse) return project; - const path = normalizeFsPath(String(params.path ?? "")); + const path = normalizeFsPath(String(params.path ?? ""), project.slug); if (path instanceof NextResponse) return path; const recursive = Boolean(params.recursive); // Belt-and-suspenders: never let `rm -rf /workspace` itself slip through. @@ -5036,7 +5065,7 @@ async function toolFsGlob(principal: Principal, params: Record) { { status: 400 }, ); } - const cwd = normalizeFsPath(String(params.cwd ?? "/workspace")); + const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"), project.slug); if (cwd instanceof NextResponse) return cwd; // ripgrep --files --glob is faster + smarter than `find` and respects .gitignore. const cmd = `cd ${shq(cwd)} && rg --files --glob ${shq(pattern)} | head -500`; @@ -5062,7 +5091,7 @@ async function toolFsGrep(principal: Principal, params: Record) { { status: 400 }, ); } - const cwd = normalizeFsPath(String(params.cwd ?? "/workspace")); + const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"), project.slug); if (cwd instanceof NextResponse) return cwd; const glob = typeof params.glob === "string" && params.glob.trim() @@ -6005,7 +6034,7 @@ async function toolFsTree(principal: Principal, params: Record) { if (guard) return guard; const project = await resolveProjectOr404(principal, params); if (project instanceof NextResponse) return project; - const path = normalizeFsPath(String(params.path ?? "/workspace")); + const path = normalizeFsPath(String(params.path ?? "/workspace"), project.slug); if (path instanceof NextResponse) return path; // Use find to generate a tree structure, ignoring node_modules and .git