fix(ai): implement Phase 2 and 3 prompt recommendations from review

2026-05-19 13:47:18 -07:00
parent 618f7796b2
commit bbcd4ad55e
6 changed files with 857 additions and 22 deletions
--- a/fix_mcp_fs.js
+++ b/fix_mcp_fs.js
@@ -0,0 +1,48 @@
 const fs = require('fs');
 const file = 'vibn-frontend/app/api/mcp/route.ts';
 let code = fs.readFileSync(file, 'utf8');
 // 1. Fix toolFsWrite
 const oldFsWriteReturn = `  return NextResponse.json({
    result: { path, bytesWritten: Buffer.byteLength(content, "utf8") },
  });`;
 const newFsWriteReturn = `  const { createHash } = require('crypto');
  const bytes = Buffer.byteLength(content, "utf8");
  const sha256 = createHash("sha256").update(content, "utf8").digest("hex");
  return NextResponse.json({
    result: { ok: true, path, bytes, sha256 },
  });`;
 code = code.replace(oldFsWriteReturn, newFsWriteReturn);
 // 2. Fix toolFsEdit
 const oldFsEditCmd = `const cmd = \`python3 -c "$(printf %s \${shq(pyB64)} | base64 -d)" <<< "$(printf %s \${shq(b64)} | base64 -d)"\`;`;
 const newFsEditCmd = `const cmd = \`python3 -c "$(printf %s \${shq(pyB64)} | base64 -d)" <<< "$(printf %s \${shq(b64)} | base64 -d)" && echo "---" && sha256sum \${shq(path)} | cut -d' ' -f1 && wc -c < \${shq(path)}\`;`;
 code = code.replace(oldFsEditCmd, newFsEditCmd);
 const oldFsEditReturn = `  return NextResponse.json({
    result: { path, replacements: parseInt(r.stdout.trim() || "0", 10) },
  });`;
 const newFsEditReturn = `  const stdoutParts = r.stdout.split('---');
  const replacementsStr = stdoutParts[0].trim();
  const hashAndSize = stdoutParts[1] ? stdoutParts[1].trim().split('\\n') : [];
  return NextResponse.json({
    result: { 
      ok: true, 
      path, 
      replacements: parseInt(replacementsStr || "0", 10),
      sha256: hashAndSize[0] ? hashAndSize[0].trim() : undefined,
      bytes: hashAndSize[1] ? parseInt(hashAndSize[1].trim(), 10) : undefined
    },
  });`;
 code = code.replace(oldFsEditReturn, newFsEditReturn);
 fs.writeFileSync(file, code);
 console.log("Patched toolFsWrite and toolFsEdit");
--- a/fix_mcp_fs_normalize.js
+++ b/fix_mcp_fs_normalize.js
@@ -0,0 +1,82 @@
 const fs = require('fs');
 const file = 'vibn-frontend/app/api/mcp/route.ts';
 let code = fs.readFileSync(file, 'utf8');
 const oldNormalize = `function normalizeFsPath(p: string): string | NextResponse {
  if (!p || typeof p !== "string") {
    return NextResponse.json(
      { error: 'Param "path" is required' },
      { status: 400 },
    );
  }
  let abs: string;
  if (p.startsWith("/")) {
    abs = p;
  } else {
    abs = \`\${FS_ROOT}/\${p}\`.replace(/\\/+/g, "/");
  }
  // Disallow .. traversal that escapes /workspace.
  const norm = abs.replace(/\\/[^/]+\\/\\.\\.(?=\\/|$)/g, "").replace(/\\/+/g, "/");
  if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) {
    return NextResponse.json(
      {
        error: \`Path "\${p}" is outside \${FS_ROOT}; use shell.exec for system paths.\`,
      },
      { status: 400 },
    );
  }
  return norm;
 }`;
 const newNormalize = `function normalizeFsPath(
  p: string,
  projectSlug?: string,
 ): string | NextResponse {
  if (!p || typeof p !== "string") {
    return NextResponse.json(
      { error: 'Param "path" is required' },
      { status: 400 },
    );
  }
  const projectRoot = projectSlug ? \`\${FS_ROOT}/\${projectSlug}\` : FS_ROOT;
  let abs: string;
  if (p.startsWith("/")) {
    abs = p;
  } else {
    abs = \`\${projectRoot}/\${p}\`.replace(/\\/+/g, "/");
  }
  const norm = abs.replace(/\\/[^/]+\\/\\.\\.(?=\\/|$)/g, "").replace(/\\/+/g, "/");
  // When projectSlug is set, REJECT paths outside the project root.
  if (projectSlug) {
    if (!norm.startsWith(projectRoot) && norm !== projectRoot) {
      return NextResponse.json(
        {
          ok: false,
          error: \`PATH_OUTSIDE_PROJECT: path "\${p}" resolves to "\${norm}" which is outside the active project at "\${projectRoot}". Did you mean "\${projectRoot}/\${p.replace(/^\\/+/, "")}"?\`,
        },
        { status: 400 },
      );
    }
  } else {
    // Workspace-level fallback (legacy behaviour)
    if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) {
      return NextResponse.json(
        { error: \`Path "\${p}" is outside \${FS_ROOT}; use shell.exec for system paths.\` },
        { status: 400 },
      );
    }
  }
  return norm;
 }`;
 code = code.replace(oldNormalize, newNormalize);
 code = code.replace(/const path = normalizeFsPath\(String\(params\.path \?\? ""\)\);/g, 'const path = normalizeFsPath(String(params.path ?? ""), project.slug);');
 code = code.replace(/const path = normalizeFsPath\(String\(params\.path \?\? "\/workspace"\)\);/g, 'const path = normalizeFsPath(String(params.path ?? "/workspace"), project.slug);');
 code = code.replace(/const cwd = normalizeFsPath\(String\(params\.cwd \?\? "\/workspace"\)\);/g, 'const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"), project.slug);');
 code = code.replace(/const targetPath = normalizeFsPath\(String\(params\.targetPath \?\? ""\)\);/g, 'const targetPath = normalizeFsPath(String(params.targetPath ?? ""), project.slug);');
 fs.writeFileSync(file, code);
 console.log("Patched normalizeFsPath with projectSlug scoping");
--- a/may-19-ai-review.md
+++ b/may-19-ai-review.md
@@ -0,0 +1,601 @@
 # Vibn Chat Harness — Fix Checklist
 Work through items in order. Each fix has a clear **What**, **Where**,
 **How**, and **Verify** section. Don't skip the verify step — many of
 these fixes interact with each other and silent failures will
 compound.
 Mark `[x]` as you complete each item. If you can't complete an item,
 add a short note under it explaining why and move on.
 ---
 ## Phase 1 — Backend fixes (highest leverage; do these first)
 These three fix the failure modes the prompt currently promises but
 the backend doesn't deliver. Until they're done, the prompt's hard
 rules are partly fiction.
 ### [ ] 1. Add `sha256` and `bytes` to `fs.write` and `fs.edit` responses
 **What:** The prompt's hard rules tell the model to cite `sha256` and
 `bytes` as evidence of file changes. The tools don't return those
 fields today, so the model is looking for evidence that doesn't exist.
 **Where:** `app/api/mcp/route.ts` — functions `toolFsWrite` and
 `toolFsEdit`.
 **How:**
 - In `toolFsWrite`, after the `runFsCmd` success branch, before
  returning, compute the sha256 of `content` and return it alongside
  `bytesWritten` renamed to `bytes`:
  ```ts
  import { createHash } from "crypto";
  // ...
  const bytes = Buffer.byteLength(content, "utf8");
  const sha256 = createHash("sha256").update(content, "utf8").digest("hex");
  return NextResponse.json({
    result: { ok: true, path, bytes, sha256 },
  });
  ```
 - In `toolFsEdit`, you don't have the final content in memory. Add a
  second command that prints the sha + bytes after the edit:
  ```ts
  const cmd = `python3 -c "$(printf %s ${shq(pyB64)} | base64 -d)" <<< "$(printf %s ${shq(b64)} | base64 -d)" && echo "---" && sha256sum ${shq(path)} | cut -d' ' -f1 && wc -c < ${shq(path)}`;
  ```
  Then parse the trailing two lines after `---` for sha and bytes.
 - Update the response shape:
  ```ts
  return NextResponse.json({
    result: { ok: true, path, replacements, bytes, sha256 },
  });
  ```
 **Verify:**
 - [ ] Call `fs_write` with `{ path: "test.txt", content: "hello" }`.
  Confirm response contains `sha256` (64 hex chars) and `bytes: 5`.
 - [ ] Call `fs_edit` to change the same file. Confirm response
  contains a new `sha256` and updated `bytes`.
 - [ ] Replay a turn that does `fs_write` followed by `fs_read` of the
  same file in chat. The model should now produce text like
  "Updated `test.txt` (sha=a3f5c2…, 5b)" instead of a bare claim.
 ---
 ### [ ] 2. Add project-slug scoping to `normalizeFsPath`
 **What:** The prompt tells the model to use paths like `src/app/page.tsx`
 and claims the tool layer rejects writes outside the project root.
 The tool layer does NOT do this today. It resolves all relative paths
 under `/workspace` (workspace-level), so `fs_write { path: "src/app/page.tsx" }`
 ends up at `/workspace/src/app/page.tsx` — the ghost file from the
 failing session. Five different path conventions were used for the
 same file in one session because nothing enforces the rule.
 **Where:** `app/api/mcp/route.ts` — function `normalizeFsPath` and
 every caller in `toolFsRead`, `toolFsWrite`, `toolFsEdit`,
 `toolFsList`, `toolFsDelete`, `toolFsGlob`, `toolFsGrep`, `toolFsTree`,
 `toolRequestVisualQA`, `toolGenerateMedia`.
 **How:**
 - Change `normalizeFsPath` to accept an optional `projectSlug`:
  ```ts
  function normalizeFsPath(
    p: string,
    projectSlug?: string,
  ): string | NextResponse {
    if (!p || typeof p !== "string") {
      return NextResponse.json(
        { error: 'Param "path" is required' },
        { status: 400 },
      );
    }
    const projectRoot = projectSlug ? `${FS_ROOT}/${projectSlug}` : FS_ROOT;
    let abs: string;
    if (p.startsWith("/")) {
      abs = p;
    } else {
      abs = `${projectRoot}/${p}`.replace(/\/+/g, "/");
    }
    const norm = abs.replace(/\/[^/]+\/\.\.(?=\/|$)/g, "").replace(/\/+/g, "/");
    // When projectSlug is set, REJECT paths outside the project root.
    if (projectSlug) {
      if (!norm.startsWith(projectRoot) && norm !== projectRoot) {
        return NextResponse.json(
          {
            ok: false,
            error: `PATH_OUTSIDE_PROJECT: path "${p}" resolves to "${norm}" which is outside the active project at "${projectRoot}". Did you mean "${projectRoot}/${p.replace(/^\/+/, "")}"?`,
          },
          { status: 400 },
        );
      }
    } else {
      // Workspace-level fallback (legacy behaviour)
      if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) {
        return NextResponse.json(
          { error: `Path "${p}" is outside ${FS_ROOT}; use shell.exec for system paths.` },
          { status: 400 },
        );
      }
    }
    return norm;
  }
  ```
 - In every fs_* tool that already calls `resolveProjectOr404`, pass
  the slug:
  ```ts
  const path = normalizeFsPath(String(params.path ?? ""), project.slug);
  ```
 - `toolFsRead`, `toolFsWrite`, `toolFsEdit`, `toolFsDelete`,
  `toolRequestVisualQA`, `toolGenerateMedia` all already have
  `project` in scope — pass `project.slug`.
 - `toolFsList`, `toolFsGlob`, `toolFsGrep`, `toolFsTree` use
  `params.path` or `params.cwd` — same treatment, pass `project.slug`.
 **Verify:**
 - [ ] From a project-scoped thread, call `fs_write { path: "/workspace/src/app/page.tsx", content: "x" }`.
  Expect `PATH_OUTSIDE_PROJECT` error.
 - [ ] From the same thread, call `fs_write { path: "src/app/page.tsx", content: "x" }`.
  Expect success at `/workspace/<slug>/src/app/page.tsx`.
 - [ ] Confirm `dev_server_start` with `command: "npm run dev"` runs
  from the project root, not `/workspace`. (This is mostly already
  true via dev-container logic; just confirm.)
 ---
 ### [ ] 3. Fix the broken `plan-extract` block
 **What:** The fire-and-forget `plan-extract` block in
 `app/api/chat/route.ts` has a syntax error — the `try` block builds a
 transcript and then hits `}` followed by `catch` with no actual call
 to `autoExtractPlanUpdates`. The body of the try is missing. Either
 the auto-extraction was intentionally removed (in which case the
 dead transcript-building code should also be deleted) or it was
 accidentally truncated (in which case the call needs to be restored).
 **Where:** `app/api/chat/route.ts`, around the second fire-and-forget
 block (after the title/summary block, before `emit({ type: "done" })`).
 **How:**
 - Decide first: do we want auto-extraction to run? If YES, restore
  the call:
  ```ts
  (async () => {
    try {
      if (!threadProjectId) return;
      const allMessages = [...history, finalMsg];
      if (allMessages.length < 2) return;
      const transcript = allMessages
        .map((m) => {
          const text =
            typeof m.content === "string"
              ? m.content
              : JSON.stringify(m.content);
          return `${m.role.toUpperCase()}: ${text.slice(0, 1200)}`;
        })
        .join("\n\n");
      const result = await autoExtractPlanUpdates(
        threadProjectId,
        transcript,
      );
      if (result) {
        console.log(
          "[chat] plan-extract:",
          `${result.tasks} tasks, ${result.decisions} decisions, vision=${result.vision}`,
        );
      }
    } catch (err) {
      console.warn("[chat] plan-extract failed (non-fatal):", err);
    }
  })().catch(() => {});
  ```
  And re-add the `import { autoExtractPlanUpdates } from "@/lib/ai/plan-extract";`
  at the top of the file.
 - If NO (you removed it intentionally), delete the entire IIFE
  including the transcript-building so the file compiles cleanly.
 **Verify:**
 - [ ] Run `tsc --noEmit` on the file. Confirm no syntax errors.
 - [ ] If auto-extraction restored: have a chat that mentions a
  decision (e.g. "let's use Postgres"). Confirm a new entry appears
  in the project's `plan.decisions` with `confidence: "auto"`.
 - [ ] Tail prod logs for `[chat] plan-extract:` — should fire on
  every turn with content.
 ---
 ## Phase 2 — Prompt fixes (now that the backend matches)
 These bring the prompt into line with what the tools actually do.
 ### [ ] 4. Fix the `apps_containers_list` typo in the prompt
 **What:** The troubleshooting section references `apps_containers_list`
 but the actual tool is `apps_containers_ps`. The model will call a
 tool that doesn't exist.
 **Where:** `app/api/chat/route.ts`, inside `buildSystemPrompt`, in the
 "## Troubleshooting" section.
 **How:**
 - Find: `apps_logs { uuid } + apps_containers_list { uuid }`
 - Replace: `apps_logs { uuid } + apps_containers_ps { uuid }`
 **Verify:**
 - [ ] Grep the prompt for `apps_containers_list` — no matches.
 - [ ] Grep for `apps_containers_ps` — should appear in
  troubleshooting and at least once in the apps section.
 ---
 ### [ ] 5. Soften the `ok` field rule
 **What:** The current rule says "If `ok` is false (or absent, or
 `exitCode` is non-zero, or `healthCheck.status` is >= 400) the
 operation FAILED." The "or absent" clause is wrong — many tools
 return data without an `ok` field (e.g. `projects_get`, `apps_list`,
 `databases_get`). The model will treat every read as a failure.
 **Where:** `buildSystemPrompt`, "Hard rules" section, the "Trust the
 `ok` field" bullet.
 **How:**
 Replace the current rule with:
 ```
 - **Read tool results carefully.** A tool FAILED when ANY of these
  signals are present: `ok: false`, `error: "..."`, a non-zero
  `exitCode`, or a `healthCheck.status` >= 400. If NONE of those
  signals are present, look at the actual content of the response
  to decide whether the operation succeeded. Many read-only tools
  return data directly without an `ok` field — that's not a failure.
 ```
 **Verify:**
 - [ ] Pick a recent thread where the agent called `projects_get` or
  `apps_list`. Confirm the agent didn't treat the response as a
  failure (look at its post-tool text — should be a normal summary,
  not "the operation failed").
 ---
 ### [ ] 6. Tighten the status-nudge threshold
 **What:** Current thresholds are `roundsSinceText >= 8 ||
 toolCallsSinceText >= 12`. With `MAX_TOOL_ROUNDS = 8`, the round-based
 nudge can never fire (loop ends first). The tool-call threshold of 12
 is also too lenient — users typed "test" / "hello" by round 4-5 of
 silence in the failing session.
 **Where:** `app/api/chat/route.ts`, near the top of the main while
 loop, the `isSilent` constant.
 **How:**
 ```ts
 const isSilent = roundsSinceText >= 3 || toolCallsSinceText >= 6;
 ```
 **Verify:**
 - [ ] Replay a chat that triggers 6+ tool calls without text. Confirm
  the `[STATUS NUDGE]` system addendum is injected before the next
  round.
 - [ ] Confirm the model produces a one-line status sentence in
  response to the nudge.
 ---
 ### [ ] 7. Update the path-convention guidance in the prompt
 **What:** After Fix 2 ships, the path convention is now enforced. The
 prompt should state this plainly without the "cd into your project"
 workaround.
 **Where:** `buildSystemPrompt`, inside `activeBlock`, the path
 guidance section. Also inside "Dev servers" → "Directory" bullet.
 **How:**
 Replace the "Directory" bullet under "Dev servers":
 ```
 - **Directory:** Tool paths are scoped to your project root
  automatically. Pass `command: "npm run dev"` directly — no `cd`
  prefix needed. The tool rejects any `fs_*` write outside
  `/workspace/<slug>/`.
 ```
 And after the "Project repo is auto-cloned" paragraph in
 `activeBlock`, add:
 ```
 **Path convention for fs_* tools:** Pass paths relative to the
 project root — `src/app/page.tsx`, NOT `/workspace/<slug>/src/app/page.tsx`
 and NOT `<slug>/src/app/page.tsx`. The tool layer rejects writes
 outside the project root with a `PATH_OUTSIDE_PROJECT` error
 suggesting the corrected path.
 ```
 **Verify:**
 - [ ] In a fresh chat, ask the agent to "edit the homepage". Confirm
  the first `fs_read` call uses `src/app/page.tsx` (no slug prefix,
  no `/workspace/` prefix).
 ---
 ## Phase 3 — Polish and safety nets
 These are lower-priority but each removes a small foot-gun.
 ### [ ] 8. Add `fs_tree` recommendation to first-turn behavior
 **What:** The agent ran `fs_tree` 5 times and `fs_glob` 9+ times in
 the failing session, re-discovering paths it should have learned once.
 The tool description already says "ALWAYS call this first" but the
 prompt doesn't reinforce it.
 **Where:** `buildSystemPrompt`, in the "Writing code" section.
 **How:**
 Add this near the top of "Writing code — dev container is the default":
 ```
 **Orient yourself once.** On the first code-modifying turn of a
 chat, call `fs_tree` once to learn the repo layout. Don't re-run it
 on every turn — the layout doesn't change between user messages.
 ```
 **Verify:**
 - [ ] Manual review of the next 5 sessions: confirm `fs_tree` is
  called at most once per chat (not per turn).
 ---
 ### [ ] 9. Add `browser_navigate` and `browser_console` as verification primitives
 **What:** The backend has `browser_navigate` and `browser_console`
 tools that headlessly render a page and capture console errors. The
 prompt never mentions them. These are the missing post-deploy
 verification step that the `healthCheck` field gestures at.
 **Where:** `buildSystemPrompt`, in the "Dev servers" subsection or as
 its own section after "Visual QA".
 **How:**
 Add a new bullet right after "Visual QA" guidance:
 ```
 **Verify the page actually renders:**
 - After `dev_server_start` returns a `previewUrl` AND `healthCheck.status === 200`,
  for any UI-facing turn, call `browser_console { url: previewUrl }` to
  capture frontend console errors. Hydration errors, missing assets,
  and uncaught exceptions show up here even when the server is
  technically "running".
 - If `browser_console` returns errors, fix them with `fs_edit`
  before declaring done. A green `healthCheck` plus a clean console
  is the real "done" signal for UI work.
 - Skip this for backend / SQL / config-only changes.
 ```
 **Verify:**
 - [ ] On the next UI-modifying chat, confirm `browser_console` is
  called once after `dev_server_start`.
 - [ ] Confirm any errors it returns get acknowledged in the agent's
  reply.
 ---
 ### [ ] 10. Add the market research stack to the prompt
 **What:** The backend exposes six market research tools
 (`market_categories_suggest`, `market_research_run`, `market_seo_analyze`,
 `tech_stack_analyze`, `market_competitor_research`,
 `market_aggregate_insights`). The prompt never mentions them. For a
 non-technical-founder product, these are some of the highest-leverage
 tools — they answer "should I build for dentists or summer camps?"
 with real TAM counts.
 **Where:** `buildSystemPrompt`, add a new section after "Common
 questions → tools" or before "How to deploy".
 **How:**
 Add this section:
 ```
 ## Helping the user pick what to build
 Vibn has a market-research toolkit for non-technical founders who
 need data on their target niche. Use it when the user is undecided,
 validating an idea, or comparing markets:
 - **"How big is the market for X in <location>?"** → `market_categories_suggest { niche }` to
  propose Google Business categories, then `market_research_run` after
  the user approves. Returns TAM count, sample domains, and review
  data. NOTE: `market_research_run` costs real money — always confirm
  with the user and pass `user_explicitly_approved: true`.
 - **"What are competitors spending on Google Ads?"** → `market_seo_analyze { domain }`.
  Returns organic traffic, paid traffic, ad spend, and top paid
  keywords. Use to tell the user how aggressive a market is.
 - **"What software do these businesses already use?"** → `tech_stack_analyze { urls, software_category_id }`.
  Detects WordPress, Shopify, named competitors, and any custom
  domains/scripts you pass. Use to find "X businesses use WordPress
  but lack Y" market gaps.
 - **"What are customers complaining about?"** → `market_aggregate_insights { category, location }`.
  Returns top review topics — use the actual words customers use as
  marketing copy and value-prop seeds.
 - **"Who are the players in this niche?"** → `market_competitor_research { niche }`.
  Returns proprietary competitors with pricing AND open-source
  alternatives that could be forked.
 These are conversational research tools — they don't build anything.
 Use them BEFORE scaffolding when the user is exploring direction;
 SKIP them once the user has committed to building.
 ```
 **Verify:**
 - [ ] In a fresh chat, ask the agent "should I build for dentists or
  summer camps?". Confirm it proposes using `market_categories_suggest`
  or `market_aggregate_insights` rather than guessing.
 ---
 ### [ ] 11. Surface `apps_exec`, `auth_create`, `generate_media`, `storage_*` briefly
 **What:** Several capabilities the agent has are completely absent
 from the prompt. The agent doesn't know it can run commands inside
 production containers, deploy real auth servers, generate images, or
 wire S3 storage. One-line mentions are enough.
 **Where:** `buildSystemPrompt`, scattered across existing sections.
 **How:**
 - Under "Common questions → tools", add:
  ```
  - "Run a migration / psql in prod" → `apps_exec { uuid, command }`.
  - "Generate a hero image / illustration" → `generate_media { prompt, type, outputPath }`.
  - "Wire up file storage / uploads" → `storage_provision` (if not already), then `storage_inject_env { uuid }`.
  ```
 - In the "Decision defaults" section, augment the auth bullet:
  ```
  - **Auth:** NextAuth with email magic-link for in-app auth.
    Deploy a separate Pocketbase / Authentik / Keycloak service via
    `auth_create { provider }` only if the user needs SSO, multi-app
    SSO, or admin user management.
  ```
 **Verify:**
 - [ ] Ask the agent "can you run a SQL migration on prod?". Confirm
  it references `apps_exec`.
 - [ ] Ask "I need a hero image for the landing page". Confirm it
  references `generate_media`.
 ---
 ### [ ] 12. Flip the `fs_edit` preference to match the tool's documentation
 **What:** The tool description says `startLine`/`endLine` is
 preferred and `oldString` is the fallback. The prompt currently says
 the opposite ("prefer `oldString` for small replacements"). The tool
 is authoritative — match it.
 **Where:** `buildSystemPrompt`, in the "Iterate" bullet under "Writing
 code", the `fs_edit` guidance.
 **How:**
 Replace the current `fs_edit` guidance with:
 ```
 - `fs_read` / `fs_write` / `fs_edit { path, oldString, newString, startLine, endLine }`.
  **For `fs_edit`:** prefer `startLine`/`endLine` (deterministic; never
  fails on duplicate strings). Use `oldString` only when you cannot
  read the file first to get line numbers — and when you do, include
  2-3 lines of surrounding context for uniqueness. If `fs_edit` keeps
  failing, do NOT escape to `shell_exec` with patch scripts — read
  the file fresh with `fs_read`, get the line numbers, and try again.
 ```
 **Verify:**
 - [ ] Confirm next 3 `fs_edit` calls in the wild use `startLine`/`endLine`,
  not `oldString`.
 ---
 ## Phase 4 — Monitoring (after Phases 1-3 land)
 Once the changes are in production, watch these for a week. Tune if
 the numbers don't move.
 ### [ ] 13. Track the recovery-summary fire rate
 **What:** The `needsRecovery` path runs when a turn ends badly (hit
 the round cap, hit a loop, or last tool returned failure). It should
 fire on <10% of turns. If it fires more often, the cap is too low or
 the model is hitting real bugs.
 **How:** Add a metric. In the `needsRecovery` block, before calling
 `callVibnChat`, emit:
 ```ts
 console.log("[chat] recovery_fired", {
  turnId,
  reason: loopBreakReason ? "loop"
    : round >= MAX_TOOL_ROUNDS ? "round_cap"
    : assistantText.trim().length === 0 ? "no_text"
    : "tool_failure",
  toolCalls: assistantToolCalls.length,
 });
 ```
 **Verify:**
 - [ ] Aggregate over 1 week. If `round_cap` is the dominant reason
  and the turns look legitimate, raise `MAX_TOOL_ROUNDS` to 10 or 12.
 - [ ] If `loop` is dominant, the fingerprinter may need tuning.
 ---
 ### [ ] 14. Track conversational-guard fire rate
 **What:** The first-turn conversational guard (regex match on user
 message → no tools on round 1) is the biggest single behavioral
 change in this revision. We want to know how often it fires and
 whether it ever causes a problem.
 **How:** Before the loop, after computing `firstMessageIsConversational`:
 ```ts
 console.log("[chat] turn_start", {
  turnId,
  firstMessageIsConversational,
  messagePreview: message.trim().slice(0, 80),
 });
 ```
 **Verify:**
 - [ ] Aggregate over 1 week. Should fire on ~30-40% of first messages
  in a chat.
 - [ ] Spot-check 10 cases where it fired: confirm none were
  legitimate "build me X" requests being miscategorized.
 ---
 ### [ ] 15. Track `PATH_OUTSIDE_PROJECT` rejections
 **What:** After Fix 2 ships, this error tells you how often the model
 was about to write to the wrong path. Should taper toward zero as the
 prompt guidance + enforcement settles in.
 **How:** Server-side log in `normalizeFsPath` when the project-scoped
 rejection fires.
 **Verify:**
 - [ ] Week 1: count rejections per day.
 - [ ] Week 2: count should be lower (model has internalized the rule
  via the error messages it's seen).
 ---
 ## What this checklist deliberately doesn't include
 A few things from earlier reviews that I'm intentionally leaving off:
 - **Phase-aware behavior.** Phases were removed from the product; the
  prompt no longer references them. No work needed.
 - **Codebase summary auto-generation.** Lower priority once Fix 2
  ships (paths can no longer drift) and Fix 8 lands (one `fs_tree`
  per chat instead of nine).
 - **Tool history reconstruction for DeepSeek compatibility.** Already
  shipped in the current code via the `_rawToolResults` compact
  summary. No additional work.
 Don't add work for these unless a clear failure mode appears in
 production after Phases 1-3 land.
--- a/patch_prompts.js
+++ b/patch_prompts.js
@@ -0,0 +1,73 @@
 const fs = require('fs');
 const file = 'vibn-frontend/app/api/chat/route.ts';
 let code = fs.readFileSync(file, 'utf8');
 // Fix 4: apps_containers_list -> apps_containers_ps
 code = code.replace(/apps_containers_list \{ uuid \}/g, 'apps_containers_ps { uuid }');
 // Fix 5: Soften ok field rule
 const oldOkRule = `- **Trust the \\\`ok\\\` field.** Every tool result carries \\\`ok: true | false\\\`. If \\\`ok\\\` is false (or \\\`exitCode\\\` is non-zero, or \\\`healthCheck.status\\\` is >= 400), the operation FAILED. Do not describe a failed operation as successful. Surface the error verbatim and propose a next step.`;
 const newOkRule = `- **Read tool results carefully.** A tool FAILED when ANY of these signals are present: \\\`ok: false\\\`, \\\`error: "..."\\\`, a non-zero \\\`exitCode\\\`, or a \\\`healthCheck.status\\\` >= 400. If NONE of those signals are present, look at the actual content of the response to decide whether the operation succeeded. Many read-only tools return data directly without an \\\`ok\\\` field — that's not a failure.`;
 code = code.replace(oldOkRule, newOkRule);
 // Fix 7: Path conventions
 const oldDirRule = `- **Directory:** the tool resolves paths relative to the active project root — you can pass \\\`command: "npm run dev"\\\` directly. (If you need to manually \\\`cd\\\`, use the project slug.)`;
 const newDirRule = `- **Directory:** Tool paths are scoped to your project root automatically. Pass \\\`command: "npm run dev"\\\` directly — no \\\`cd\\\` prefix needed. The tool rejects any \\\`fs_*\\\` write outside \\\`/workspace/<slug>/\\\`.`;
 code = code.replace(oldDirRule, newDirRule);
 const oldPathConv = `**fs_* path convention for this project:** Pass paths relative to \\\`/workspace/\${activeProject.slug ?? "<slug>"}/\\\` — e.g. \\\`src/app/page.tsx\\\`, not \\\`/workspace/\${activeProject.slug ?? "<slug>"}/src/app/page.tsx\\\` and not \\\`getacquired-style/src/app/page.tsx\\\`. The tool layer rejects paths outside the project root.`;
 const newPathConv = `**Path convention for fs_* tools:** Pass paths relative to the project root — \\\`src/app/page.tsx\\\`, NOT \\\`/workspace/\${activeProject.slug ?? "<slug>"}/src/app/page.tsx\\\` and NOT \\\`\${activeProject.slug ?? "<slug>"}/src/app/page.tsx\\\`. The tool layer rejects writes outside the project root with a \\\`PATH_OUTSIDE_PROJECT\\\` error suggesting the corrected path.`;
 code = code.replace(oldPathConv, newPathConv);
 // Fix 8: fs_tree recommendation
 const devContainerStart = `**Start a coding session:** \\\`devcontainer_ensure { projectId }\\\` (idempotent; first call ~10s, then instant).`;
 const devContainerStartNew = `**Start a coding session:** \\\`devcontainer_ensure { projectId }\\\` (idempotent; first call ~10s, then instant).
 **Orient yourself once.** On the first code-modifying turn of a chat, call \\\`fs_tree\\\` once to learn the repo layout. Don't re-run it on every turn — the layout doesn't change between user messages.`;
 code = code.replace(devContainerStart, devContainerStartNew);
 // Fix 9: browser_console
 const visualQaBlock = `**Visual QA:** \\\`request_visual_qa { targetPath }\\\` critiques a UI file against a 5-dim design rubric. **Call this whenever you modify visual UI code** before returning the \\\`previewUrl\\\`. If it returns actionable issues, fix them with \\\`fs_edit\\\` before ending the turn. Skip for backend / SQL / config / non-visual changes.`;
 const visualQaBlockNew = `**Visual QA:** \\\`request_visual_qa { targetPath }\\\` critiques a UI file against a 5-dim design rubric. **Call this whenever you modify visual UI code** before returning the \\\`previewUrl\\\`. If it returns actionable issues, fix them with \\\`fs_edit\\\` before ending the turn. Skip for backend / SQL / config / non-visual changes.
 **Verify the page actually renders:**
 - After \\\`dev_server_start\\\` returns a \\\`previewUrl\\\` AND \\\`healthCheck.status === 200\\\`, for any UI-facing turn, call \\\`browser_console { url: previewUrl }\\\` to capture frontend console errors. Hydration errors, missing assets, and uncaught exceptions show up here even when the server is technically "running".
 - If \\\`browser_console\\\` returns errors, fix them with \\\`fs_edit\\\` before declaring done. A green \\\`healthCheck\\\` plus a clean console is the real "done" signal for UI work.
 - Skip this for backend / SQL / config-only changes.`;
 code = code.replace(visualQaBlock, visualQaBlockNew);
 // Fix 10: Market research stack
 const commonQuestionsBlock = `## Common questions → tools
 - "What is project X?" → \\\`projects_get { projectId }\\\`
 - "What's running / has a domain?" → \\\`apps_list { projectId }\\\` (or workspace-wide without projectId)
 - "Show logs / containers / env" → \\\`apps_list\\\` to resolve uuid, then \\\`apps_logs\\\` / \\\`apps_containers_list\\\` / \\\`apps_envs_list\\\`
 - "Find an OSS X" → \\\`github_search\\\` (include \\\`license:mit\\\` by default), then \\\`github_file\\\` to read README / docker-compose
 - "What do the docs say about Y?" → \\\`http_fetch\\\``;
 // Oops, we changed apps_containers_list to apps_containers_ps in Fix 4, so let's match the updated one:
 const commonQuestionsBlockMatched = `## Common questions → tools
 - "What is project X?" → \\\`projects_get { projectId }\\\`
 - "What's running / has a domain?" → \\\`apps_list { projectId }\\\` (or workspace-wide without projectId)
 - "Show logs / containers / env" → \\\`apps_list\\\` to resolve uuid, then \\\`apps_logs\\\` / \\\`apps_containers_ps\\\` / \\\`apps_envs_list\\\`
 - "Find an OSS X" → \\\`github_search\\\` (include \\\`license:mit\\\` by default), then \\\`github_file\\\` to read README / docker-compose
 - "What do the docs say about Y?" → \\\`http_fetch\\\``;
 const marketResearchBlock = `
 ## Helping the user pick what to build
 Vibn has a market-research toolkit for non-technical founders who need data on their target niche. Use it when the user is undecided, validating an idea, or comparing markets:
 - **"How big is the market for X in <location>?"** → \\\`market_categories_suggest { niche }\\\` to propose Google Business categories, then \\\`market_research_run\\\` after the user approves. Returns TAM count, sample domains, and review data. NOTE: \\\`market_research_run\\\` costs real money — always confirm with the user and pass \\\`user_explicitly_approved: true\\\`.
 - **"What are competitors spending on Google Ads?"** → \\\`market_seo_analyze { domain }\\\`. Returns organic traffic, paid traffic, ad spend, and top paid keywords. Use to tell the user how aggressive a market is.
 - **"What software do these businesses already use?"** → \\\`tech_stack_analyze { urls, software_category_id }\\\`. Detects WordPress, Shopify, named competitors, and any custom domains/scripts you pass. Use to find "X businesses use WordPress but lack Y" market gaps.
 - **"What are customers complaining about?"** → \\\`market_aggregate_insights { category, location }\\\`. Returns top review topics — use the actual words customers use as marketing copy and value-prop seeds.
 - **"Who are the players in this niche?"** → \\\`market_competitor_research { niche }\\\`. Returns proprietary competitors with pricing AND open-source alternatives that could be forked.
 These are conversational research tools — they don't build anything. Use them BEFORE scaffolding when the user is exploring direction; SKIP them once the user has committed to building.`;
 code = code.replace(commonQuestionsBlockMatched, commonQuestionsBlockMatched + marketResearchBlock);
 fs.writeFileSync(file, code);
 console.log("Applied Phase 2 and 3 prompt fixes");
--- a/vibn-frontend/app/api/chat/route.ts
+++ b/vibn-frontend/app/api/chat/route.ts
@@ -215,6 +215,8 @@ Each project has a persistent \`vibn-dev\` container. Edit files via \`fs_*\` an
 **Start a coding session:** \`devcontainer_ensure { projectId }\` (idempotent; first call ~10s, then instant).
 **Orient yourself once.** On the first code-modifying turn of a chat, call \`fs_tree\` once to learn the repo layout. Don't re-run it on every turn — the layout doesn't change between user messages.
 **Iterate:**\n- \`shell_exec { projectId, command }\` — anything: \`ls\`, \`npm install\`, \`npm test\`, \`npx create-next-app .\`, \`git status\`. Cwd defaults to \`/workspace\`. Node (LTS), Python 3.12, and Go 1.23 are pre-installed — no setup needed.\n- \`fs_read\` / \`fs_write\` / \`fs_edit { path, oldString, newString, startLine, endLine }\`. IMPORTANT: For fs_edit, ALWAYS prefer using \`oldString\` for small replacements if you are confident. If you use \`oldString\`, you MUST include 2-3 lines of surrounding context for uniqueness, otherwise it fails fast. If you are replacing large blocks, use \`startLine\` and \`endLine\` instead.\n- \`fs_glob\` / \`fs_grep\` (ripgrep, respects .gitignore) / \`fs_list\` / \`fs_delete\`.\n
 **Dev servers (preview URL via \`*.preview.vibnai.com\` wildcard):**
@@ -250,7 +252,7 @@ For NEW repos / branches: \`gitea_repos_list\`, \`gitea_repo_get\`, \`gitea_repo
 ## Troubleshooting
 - **Dev container stuck provisioning (>120s)**: \`devcontainer_status\` returns \`likelyFailed: true\` and a \`coolifyStatus\` field with Coolify's view. If \`blockedReason\` is set, TELL THE USER the specific reason ("SSH not configured", "Coolify deploy failed: image pull error") instead of continuing to poll. Do NOT loop on \`devcontainer_status\` — a stuck container will NOT self-heal. If the status says "failed" or "error", advise the user to check their Coolify dashboard or regenerate the project.
- "exited (1)" / deploy stuck → \`apps_logs { uuid }\` + \`apps_containers_list { uuid }\`. Usual: missing env, wrong port, image pull fail.
+- "exited (1)" / deploy stuck → \`apps_logs { uuid }\` + \`apps_containers_ps { uuid }\`. Usual: missing env, wrong port, image pull fail.
 - 502 / "no available server" → \`apps_get\`; if \`fqdn\` is empty, attach a domain.
 - "tenant" / "does not belong to" → uuid not in this workspace. Re-list with \`apps_list\`.
 - Compose stack weird → \`apps_repair { uuid }\` re-applies Traefik labels + port forwarding.
--- a/vibn-frontend/app/api/mcp/route.ts
+++ b/vibn-frontend/app/api/mcp/route.ts
@@ -4491,28 +4491,44 @@ function shq(s: string): string {
  return `'${s.replace(/'/g, `'\\''`)}'`;
 }
-function normalizeFsPath(p: string): string | NextResponse {
+function normalizeFsPath(
  p: string,
  projectSlug?: string,
 ): string | NextResponse {
  if (!p || typeof p !== "string") {
    return NextResponse.json(
      { error: 'Param "path" is required' },
      { status: 400 },
    );
  }
  const projectRoot = projectSlug ? `${FS_ROOT}/${projectSlug}` : FS_ROOT;
  let abs: string;
  if (p.startsWith("/")) {
    abs = p;
  } else {
-    abs = `${FS_ROOT}/${p}`.replace(/\/+/g, "/");
+    abs = `${projectRoot}/${p}`.replace(/\/+/g, "/");
  }
  // Disallow .. traversal that escapes /workspace.
  const norm = abs.replace(/\/[^/]+\/\.\.(?=\/|$)/g, "").replace(/\/+/g, "/");
-  if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) {
+
-    return NextResponse.json(
+  // When projectSlug is set, REJECT paths outside the project root.
-      {
+  if (projectSlug) {
-        error: `Path "${p}" is outside ${FS_ROOT}; use shell.exec for system paths.`,
+    if (!norm.startsWith(projectRoot) && norm !== projectRoot) {
-      },
+      return NextResponse.json(
-      { status: 400 },
+        {
-    );
+          ok: false,
          error: `PATH_OUTSIDE_PROJECT: path "${p}" resolves to "${norm}" which is outside the active project at "${projectRoot}". Did you mean "${projectRoot}/${p.replace(/^\/+/, "")}"?`,
        },
        { status: 400 },
      );
    }
  } else {
    // Workspace-level fallback (legacy behaviour)
    if (!norm.startsWith(FS_ROOT) && norm !== FS_ROOT) {
      return NextResponse.json(
        { error: `Path "${p}" is outside ${FS_ROOT}; use shell.exec for system paths.` },
        { status: 400 },
      );
    }
  }
  return norm;
 }
@@ -4627,7 +4643,7 @@ async function toolFsRead(principal: Principal, params: Record<string, any>) {
  if (guard) return guard;
  const project = await resolveProjectOr404(principal, params);
  if (project instanceof NextResponse) return project;
-  const path = normalizeFsPath(String(params.path ?? ""));
+  const path = normalizeFsPath(String(params.path ?? ""), project.slug);
  if (path instanceof NextResponse) return path;
  const offset = Number.isFinite(Number(params.offset))
@@ -4873,7 +4889,7 @@ async function toolFsWrite(principal: Principal, params: Record<string, any>) {
  if (guard) return guard;
  const project = await resolveProjectOr404(principal, params);
  if (project instanceof NextResponse) return project;
-  const path = normalizeFsPath(String(params.path ?? ""));
+  const path = normalizeFsPath(String(params.path ?? ""), project.slug);
  if (path instanceof NextResponse) return path;
  const content = typeof params.content === "string" ? params.content : "";
@@ -4890,8 +4906,11 @@ async function toolFsWrite(principal: Principal, params: Record<string, any>) {
      { status: 500 },
    );
  }
  const { createHash } = require('crypto');
  const bytes = Buffer.byteLength(content, "utf8");
  const sha256 = createHash("sha256").update(content, "utf8").digest("hex");
  return NextResponse.json({
-    result: { path, bytesWritten: Buffer.byteLength(content, "utf8") },
+    result: { ok: true, path, bytes, sha256 },
  });
 }
@@ -4900,7 +4919,7 @@ async function toolFsEdit(principal: Principal, params: Record<string, any>) {
  if (guard) return guard;
  const project = await resolveProjectOr404(principal, params);
  if (project instanceof NextResponse) return project;
-  const path = normalizeFsPath(String(params.path ?? ""));
+  const path = normalizeFsPath(String(params.path ?? ""), project.slug);
  if (path instanceof NextResponse) return path;
  const newString =
@@ -4966,7 +4985,7 @@ print(n)`;
  const b64 = Buffer.from(JSON.stringify(payload), "utf8").toString("base64");
  const pyB64 = Buffer.from(py, "utf8").toString("base64");
-  const cmd = `python3 -c "$(printf %s ${shq(pyB64)} | base64 -d)" <<< "$(printf %s ${shq(b64)} | base64 -d)"`;
+  const cmd = `python3 -c "$(printf %s ${shq(pyB64)} | base64 -d)" <<< "$(printf %s ${shq(b64)} | base64 -d)" && echo "---" && sha256sum ${shq(path)} | cut -d' ' -f1 && wc -c < ${shq(path)}`;
  const r = await runFsCmd(principal, project, cmd);
  if (r.code !== 0) {
@@ -4979,8 +4998,18 @@ print(n)`;
      { status },
    );
  }
  const stdoutParts = r.stdout.split('---');
  const replacementsStr = stdoutParts[0].trim();
  const hashAndSize = stdoutParts[1] ? stdoutParts[1].trim().split('\n') : [];
  return NextResponse.json({
-    result: { path, replacements: parseInt(r.stdout.trim() || "0", 10) },
+    result: { 
      ok: true, 
      path, 
      replacements: parseInt(replacementsStr || "0", 10),
      sha256: hashAndSize[0] ? hashAndSize[0].trim() : undefined,
      bytes: hashAndSize[1] ? parseInt(hashAndSize[1].trim(), 10) : undefined
    },
  });
 }
@@ -4989,7 +5018,7 @@ async function toolFsList(principal: Principal, params: Record<string, any>) {
  if (guard) return guard;
  const project = await resolveProjectOr404(principal, params);
  if (project instanceof NextResponse) return project;
-  const path = normalizeFsPath(String(params.path ?? "/workspace"));
+  const path = normalizeFsPath(String(params.path ?? "/workspace"), project.slug);
  if (path instanceof NextResponse) return path;
  const cmd = `cd ${shq(path)} && ls -lA --time-style=long-iso 2>&1 | head -200`;
  const r = await runFsCmd(principal, project, cmd);
@@ -5003,7 +5032,7 @@ async function toolFsDelete(principal: Principal, params: Record<string, any>) {
  if (guard) return guard;
  const project = await resolveProjectOr404(principal, params);
  if (project instanceof NextResponse) return project;
-  const path = normalizeFsPath(String(params.path ?? ""));
+  const path = normalizeFsPath(String(params.path ?? ""), project.slug);
  if (path instanceof NextResponse) return path;
  const recursive = Boolean(params.recursive);
  // Belt-and-suspenders: never let `rm -rf /workspace` itself slip through.
@@ -5036,7 +5065,7 @@ async function toolFsGlob(principal: Principal, params: Record<string, any>) {
      { status: 400 },
    );
  }
-  const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"));
+  const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"), project.slug);
  if (cwd instanceof NextResponse) return cwd;
  // ripgrep --files --glob is faster + smarter than `find` and respects .gitignore.
  const cmd = `cd ${shq(cwd)} && rg --files --glob ${shq(pattern)} | head -500`;
@@ -5062,7 +5091,7 @@ async function toolFsGrep(principal: Principal, params: Record<string, any>) {
      { status: 400 },
    );
  }
-  const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"));
+  const cwd = normalizeFsPath(String(params.cwd ?? "/workspace"), project.slug);
  if (cwd instanceof NextResponse) return cwd;
  const glob =
    typeof params.glob === "string" && params.glob.trim()
@@ -6005,7 +6034,7 @@ async function toolFsTree(principal: Principal, params: Record<string, any>) {
  if (guard) return guard;
  const project = await resolveProjectOr404(principal, params);
  if (project instanceof NextResponse) return project;
-  const path = normalizeFsPath(String(params.path ?? "/workspace"));
+  const path = normalizeFsPath(String(params.path ?? "/workspace"), project.slug);
  if (path instanceof NextResponse) return path;
  // Use find to generate a tree structure, ignoring node_modules and .git