From 0f212c750b570f215c6db21918c31b929d3f9b7a Mon Sep 17 00:00:00 2001 From: mawkone Date: Fri, 12 Jun 2026 18:05:16 -0700 Subject: [PATCH] fix(preview): stop refresh-flicker false-restarts + harden dev container & agent - isDevServerListening: key off curl EXIT CODE not response time. The 2s max-time treated a busy/compiling-but-listening dev server as DEAD, so ensure restarted a healthy server on every refresh -> cold compile -> the 502/no-CSS/broken-images/perfect flicker. Now dead only when BOTH localhost and 0.0.0.0 refuse the connection (curl exit 7). - ensure route: liveness probe is fail-safe (try/catch) -> never 500s or needlessly restarts on a probe error; trusts the DB flag instead. - dev container: reconcile dead orphan containers before resume/start so a leftover name no longer triggers 'container name already in use' -> Traefik gateway timeout. - dev container: inject AUTH_SECRET / NEXTAUTH_SECRET / AUTH_TRUST_HOST so scaffolded NextAuth apps stop throwing [auth][error] MissingSecret in preview. - chat prompt: don't bounce a healthy dev server; only claim actions a tool actually performed (no hallucinated DB deletes); NextAuth previews pre-wired. - intent budgets: route 'not appearing/showing/missing' to diagnose; bump status_check 12->16, diagnose 15->22 so investigations don't hit the cap. --- vibn-frontend/app/api/chat/route.ts | 13 ++-- .../[projectId]/dev-server/ensure/route.ts | 16 ++++- vibn-frontend/lib/dev-container.ts | 65 +++++++++++++++++-- 3 files changed, 81 insertions(+), 13 deletions(-) diff --git a/vibn-frontend/app/api/chat/route.ts b/vibn-frontend/app/api/chat/route.ts index a7b0b8a..271491b 100644 --- a/vibn-frontend/app/api/chat/route.ts +++ b/vibn-frontend/app/api/chat/route.ts @@ -58,8 +58,8 @@ const TOOL_BUDGETS: Record = { // 5/8 were cutting these off at the cap before the model could answer // (telemetry showed 100% round_cap on these turns). Raised so a read-only // investigation can actually finish. - status_check: 12, - diagnose: 15, + status_check: 16, + diagnose: 22, small_fix: 18, feature_build: 40, deploy: 25, @@ -94,7 +94,7 @@ function classifyTurnIntent(message: string): TurnIntent { // a 40-round build task. Without these, "I get a gateway timeout" falls through // to feature_build and burns 40 rounds looping on a dead dev server. if ( - /(why|broken|error|blank|not loading|fail|bug|issue|doesn't work|isn't working|fix|time.?out|tim(?:es?|ed|ing) out|gateway|5[0-2][0-9]|connection (refused|reset|failed)|unreachable|can.?t connect|cannot connect|not respond)/.test( + /(why|broken|error|blank|not loading|not (showing|appearing|visible|rendering)|isn'?t (showing|appearing|visible|rendering)|missing|disappeared|gone|won'?t load|fail|bug|issue|doesn't work|isn't working|fix|time.?out|tim(?:es?|ed|ing) out|gateway|5[0-2][0-9]|connection (refused|reset|failed)|unreachable|can.?t connect|cannot connect|not respond)/.test( m, ) ) @@ -427,10 +427,10 @@ If the user tells you the preview is blank, not loading, or shows nothing: **HMR through the proxy (apply when scaffolding):** - **Vite (verified working):** in \`vite.config\` set \`server: { host: '0.0.0.0', port: <3000-3009>, strictPort: true, hmr: { clientPort: 443, protocol: 'wss', host: '' } }\`. The \`hmr.host\` is REQUIRED — without it Vite's HMR client can guess the wrong host and the WS handshake fails through Traefik. Default localhost binding looks fine locally but breaks HMR through the proxy. -- **Next dev:** \`next dev -H 0.0.0.0 --no-turbopack\` (WSS HMR works automatically through the proxy without extra config). **Always use \`--no-turbopack\`** — Turbopack\'s per-route lazy compilation causes cold-start 503s in the remote container (the health probe passes on \`/\` but unvisited routes hang on first hit until Turbopack compiles them). webpack compiles all routes upfront and is significantly more stable in a containerised environment. +- **Next dev:** \`npx next dev -H 0.0.0.0 --no-turbopack\` (WSS HMR works automatically through the proxy without extra config). **Always use \`--no-turbopack\`** — Turbopack\'s per-route lazy compilation causes cold-start 503s in the remote container (the health probe passes on \`/\` but unvisited routes hang on first hit until Turbopack compiles them). webpack compiles all routes upfront and is significantly more stable in a containerised environment. - **Express / plain Node:** bind \`0.0.0.0\` (we set \`HOST=0.0.0.0\` env, but verify your framework respects it). -**Build-me-X recipe:** \`devcontainer_ensure\` → \`apps_templates_scaffold { templateName }\` (if matching "dashboard" or "pitch-deck") OR \`shell_exec npx create-next-app@latest . --yes\` → \`fs_edit\` / \`fs_write\` to customize → **wire Sentry (see below)** → \`dev_server_start { command: 'next dev -H 0.0.0.0 --no-turbopack', port: 3000 }\` and **share the previewUrl in your reply — that's the turn's stopping point**. When the user says "ship it", call \`ship { projectId, commitMsg }\` (commits to Gitea and triggers prod deploy in one shot). If a project is multi-service (frontend + API + worker), pick the user-facing service (usually the frontend) and start ITS dev server first, even if the others aren't done yet — a clickable shell beats a complete-but-invisible stack. +**Build-me-X recipe:** \`devcontainer_ensure\` → \`apps_templates_scaffold { templateName }\` (if matching "dashboard" or "pitch-deck") OR \`shell_exec npx create-next-app@latest . --yes\` → \`fs_edit\` / \`fs_write\` to customize → **wire Sentry (see below)** → \`dev_server_start { command: 'npx next dev -H 0.0.0.0 --no-turbopack', port: 3000 }\` and **share the previewUrl in your reply — that's the turn's stopping point**. When the user says "ship it", call \`ship { projectId, commitMsg }\` (commits to Gitea and triggers prod deploy in one shot). If a project is multi-service (frontend + API + worker), pick the user-facing service (usually the frontend) and start ITS dev server first, even if the others aren't done yet — a clickable shell beats a complete-but-invisible stack. **Sentry is auto-provisioned per Vibn project.** When you scaffold a Next.js or Vite app, wire Sentry from day one so the user gets de-minified error capture + Session Replay on first deploy. The DSN (\`NEXT_PUBLIC_SENTRY_DSN\`) and shared org auth token (\`SENTRY_AUTH_TOKEN\`) are injected into the Coolify app's env automatically by \`apps_create\` — you don't set them. Get the project's Sentry slug from \`projects_get { projectId }\` (field: \`sentry.slug\`); pass it to \`withSentryConfig({ org: "vibnai", project: "", ... })\`. The reference recipe (instrumentation.ts, instrumentation-client.ts, app/global-error.tsx, next.config.ts wrapper, Dockerfile ARG declarations) is in \`vibn-frontend/lib/scaffold/sentry-snippets.ts\` — read it once via \`fs_*\` if you're unsure, then copy the snippets into the user's project verbatim. Skip Sentry for non-app projects (CLIs, library-only repos). @@ -491,6 +491,9 @@ The project's requirements, features list, specifications, and backlog checklist - Long-running ops (deploys, DNS, DB provisioning) take 1–5 min — tell the user up front. Don't tight-loop polling. - After \`ship\` or \`apps_deploy\`, the result is authoritative. Don't call \`gitea_*\` / \`shell_exec\` / \`apps_*\` to "verify" — read the response and report. - Never fake success. Never imply something worked if it didn't. +- **Don't bounce a healthy dev server.** If \`dev_server_list\` or a \`healthCheck.status === 200\` shows it serving, the server is HEALTHY — a user saying "I don't see it," a cached 502, or a transient client timeout is NOT a reason to \`dev_server_stop\`/restart. Restarting a working server churns the container and CAUSES the gateway timeouts you're trying to fix. Re-verify by fetching the actual preview URL first; only restart if the port genuinely isn't answering. +- **Only claim what a tool actually did.** If you lack a tool for what the user asked (e.g. deleting a Vibn platform-provisioned database — \`databases_delete\` only removes Coolify-managed DBs and returns \`[]\` for platform DBs), say so plainly and stop. NEVER report a delete / create / merge / deploy as done without a matching success result from THIS turn. Claiming you deleted something you cannot delete destroys trust faster than any bug. +- **NextAuth previews are pre-wired.** The dev container already injects \`AUTH_SECRET\` / \`NEXTAUTH_SECRET\` / \`AUTH_TRUST_HOST\`, so never add MissingSecret hacks in the preview. When you add a page or scaffold auth, make sure \`src/middleware.ts\` does not redirect the new route to \`/login\`: protect by an explicit public-route allowlist (\`/\`, the new page, \`/api/auth/*\`, static assets), not block-everything. ${activeBlock}${briefBlock}## Current workspace projects ${projectsText} diff --git a/vibn-frontend/app/api/projects/[projectId]/dev-server/ensure/route.ts b/vibn-frontend/app/api/projects/[projectId]/dev-server/ensure/route.ts index d0ae4f2..73902e3 100644 --- a/vibn-frontend/app/api/projects/[projectId]/dev-server/ensure/route.ts +++ b/vibn-frontend/app/api/projects/[projectId]/dev-server/ensure/route.ts @@ -92,7 +92,21 @@ export async function POST( // which is the #1 cause of "preview was up, now it's a 502". Only return // `running` if the port truly answers; otherwise fall through and resurrect. if (active?.state === "running") { - const alive = await isDevServerListening(projectId, active.port); + // Fail SAFE: if the liveness probe itself errors (SSH down, transient + // hiccup, etc.) we must NOT 500 the route or needlessly restart a server + // that may be fine — either would break/flicker the preview. Default to + // trusting the DB flag and only treat the server as dead on a definitive + // "not listening" result. + let alive = true; + try { + alive = await isDevServerListening(projectId, active.port); + } catch (err) { + console.error( + "[dev-server/ensure] liveness probe errored; trusting DB state:", + err instanceof Error ? err.message : err, + ); + alive = true; + } if (alive) { return NextResponse.json({ status: "running", diff --git a/vibn-frontend/lib/dev-container.ts b/vibn-frontend/lib/dev-container.ts index f3ec452..6d5d590 100644 --- a/vibn-frontend/lib/dev-container.ts +++ b/vibn-frontend/lib/dev-container.ts @@ -31,7 +31,8 @@ import { getService, } from "@/lib/coolify"; import { execInCoolifyApp, type ExecInAppResult } from "@/lib/coolify-exec"; -import { isCoolifySshConfigured } from "@/lib/coolify-ssh"; +import { isCoolifySshConfigured, runOnCoolifyHost } from "@/lib/coolify-ssh"; +import { createHash } from "node:crypto"; import { ensureProjectCoolifyProject, getProjectCoolifyUuid, @@ -143,6 +144,36 @@ function projectPreviewToken(projectId: string): string { return Buffer.from(projectId).toString("hex").slice(0, 8); } +// Deterministic, hard-to-guess secret per project, injected into the dev +// container so scaffolded NextAuth apps never throw `[auth][error] MissingSecret` +// in the PREVIEW. This is for dev/preview only — production auth uses the real +// AUTH_SECRET on the deployed Coolify app, never this. Deterministic so it +// survives container restarts without needing a DB column. +function devAuthSecret(projectId: string): string { + const salt = process.env.VIBN_DEV_AUTH_SALT ?? "vibn-dev-auth-v1"; + return createHash("sha256").update(`${salt}:${projectId}`).digest("hex"); +} + +// Before (re)starting a dev container, clear any DEAD orphan container that is +// still holding this service's Coolify-assigned name. Coolify names every +// container of a resource with the resource uuid as a suffix (e.g. +// `vibn-dev-`); a prior suspend/deploy can leave an exited container under +// that name, so the next start fails with "Conflict. The container name … is +// already in use" and Traefik loses its backend (the user sees a gateway +// timeout). We remove ONLY non-running containers (exited/created/dead) — never +// a live one — so a healthy container is never killed. Best-effort + SSH-gated. +async function reconcileDevContainerOrphans( + serviceUuid: string, +): Promise { + if (!serviceUuid || !isCoolifySshConfigured()) return; + const nameFilter = `name=-${serviceUuid}$`; + const cmd = + `docker ps -a --filter '${nameFilter}' ` + + `--filter status=exited --filter status=created --filter status=dead -q ` + + `| xargs -r docker rm -f`; + await runOnCoolifyHost(cmd, { timeoutMs: 15_000 }).catch(() => {}); +} + function renderDevCompose(projectSlug: string, projectId: string): string { // Image distribution: we build vibn-dev on the Coolify host once // (see /vibn-dev/setup-on-coolify.sh) and reference it locally. @@ -196,6 +227,13 @@ function renderDevCompose(projectSlug: string, projectId: string): string { - VIBN_PROJECT_ID=${projectId} - VIBN_PREVIEW_TOKEN=${token} - VIBN_DEV_CONTAINER=1 + # Make scaffolded NextAuth apps work in the preview out of the box. + # AUTH_SECRET (NextAuth v5) / NEXTAUTH_SECRET (v4) prevent the + # "[auth][error] MissingSecret" crash; AUTH_TRUST_HOST lets v5 trust the + # Traefik-proxied preview host. Dev/preview only — prod uses its own secret. + - AUTH_SECRET=${devAuthSecret(projectId)} + - NEXTAUTH_SECRET=${devAuthSecret(projectId)} + - AUTH_TRUST_HOST=true networks: - vibn-dev-net - coolify @@ -386,6 +424,9 @@ export async function resumeDevContainer(projectId: string): Promise { const row = await getDevContainerRow(projectId); if (!row) return; if (row.state === "running") return; + // Clear any dead orphan holding the container name so the start can't fail + // with a "container name already in use" conflict (which strands Traefik). + await reconcileDevContainerOrphans(row.service_uuid); await startService(row.service_uuid); await query( `UPDATE fs_project_dev_containers @@ -808,15 +849,25 @@ export async function isDevServerListening( port: number, ): Promise { try { + // CRITICAL: distinguish "port not listening" (truly dead) from "listening but + // slow to respond" (ALIVE — a Next.js/Vite dev server mid route-compile can + // take many seconds to answer `/`). We must NOT treat slowness as death: + // doing so made `ensure` restart a healthy-but-busy server on every refresh, + // and each restart cold-compiles, flickering the preview through + // 502 -> no-CSS -> broken-images -> perfect. + // + // We therefore key off curl's EXIT CODE, not response time. Exit 7 + // (CURLE_COULDNT_CONNECT) is the only definitive "nothing is bound to this + // port" signal. A slow response yields exit 28 (timeout) / 52 / 56 etc., all + // of which mean the socket accepted us => the server is up. Dead only when + // BOTH localhost and 0.0.0.0 refuse the connection. const r = await execInDevContainer({ projectId, command: - `code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 2 --connect-timeout 2 ` + - `"http://localhost:${port}/" 2>/dev/null || ` + - `curl -sS -o /dev/null -w '%{http_code}' --max-time 2 --connect-timeout 2 ` + - `"http://0.0.0.0:${port}/" 2>/dev/null || printf '000'); ` + - `[ "$code" != "000" ] && [ -n "$code" ] && echo LIVE || echo DEAD`, - timeoutMs: 8_000, + `curl -s -o /dev/null --connect-timeout 2 --max-time 4 "http://localhost:${port}/" 2>/dev/null; a=$?; ` + + `curl -s -o /dev/null --connect-timeout 2 --max-time 4 "http://0.0.0.0:${port}/" 2>/dev/null; b=$?; ` + + `if [ "$a" = "7" ] && [ "$b" = "7" ]; then echo DEAD; else echo LIVE; fi`, + timeoutMs: 12_000, }); return /LIVE/.test(r.stdout); } catch {