fix(preview): self-healing dev server so the preview always loads cleanly

The dominant production failure was a dead dev-server process behind a
'running' DB flag (idle-stop / OOM / crash / host restart), which the UI
trusted and embedded -> permanent 502 until a manual restart.

- dev-container.ts: add isDevServerListening() fast liveness probe; stop the
  container entrypoint from auto-running 'npx next dev --webpack' (it competed
  with the managed server, forced the wrong bundler/cwd, and doubled memory);
  drop the fake state='running' seed row; bump dev container memory 1g -> 2g.
- ensure route: verify a 'running' row is ACTUALLY listening and resurrect it
  if dead, instead of trusting the flag; never bounce a healthy server.
- preview page: call ensure on every mount and on refresh (verify + heal),
  force an immediate anatomy refetch on (re)start so a dead frame swaps to
  'warming up' without the 5s lag.

Backstopped by the partial unique index + startDevServer idempotency, so heals
can never duplicate or thrash a server.
This commit is contained in:
2026-06-12 17:30:27 -07:00
parent 0f90ef6f5c
commit 514f11e80d
3 changed files with 141 additions and 75 deletions

View File

@@ -51,7 +51,7 @@ export const VIBN_DEV_IMAGE = process.env.VIBN_DEV_IMAGE ?? "vibn-dev:latest";
/** Resource caps per dev container. Tweak in env per-tier later. */
const DEFAULT_CPU_LIMIT = process.env.VIBN_DEV_CPU_LIMIT ?? "1"; // 1 vCPU
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "1g"; // 1 GiB
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "2g"; // 2 GiB — a single Next dev (Turbopack) + npm install OOM-kills at 1 GiB
const DEFAULT_DISK_LIMIT = process.env.VIBN_DEV_DISK_LIMIT ?? "10g"; // soft hint, not enforced by compose
// ── Schema ───────────────────────────────────────────────────────────
@@ -186,7 +186,7 @@ function renderDevCompose(projectSlug: string, projectId: string): string {
image: ${VIBN_DEV_IMAGE}
pull_policy: never
restart: unless-stopped
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ]; then echo 'Found package.json, checking deps...'; if [ ! -d /workspace/node_modules ]; then npm install; fi; echo 'Starting dev server...'; npx next dev -H 0.0.0.0 --webpack; else echo 'No package.json found. Standing by...'; sleep infinity; fi"]
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ] && [ ! -d /workspace/node_modules ]; then echo 'Installing root dependencies...'; npm install; fi; echo 'Container ready — dev server is managed externally via dev_server_start.'; sleep infinity"]
working_dir: /workspace
volumes:
- workspace:/workspace
@@ -336,28 +336,14 @@ export async function ensureDevContainer(
],
);
// In Path 2, the dev container natively runs the Next.js server on port 3000.
// We automatically inject the static preview tracking row so the UI sees it instantly.
const previewUrl = buildPreviewUrl(opts.projectId, opts.projectSlug, 3000);
if (previewUrl) {
await query(
`INSERT INTO fs_dev_servers
(id, project_id, workspace, name, command, port, preview_url, state)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT (id) DO UPDATE
SET state = EXCLUDED.state`,
[
`ds_primary_${opts.projectId.replace(/-/g, "").slice(0, 10)}`,
opts.projectId,
opts.workspace.slug,
"Primary App",
"npx next dev -H 0.0.0.0 --webpack",
3000,
previewUrl,
"running",
],
);
}
// NOTE: We deliberately do NOT seed a `state='running'` dev-server row here.
// The container boots to standby (`sleep infinity`) and the dev server is
// started lazily and exclusively by the managed flow (the preview pane's
// auto-ensure or the AI's `dev_server_start`). Seeding a fake "running" row
// pointed at a server that isn't actually listening produced 502s, and it
// competed with the managed start for port 3000. `startDevServer` +
// `probeDevServerReadiness` now own the row's lifecycle and only mark it
// `running` once the port truly answers.
// Bookkeeping link so apps_list / projects_get see the dev container
// under the right Vibn project.
@@ -805,6 +791,41 @@ export function ensurePreviewListenAllInterfaces(command: string): string {
return universalEnv + cmd;
}
/**
 * Fast one-shot liveness check: is *something* answering HTTP on `port` inside
 * the dev container right now? Any HTTP status (even 404/500) counts as alive;
 * only a refused/timed-out connection (curl reports `000`) or a missing curl
 * (empty output) means dead. Two attempts (localhost, then 0.0.0.0) capped at
 * 2s each → worst case ~4s, well inside the 8s exec timeout.
 *
 * This exists because a `state='running'` row in fs_dev_servers is only a record
 * of intent — the actual process can die out from under it (container idle-stop,
 * OOM-kill, crash, host restart) with nothing to update the row. Trusting the
 * flag blindly makes the preview embed a dead URL → 502. Callers use this to
 * verify-then-resurrect instead.
 *
 * @param projectId Project whose dev container is probed.
 * @param port TCP port the dev server is expected to listen on (1–65535).
 * @returns `true` only when an HTTP response was received on `port`; `false`
 *   when nothing answered, the port is invalid, or the container could not be
 *   exec'd into.
 */
export async function isDevServerListening(
  projectId: string,
  port: number,
): Promise<boolean> {
  // `port` is interpolated into a shell command below. Refuse anything that is
  // not a plain TCP port so NaN/floats/out-of-range values can neither produce
  // a nonsense URL nor smuggle text into the command.
  if (!Number.isInteger(port) || port < 1 || port > 65535) {
    return false;
  }

  // curl prints the `-w` value even when the transfer fails (`000`). Each
  // attempt is therefore captured on its own: chaining attempts with `||`
  // inside a single `$(...)` concatenates the failures into `000000…`, which
  // no longer equals `000` and would be misread as a live server.
  // `|| true` keeps the probe's exit status at 0 so only the printed code
  // decides the outcome.
  const command =
    `probe() { curl -sS -o /dev/null -w '%{http_code}' --max-time 2 --connect-timeout 2 ` +
    `"http://$1:${port}/" 2>/dev/null || true; }; ` +
    `code=$(probe localhost); ` +
    // Fall back to 0.0.0.0 only when localhost produced no real HTTP status.
    `case "$code" in [1-9][0-9][0-9]) ;; *) code=$(probe 0.0.0.0) ;; esac; ` +
    // A three-digit status with a non-zero first digit is the only "alive" signal.
    `case "$code" in [1-9][0-9][0-9]) echo LIVE ;; *) echo DEAD ;; esac`;

  try {
    const r = await execInDevContainer({
      projectId,
      command,
      timeoutMs: 8_000,
    });
    return /LIVE/.test(r.stdout);
  } catch {
    // Container itself is unreachable (down/provisioning). Report not-listening
    // so the caller takes the (re)start path rather than embedding a dead iframe.
    return false;
  }
}
/**
* Poll localhost inside the container until the dev server answers or time out.
* Promotes `starting` → `running` / `failed` in fs_dev_servers. Intended to be