fix(preview): self-healing dev server so the preview always loads cleanly

The dominant production failure was a dead dev-server process behind a
'running' DB flag (idle-stop / OOM / crash / host restart), which the UI
trusted and embedded -> permanent 502 until a manual restart.

- dev-container.ts: add isDevServerListening() fast liveness probe; stop the
  container entrypoint from auto-running 'npx next dev --webpack' (it competed
  with the managed server, forced the wrong bundler/cwd, and doubled memory);
  drop the fake state='running' seed row; bump dev container memory 1g -> 2g.
- ensure route: verify a 'running' row is ACTUALLY listening and resurrect it
  if dead, instead of trusting the flag; never bounce a healthy server.
- preview page: call ensure on every mount and on refresh (verify + heal), and
  force an immediate anatomy refetch on (re)start so a dead frame swaps to
  'warming up' without the 5s lag.

Backstopped by the partial unique index + startDevServer idempotency, so heals
can never duplicate or thrash a server.
This commit is contained in:
2026-06-12 17:30:27 -07:00
parent 0f90ef6f5c
commit 514f11e80d
3 changed files with 141 additions and 75 deletions

View File

@@ -51,7 +51,7 @@ export default function PreviewTab() {
// Poll every 5s so state transitions (starting→running, build complete, etc.)
// surface without a manual refresh.
const { anatomy, loading } = useAnatomy(projectId, { pollMs: 5000 });
const { anatomy, loading, reload } = useAnatomy(projectId, { pollMs: 5000 });
const previews = anatomy?.hosting.previews ?? [];
@@ -112,40 +112,19 @@ export default function PreviewTab() {
: `https://${fallbackFqdn}`
: null;
// ── Auto-ensure: fire a background restart when the pane loads and finds
// no running dev server, but there's a previous config to restart from.
// ── Auto-ensure: the single entry point that guarantees the preview is live.
// We call it on every mount — even when anatomy already says "running" —
// because a `running` row is only intent; the process may have died
// (idle-stop / OOM / crash / host restart) leaving a dead port behind a
// stale flag. The `ensure` endpoint verifies the port is ACTUALLY answering
// and resurrects it if not, but never bounces a healthy server. That makes
// "open the preview → it loads cleanly" reliable, and keeps the container
// warm (the liveness probe touches activity).
const ensureCalledRef = useRef(false);
const [ensureStatus, setEnsureStatus] = useState<
"idle" | "calling" | "starting" | "no_history" | "error"
"idle" | "calling" | "starting" | "running" | "no_history" | "error"
>("idle");
useEffect(() => {
// Only trigger once per mount, and only when anatomy has loaded with no running server.
if (ensureCalledRef.current) return;
if (loading || !anatomy) return;
if (primaryRunning || primaryStarting) return; // already up or already starting
ensureCalledRef.current = true;
fetch(`/api/projects/${projectId}/dev-server/ensure`, {
method: "POST",
credentials: "include",
})
.then((r) => r.json())
.then((data: { status?: string }) => {
if (data.status === "no_history" || data.status === "no_container") {
setEnsureStatus("no_history");
} else if (data.status === "starting" || data.status === "running") {
setEnsureStatus("starting");
// The 5s anatomy poll will pick up the new 'starting' row and
// transition the pane automatically — no extra work needed here.
} else {
setEnsureStatus("idle");
}
})
.catch(() => setEnsureStatus("error"));
}, [loading, anatomy, primaryRunning, primaryStarting, projectId]);
const [iframeSrc, setIframeSrc] = useState<string | null>(null);
const iframeDomRef = useRef<HTMLIFrameElement | null>(null);
const bridge = usePreviewBridge();
@@ -157,21 +136,56 @@ export default function PreviewTab() {
const [isForceStarting, setIsForceStarting] = useState(false);
// When the user clicks the manual refresh button in the toolbar, we don't
// just want to reload the iframe — we also want to trigger the same ghost/zombie
// check as the initial mount, in case the server died while they were looking at it.
const prevRefreshKeyRef = useRef(refreshKey);
// Auto-ensure + refresh-heal, in one effect.
//
// On mount (and whenever the refresh button bumps `refreshKey`) we hit the
// `ensure` endpoint, which is the single entry point that guarantees the
// preview is live. We call it even when anatomy already says "running",
// because a `running` row is only intent — the process may have died
// (idle-stop / OOM / crash / host restart) leaving a dead port behind a stale
// flag. `ensure` verifies the port is ACTUALLY answering and resurrects it if
// not, but never bounces a healthy server (and the unique index +
// `startDevServer` idempotency mean it can't duplicate one). So "open the
// preview" and "click refresh" both reliably land on a clean, loaded app.
//
// The re-arm is a ref write (not setState), so the effect body stays free of
// synchronous state updates; the only setState calls live in async callbacks.
const lastEnsuredRefreshKeyRef = useRef(refreshKey);
useEffect(() => {
if (refreshKey === prevRefreshKeyRef.current) return;
prevRefreshKeyRef.current = refreshKey;
// We only reset the ensure flag if we aren't currently waiting for a forced start.
// If they hit refresh while it's already booting, don't break the state machine.
if (!isForceStarting) {
if (refreshKey !== lastEnsuredRefreshKeyRef.current && !isForceStarting) {
lastEnsuredRefreshKeyRef.current = refreshKey;
ensureCalledRef.current = false;
setEnsureStatus("idle");
}
}, [refreshKey, isForceStarting]);
if (ensureCalledRef.current) return;
if (loading || !anatomy) return;
ensureCalledRef.current = true;
fetch(`/api/projects/${projectId}/dev-server/ensure`, {
method: "POST",
credentials: "include",
})
.then((r) => r.json())
.then((data: { status?: string }) => {
if (data.status === "no_history" || data.status === "no_container") {
setEnsureStatus("no_history");
} else if (data.status === "running") {
// Verified live — keep showing the iframe.
setEnsureStatus("running");
} else if (data.status === "starting") {
// Fresh start or resurrection of a dead server. Flip to warming-up and
// force an immediate anatomy refetch: `ensure` has already marked any
// stale/dead `running` row as stopped, so the refetch drops the
// possibly-502 iframe and shows warming-up without waiting for the 5s
// poll. The readiness probe then carries it to a clean load.
setEnsureStatus("starting");
reload();
} else {
setEnsureStatus("idle");
}
})
.catch(() => setEnsureStatus("error"));
}, [loading, anatomy, projectId, refreshKey, isForceStarting, reload]);
useLayoutEffect(() => {
if (!primaryRunning?.url) {

View File

@@ -20,6 +20,7 @@ import {
ensureDevContainer,
startDevServer,
probeDevServerReadiness,
isDevServerListening,
} from "@/lib/dev-container";
export async function POST(
@@ -55,8 +56,8 @@ export async function POST(
const projectSlug = (project.data?.slug as string) || project.id;
const projectName = (project.data?.name as string) || "Project";
// 1. Is a dev server already running or starting on the primary port?
const running = await queryOne<{
// 1. Is a dev server already active on the primary port?
const active = await queryOne<{
id: string;
state: string;
preview_url: string;
@@ -75,15 +76,39 @@ export async function POST(
[projectId],
);
if (running) {
// A `starting` row is mid cold-boot; the readiness probe will promote it to
// `running` once the port answers. Don't disturb it.
if (active?.state === "starting") {
return NextResponse.json({
status: running.state === "running" ? "running" : "starting",
previewUrl: running.preview_url,
command: running.command,
port: running.port,
status: "starting",
previewUrl: active.preview_url,
command: active.command,
port: active.port,
});
}
// A `running` row is only a record of intent. Verify the process is ACTUALLY
// listening — it may have died from idle-stop / OOM / crash / host restart,
// which is the #1 cause of "preview was up, now it's a 502". Only return
// `running` if the port truly answers; otherwise fall through and resurrect.
if (active?.state === "running") {
const alive = await isDevServerListening(projectId, active.port);
if (alive) {
return NextResponse.json({
status: "running",
previewUrl: active.preview_url,
command: active.command,
port: active.port,
});
}
// Dead behind a stale flag. Mark it stopped so the UI stops embedding the
// 502 URL, then fall through to restart it with the same command below.
await query(
`UPDATE fs_dev_servers SET state = 'stopped', stopped_at = now() WHERE id = $1`,
[active.id],
);
}
// 2. Do we have a previous config to restart from?
// (Limit to port 3000 since that's what the preview pane embeds)
const last = await queryOne<{
@@ -104,7 +129,13 @@ export async function POST(
// If there's no history, we STILL want to auto-start! We just assume it's a standard
// Next.js app on port 3000. Forcing the user to hit "Start Preview" on a new project
// is unnecessary friction.
const commandToRun = last?.command || "npx next dev -H 0.0.0.0 --webpack";
//
// Do NOT inject `--webpack`: that overrides the project's own bundler choice
// (Next 16 defaults to Turbopack) and forced the dev server to disagree with
// the project's `package.json` dev script. The default mirrors the script the
// scaffolds actually ship (`next dev -H 0.0.0.0`); a real `last.command` from
// a prior managed start always takes precedence anyway.
const commandToRun = last?.command || "npx next dev -H 0.0.0.0";
const portToRun = last?.port || 3000;
const previewUrlToUse = last?.preview_url ?? null;

View File

@@ -51,7 +51,7 @@ export const VIBN_DEV_IMAGE = process.env.VIBN_DEV_IMAGE ?? "vibn-dev:latest";
/** Resource caps per dev container. Tweak in env per-tier later. */
const DEFAULT_CPU_LIMIT = process.env.VIBN_DEV_CPU_LIMIT ?? "1"; // 1 vCPU
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "1g"; // 1 GiB
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "2g"; // 2 GiB — a single Next dev (Turbopack) + npm install OOM-kills at 1 GiB
const DEFAULT_DISK_LIMIT = process.env.VIBN_DEV_DISK_LIMIT ?? "10g"; // soft hint, not enforced by compose
// ── Schema ───────────────────────────────────────────────────────────
@@ -186,7 +186,7 @@ function renderDevCompose(projectSlug: string, projectId: string): string {
image: ${VIBN_DEV_IMAGE}
pull_policy: never
restart: unless-stopped
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ]; then echo 'Found package.json, checking deps...'; if [ ! -d /workspace/node_modules ]; then npm install; fi; echo 'Starting dev server...'; npx next dev -H 0.0.0.0 --webpack; else echo 'No package.json found. Standing by...'; sleep infinity; fi"]
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ] && [ ! -d /workspace/node_modules ]; then echo 'Installing root dependencies...'; npm install; fi; echo 'Container ready — dev server is managed externally via dev_server_start.'; sleep infinity"]
working_dir: /workspace
volumes:
- workspace:/workspace
@@ -336,28 +336,14 @@ export async function ensureDevContainer(
],
);
// In Path 2, the dev container natively runs the Next.js server on port 3000.
// We automatically inject the static preview tracking row so the UI sees it instantly.
const previewUrl = buildPreviewUrl(opts.projectId, opts.projectSlug, 3000);
if (previewUrl) {
await query(
`INSERT INTO fs_dev_servers
(id, project_id, workspace, name, command, port, preview_url, state)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT (id) DO UPDATE
SET state = EXCLUDED.state`,
[
`ds_primary_${opts.projectId.replace(/-/g, "").slice(0, 10)}`,
opts.projectId,
opts.workspace.slug,
"Primary App",
"npx next dev -H 0.0.0.0 --webpack",
3000,
previewUrl,
"running",
],
);
}
// NOTE: We deliberately do NOT seed a `state='running'` dev-server row here.
// The container boots to standby (`sleep infinity`) and the dev server is
// started lazily and exclusively by the managed flow (the preview pane's
// auto-ensure or the AI's `dev_server_start`). Seeding a fake "running" row
// pointed at a server that isn't actually listening produced 502s, and it
// competed with the managed start for port 3000. `startDevServer` +
// `probeDevServerReadiness` now own the row's lifecycle and only mark it
// `running` once the port truly answers.
// Bookkeeping link so apps_list / projects_get see the dev container
// under the right Vibn project.
@@ -805,6 +791,41 @@ export function ensurePreviewListenAllInterfaces(command: string): string {
return universalEnv + cmd;
}
/**
 * Fast one-shot liveness check: is *something* answering HTTP on `port` inside
 * the dev container right now? Any real HTTP status (even 404/500) counts as
 * alive; a refused/timed-out connection (curl reports status `000`) or a
 * missing curl binary (empty output) means dead. Two probes (`localhost`, then
 * `0.0.0.0`) at up to 2s each, so worst case is ~4s plus exec overhead.
 *
 * This exists because a `state='running'` row in fs_dev_servers is only a record
 * of intent — the actual process can die out from under it (container idle-stop,
 * OOM-kill, crash, host restart) with nothing to update the row. Trusting the
 * flag blindly makes the preview embed a dead URL → 502. Callers use this to
 * verify-then-resurrect instead.
 *
 * @param projectId Project whose dev container should be probed.
 * @param port      TCP port the dev server is expected to listen on.
 * @returns `true` only if an HTTP response was received on the port; `false`
 *          if nothing answered, the port value is not a valid TCP port, or the
 *          container exec itself failed.
 */
export async function isDevServerListening(
  projectId: string,
  port: number,
): Promise<boolean> {
  // The port is interpolated into a shell command, so coerce and validate it
  // first. `Number(...)` tolerates a numeric string coming back from the DB
  // driver; anything that is not a real TCP port cannot be listening.
  const portNum = Number(port);
  if (!Number.isInteger(portNum) || portNum < 1 || portNum > 65535) {
    return false;
  }

  try {
    const r = await execInDevContainer({
      projectId,
      // Each host is probed on its own and its status captured separately.
      // curl still prints `000` via -w when the connection fails (and exits
      // non-zero), so chaining attempts with `||` inside a single `$(...)`
      // would concatenate the outputs ("000000…") and make a dead port look
      // alive. Only a lone three-digit, non-000 status counts as LIVE.
      command:
        `live=0; ` +
        `for host in localhost 0.0.0.0; do ` +
        `code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 2 --connect-timeout 2 ` +
        `"http://$host:${portNum}/" 2>/dev/null); ` +
        `case "$code" in [1-9][0-9][0-9]) live=1; break ;; esac; ` +
        `done; ` +
        `[ "$live" = 1 ] && echo LIVE || echo DEAD`,
      timeoutMs: 8_000,
    });
    return /LIVE/.test(r.stdout);
  } catch {
    // Container itself is unreachable (down/provisioning). Report not-listening
    // so the caller takes the (re)start path rather than embedding a dead iframe.
    return false;
  }
}
/**
* Poll localhost inside the container until the dev server answers or time out.
* Promotes `starting` → `running` / `failed` in fs_dev_servers. Intended to be