fix(preview): self-healing dev server so the preview always loads cleanly
The dominant production failure was a dead dev-server process behind a 'running' DB flag (caused by idle-stop, OOM, crash, or host restart). The UI trusted the flag and embedded the dead URL, producing a permanent 502 until a manual restart.
- dev-container.ts: add isDevServerListening(), a fast liveness probe; stop the container entrypoint from auto-running 'npx next dev --webpack' (it competed with the managed server, forced the wrong bundler/cwd, and doubled memory); drop the fake state='running' seed row; bump dev container memory from 1g to 2g.
- ensure route: verify that a 'running' row is ACTUALLY listening and resurrect it if dead, instead of trusting the flag; never bounce a healthy server.
- preview page: call ensure on every mount and on refresh (verify + heal); force an immediate anatomy refetch on (re)start so a dead frame swaps to 'warming up' without the 5s polling lag.
These heals are backstopped by the partial unique index and startDevServer idempotency, so they can never duplicate or thrash a server.
This commit is contained in:
@@ -51,7 +51,7 @@ export default function PreviewTab() {
|
||||
|
||||
// Poll every 5s so state transitions (starting→running, build complete, etc.)
|
||||
// surface without a manual refresh.
|
||||
const { anatomy, loading } = useAnatomy(projectId, { pollMs: 5000 });
|
||||
const { anatomy, loading, reload } = useAnatomy(projectId, { pollMs: 5000 });
|
||||
|
||||
const previews = anatomy?.hosting.previews ?? [];
|
||||
|
||||
@@ -112,40 +112,19 @@ export default function PreviewTab() {
|
||||
: `https://${fallbackFqdn}`
|
||||
: null;
|
||||
|
||||
// ── Auto-ensure: fire a background restart when the pane loads and finds
|
||||
// no running dev server, but there's a previous config to restart from.
|
||||
// ── Auto-ensure: the single entry point that guarantees the preview is live.
|
||||
// We call it on every mount — even when anatomy already says "running" —
|
||||
// because a `running` row is only intent; the process may have died
|
||||
// (idle-stop / OOM / crash / host restart) leaving a dead port behind a
|
||||
// stale flag. The `ensure` endpoint verifies the port is ACTUALLY answering
|
||||
// and resurrects it if not, but never bounces a healthy server. That makes
|
||||
// "open the preview → it loads cleanly" reliable, and keeps the container
|
||||
// warm (the liveness probe touches activity).
|
||||
const ensureCalledRef = useRef(false);
|
||||
const [ensureStatus, setEnsureStatus] = useState<
|
||||
"idle" | "calling" | "starting" | "no_history" | "error"
|
||||
"idle" | "calling" | "starting" | "running" | "no_history" | "error"
|
||||
>("idle");
|
||||
|
||||
useEffect(() => {
|
||||
// Only trigger once per mount, and only when anatomy has loaded with no running server.
|
||||
if (ensureCalledRef.current) return;
|
||||
if (loading || !anatomy) return;
|
||||
if (primaryRunning || primaryStarting) return; // already up or already starting
|
||||
|
||||
ensureCalledRef.current = true;
|
||||
|
||||
fetch(`/api/projects/${projectId}/dev-server/ensure`, {
|
||||
method: "POST",
|
||||
credentials: "include",
|
||||
})
|
||||
.then((r) => r.json())
|
||||
.then((data: { status?: string }) => {
|
||||
if (data.status === "no_history" || data.status === "no_container") {
|
||||
setEnsureStatus("no_history");
|
||||
} else if (data.status === "starting" || data.status === "running") {
|
||||
setEnsureStatus("starting");
|
||||
// The 5s anatomy poll will pick up the new 'starting' row and
|
||||
// transition the pane automatically — no extra work needed here.
|
||||
} else {
|
||||
setEnsureStatus("idle");
|
||||
}
|
||||
})
|
||||
.catch(() => setEnsureStatus("error"));
|
||||
}, [loading, anatomy, primaryRunning, primaryStarting, projectId]);
|
||||
|
||||
const [iframeSrc, setIframeSrc] = useState<string | null>(null);
|
||||
const iframeDomRef = useRef<HTMLIFrameElement | null>(null);
|
||||
const bridge = usePreviewBridge();
|
||||
@@ -157,21 +136,56 @@ export default function PreviewTab() {
|
||||
|
||||
const [isForceStarting, setIsForceStarting] = useState(false);
|
||||
|
||||
// When the user clicks the manual refresh button in the toolbar, we don't
|
||||
// just want to reload the iframe — we also want to trigger the same ghost/zombie
|
||||
// check as the initial mount, in case the server died while they were looking at it.
|
||||
const prevRefreshKeyRef = useRef(refreshKey);
|
||||
// Auto-ensure + refresh-heal, in one effect.
|
||||
//
|
||||
// On mount (and whenever the refresh button bumps `refreshKey`) we hit the
|
||||
// `ensure` endpoint, which is the single entry point that guarantees the
|
||||
// preview is live. We call it even when anatomy already says "running",
|
||||
// because a `running` row is only intent — the process may have died
|
||||
// (idle-stop / OOM / crash / host restart) leaving a dead port behind a stale
|
||||
// flag. `ensure` verifies the port is ACTUALLY answering and resurrects it if
|
||||
// not, but never bounces a healthy server (and the unique index +
|
||||
// `startDevServer` idempotency mean it can't duplicate one). So "open the
|
||||
// preview" and "click refresh" both reliably land on a clean, loaded app.
|
||||
//
|
||||
// The re-arm is a ref write (not setState), so the effect body stays free of
|
||||
// synchronous state updates; the only setState calls live in async callbacks.
|
||||
const lastEnsuredRefreshKeyRef = useRef(refreshKey);
|
||||
useEffect(() => {
|
||||
if (refreshKey === prevRefreshKeyRef.current) return;
|
||||
prevRefreshKeyRef.current = refreshKey;
|
||||
|
||||
// We only reset the ensure flag if we aren't currently waiting for a forced start.
|
||||
// If they hit refresh while it's already booting, don't break the state machine.
|
||||
if (!isForceStarting) {
|
||||
if (refreshKey !== lastEnsuredRefreshKeyRef.current && !isForceStarting) {
|
||||
lastEnsuredRefreshKeyRef.current = refreshKey;
|
||||
ensureCalledRef.current = false;
|
||||
}
|
||||
|
||||
if (ensureCalledRef.current) return;
|
||||
if (loading || !anatomy) return;
|
||||
ensureCalledRef.current = true;
|
||||
|
||||
fetch(`/api/projects/${projectId}/dev-server/ensure`, {
|
||||
method: "POST",
|
||||
credentials: "include",
|
||||
})
|
||||
.then((r) => r.json())
|
||||
.then((data: { status?: string }) => {
|
||||
if (data.status === "no_history" || data.status === "no_container") {
|
||||
setEnsureStatus("no_history");
|
||||
} else if (data.status === "running") {
|
||||
// Verified live — keep showing the iframe.
|
||||
setEnsureStatus("running");
|
||||
} else if (data.status === "starting") {
|
||||
// Fresh start or resurrection of a dead server. Flip to warming-up and
|
||||
// force an immediate anatomy refetch: `ensure` has already marked any
|
||||
// stale/dead `running` row as stopped, so the refetch drops the
|
||||
// possibly-502 iframe and shows warming-up without waiting for the 5s
|
||||
// poll. The readiness probe then carries it to a clean load.
|
||||
setEnsureStatus("starting");
|
||||
reload();
|
||||
} else {
|
||||
setEnsureStatus("idle");
|
||||
}
|
||||
}, [refreshKey, isForceStarting]);
|
||||
})
|
||||
.catch(() => setEnsureStatus("error"));
|
||||
}, [loading, anatomy, projectId, refreshKey, isForceStarting, reload]);
|
||||
|
||||
useLayoutEffect(() => {
|
||||
if (!primaryRunning?.url) {
|
||||
|
||||
@@ -20,6 +20,7 @@ import {
|
||||
ensureDevContainer,
|
||||
startDevServer,
|
||||
probeDevServerReadiness,
|
||||
isDevServerListening,
|
||||
} from "@/lib/dev-container";
|
||||
|
||||
export async function POST(
|
||||
@@ -55,8 +56,8 @@ export async function POST(
|
||||
const projectSlug = (project.data?.slug as string) || project.id;
|
||||
const projectName = (project.data?.name as string) || "Project";
|
||||
|
||||
// 1. Is a dev server already running or starting on the primary port?
|
||||
const running = await queryOne<{
|
||||
// 1. Is a dev server already active on the primary port?
|
||||
const active = await queryOne<{
|
||||
id: string;
|
||||
state: string;
|
||||
preview_url: string;
|
||||
@@ -75,15 +76,39 @@ export async function POST(
|
||||
[projectId],
|
||||
);
|
||||
|
||||
if (running) {
|
||||
// A `starting` row is mid cold-boot; the readiness probe will promote it to
|
||||
// `running` once the port answers. Don't disturb it.
|
||||
if (active?.state === "starting") {
|
||||
return NextResponse.json({
|
||||
status: running.state === "running" ? "running" : "starting",
|
||||
previewUrl: running.preview_url,
|
||||
command: running.command,
|
||||
port: running.port,
|
||||
status: "starting",
|
||||
previewUrl: active.preview_url,
|
||||
command: active.command,
|
||||
port: active.port,
|
||||
});
|
||||
}
|
||||
|
||||
// A `running` row is only a record of intent. Verify the process is ACTUALLY
|
||||
// listening — it may have died from idle-stop / OOM / crash / host restart,
|
||||
// which is the #1 cause of "preview was up, now it's a 502". Only return
|
||||
// `running` if the port truly answers; otherwise fall through and resurrect.
|
||||
if (active?.state === "running") {
|
||||
const alive = await isDevServerListening(projectId, active.port);
|
||||
if (alive) {
|
||||
return NextResponse.json({
|
||||
status: "running",
|
||||
previewUrl: active.preview_url,
|
||||
command: active.command,
|
||||
port: active.port,
|
||||
});
|
||||
}
|
||||
// Dead behind a stale flag. Mark it stopped so the UI stops embedding the
|
||||
// 502 URL, then fall through to restart it with the same command below.
|
||||
await query(
|
||||
`UPDATE fs_dev_servers SET state = 'stopped', stopped_at = now() WHERE id = $1`,
|
||||
[active.id],
|
||||
);
|
||||
}
|
||||
|
||||
// 2. Do we have a previous config to restart from?
|
||||
// (Limit to port 3000 since that's what the preview pane embeds)
|
||||
const last = await queryOne<{
|
||||
@@ -104,7 +129,13 @@ export async function POST(
|
||||
// If there's no history, we STILL want to auto-start! We just assume it's a standard
|
||||
// Next.js app on port 3000. Forcing the user to hit "Start Preview" on a new project
|
||||
// is unnecessary friction.
|
||||
const commandToRun = last?.command || "npx next dev -H 0.0.0.0 --webpack";
|
||||
//
|
||||
// Do NOT inject `--webpack`: that overrides the project's own bundler choice
|
||||
// (Next 16 defaults to Turbopack) and forced the dev server to disagree with
|
||||
// the project's `package.json` dev script. The default mirrors the script the
|
||||
// scaffolds actually ship (`next dev -H 0.0.0.0`); a real `last.command` from
|
||||
// a prior managed start always takes precedence anyway.
|
||||
const commandToRun = last?.command || "npx next dev -H 0.0.0.0";
|
||||
const portToRun = last?.port || 3000;
|
||||
const previewUrlToUse = last?.preview_url ?? null;
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ export const VIBN_DEV_IMAGE = process.env.VIBN_DEV_IMAGE ?? "vibn-dev:latest";
|
||||
|
||||
/** Resource caps per dev container. Tweak in env per-tier later. */
|
||||
const DEFAULT_CPU_LIMIT = process.env.VIBN_DEV_CPU_LIMIT ?? "1"; // 1 vCPU
|
||||
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "1g"; // 1 GiB
|
||||
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "2g"; // 2 GiB — a single Next dev (Turbopack) + npm install OOM-kills at 1 GiB
|
||||
const DEFAULT_DISK_LIMIT = process.env.VIBN_DEV_DISK_LIMIT ?? "10g"; // soft hint, not enforced by compose
|
||||
|
||||
// ── Schema ───────────────────────────────────────────────────────────
|
||||
@@ -186,7 +186,7 @@ function renderDevCompose(projectSlug: string, projectId: string): string {
|
||||
image: ${VIBN_DEV_IMAGE}
|
||||
pull_policy: never
|
||||
restart: unless-stopped
|
||||
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ]; then echo 'Found package.json, checking deps...'; if [ ! -d /workspace/node_modules ]; then npm install; fi; echo 'Starting dev server...'; npx next dev -H 0.0.0.0 --webpack; else echo 'No package.json found. Standing by...'; sleep infinity; fi"]
|
||||
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ] && [ ! -d /workspace/node_modules ]; then echo 'Installing root dependencies...'; npm install; fi; echo 'Container ready — dev server is managed externally via dev_server_start.'; sleep infinity"]
|
||||
working_dir: /workspace
|
||||
volumes:
|
||||
- workspace:/workspace
|
||||
@@ -336,28 +336,14 @@ export async function ensureDevContainer(
|
||||
],
|
||||
);
|
||||
|
||||
// In Path 2, the dev container natively runs the Next.js server on port 3000.
|
||||
// We automatically inject the static preview tracking row so the UI sees it instantly.
|
||||
const previewUrl = buildPreviewUrl(opts.projectId, opts.projectSlug, 3000);
|
||||
if (previewUrl) {
|
||||
await query(
|
||||
`INSERT INTO fs_dev_servers
|
||||
(id, project_id, workspace, name, command, port, preview_url, state)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET state = EXCLUDED.state`,
|
||||
[
|
||||
`ds_primary_${opts.projectId.replace(/-/g, "").slice(0, 10)}`,
|
||||
opts.projectId,
|
||||
opts.workspace.slug,
|
||||
"Primary App",
|
||||
"npx next dev -H 0.0.0.0 --webpack",
|
||||
3000,
|
||||
previewUrl,
|
||||
"running",
|
||||
],
|
||||
);
|
||||
}
|
||||
// NOTE: We deliberately do NOT seed a `state='running'` dev-server row here.
|
||||
// The container boots to standby (`sleep infinity`) and the dev server is
|
||||
// started lazily and exclusively by the managed flow (the preview pane's
|
||||
// auto-ensure or the AI's `dev_server_start`). Seeding a fake "running" row
|
||||
// pointed at a server that isn't actually listening produced 502s, and it
|
||||
// competed with the managed start for port 3000. `startDevServer` +
|
||||
// `probeDevServerReadiness` now own the row's lifecycle and only mark it
|
||||
// `running` once the port truly answers.
|
||||
|
||||
// Bookkeeping link so apps_list / projects_get see the dev container
|
||||
// under the right Vibn project.
|
||||
@@ -805,6 +791,41 @@ export function ensurePreviewListenAllInterfaces(command: string): string {
|
||||
return universalEnv + cmd;
|
||||
}
|
||||
|
||||
/**
 * Fast one-shot liveness check: is *something* answering HTTP on `port` inside
 * the dev container right now?
 *
 * Any real HTTP status (even 404/500) counts as alive; only a refused or
 * timed-out connection (curl reports `000`) means dead. `localhost` is tried
 * first and `0.0.0.0` only if that fails. Each attempt is capped at 2s, so the
 * worst case is ~4s of probing plus exec overhead, bounded by the 8s exec
 * timeout.
 *
 * This exists because a `state='running'` row in fs_dev_servers is only a record
 * of intent — the actual process can die out from under it (container idle-stop,
 * OOM-kill, crash, host restart) with nothing to update the row. Trusting the
 * flag blindly makes the preview embed a dead URL → 502. Callers use this to
 * verify-then-resurrect instead.
 *
 * @param projectId - Project whose dev container the probe is executed in.
 * @param port - TCP port the dev server is expected to listen on.
 * @returns `true` only when one of the probes received an HTTP status code;
 *   `false` when nothing answered, when `port` is not a valid TCP port, or when
 *   the exec into the container itself failed.
 */
export async function isDevServerListening(
  projectId: string,
  port: number,
): Promise<boolean> {
  // The port is interpolated into a shell command, so insist on a plain TCP
  // port number. Number() also tolerates a numeric string coming back from the
  // database driver at runtime.
  const portNumber = Number(port);
  if (!Number.isInteger(portNumber) || portNumber < 1 || portNumber > 65535) {
    return false;
  }

  // One curl attempt against `host`, printing only the HTTP status code.
  const probe = (host: string): string =>
    `curl -s -o /dev/null -w '%{http_code}' --max-time 2 --connect-timeout 2 ` +
    `"http://${host}:${portNumber}/" 2>/dev/null`;

  try {
    const r = await execInDevContainer({
      projectId,
      command:
        // Each attempt is captured in its OWN command substitution. curl still
        // prints `000` via -w when the connection fails, so chaining several
        // curls with `||` inside a single $(...) concatenates their outputs
        // (e.g. `000000000`), which no longer equals `000` and would make a
        // dead port look alive.
        `code=$(${probe("localhost")}); ` +
        `case "$code" in [1-5][0-9][0-9]) ;; *) code=$(${probe("0.0.0.0")}) ;; esac; ` +
        // Only a genuine 3-digit HTTP status counts as alive; `000`, empty
        // output, or anything malformed is treated as dead.
        `case "$code" in [1-5][0-9][0-9]) echo LIVE ;; *) echo DEAD ;; esac`,
      timeoutMs: 8_000,
    });
    return /LIVE/.test(r.stdout);
  } catch {
    // Container itself is unreachable (down/provisioning). Report not-listening
    // so the caller takes the (re)start path rather than embedding a dead iframe.
    return false;
  }
}
|
||||
|
||||
/**
|
||||
* Poll localhost inside the container until the dev server answers or time out.
|
||||
* Promotes `starting` → `running` / `failed` in fs_dev_servers. Intended to be
|
||||
|
||||
Reference in New Issue
Block a user