fix(preview): self-healing dev server so the preview always loads cleanly
The dominant production failure was a dead dev-server process behind a 'running' DB flag (idle-stop / OOM / crash / host restart), which the UI trusted and embedded -> permanent 502 until a manual restart.

- dev-container.ts: add isDevServerListening() fast liveness probe; stop the container entrypoint from auto-running 'npx next dev --webpack' (it competed with the managed server, forced the wrong bundler/cwd, and doubled memory); drop the fake state='running' seed row; bump dev container memory 1g -> 2g.
- ensure route: verify a 'running' row is ACTUALLY listening and resurrect it if dead, instead of trusting the flag; never bounce a healthy server.
- preview page: call ensure on every mount and on refresh (verify + heal), force an immediate anatomy refetch on (re)start so a dead frame swaps to 'warming up' without the 5s lag.

Backstopped by the partial unique index + startDevServer idempotency, so heals can never duplicate or thrash a server.
This commit is contained in:
@@ -51,7 +51,7 @@ export const VIBN_DEV_IMAGE = process.env.VIBN_DEV_IMAGE ?? "vibn-dev:latest";
|
||||
|
||||
/** Resource caps per dev container. Tweak in env per-tier later. */
|
||||
const DEFAULT_CPU_LIMIT = process.env.VIBN_DEV_CPU_LIMIT ?? "1"; // 1 vCPU
|
||||
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "1g"; // 1 GiB
|
||||
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "2g"; // 2 GiB — a single Next dev (Turbopack) + npm install OOM-kills at 1 GiB
|
||||
const DEFAULT_DISK_LIMIT = process.env.VIBN_DEV_DISK_LIMIT ?? "10g"; // soft hint, not enforced by compose
|
||||
|
||||
// ── Schema ───────────────────────────────────────────────────────────
|
||||
@@ -186,7 +186,7 @@ function renderDevCompose(projectSlug: string, projectId: string): string {
|
||||
image: ${VIBN_DEV_IMAGE}
|
||||
pull_policy: never
|
||||
restart: unless-stopped
|
||||
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ]; then echo 'Found package.json, checking deps...'; if [ ! -d /workspace/node_modules ]; then npm install; fi; echo 'Starting dev server...'; npx next dev -H 0.0.0.0 --webpack; else echo 'No package.json found. Standing by...'; sleep infinity; fi"]
|
||||
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ] && [ ! -d /workspace/node_modules ]; then echo 'Installing root dependencies...'; npm install; fi; echo 'Container ready — dev server is managed externally via dev_server_start.'; sleep infinity"]
|
||||
working_dir: /workspace
|
||||
volumes:
|
||||
- workspace:/workspace
|
||||
@@ -336,28 +336,14 @@ export async function ensureDevContainer(
|
||||
],
|
||||
);
|
||||
|
||||
// In Path 2, the dev container natively runs the Next.js server on port 3000.
|
||||
// We automatically inject the static preview tracking row so the UI sees it instantly.
|
||||
const previewUrl = buildPreviewUrl(opts.projectId, opts.projectSlug, 3000);
|
||||
if (previewUrl) {
|
||||
await query(
|
||||
`INSERT INTO fs_dev_servers
|
||||
(id, project_id, workspace, name, command, port, preview_url, state)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET state = EXCLUDED.state`,
|
||||
[
|
||||
`ds_primary_${opts.projectId.replace(/-/g, "").slice(0, 10)}`,
|
||||
opts.projectId,
|
||||
opts.workspace.slug,
|
||||
"Primary App",
|
||||
"npx next dev -H 0.0.0.0 --webpack",
|
||||
3000,
|
||||
previewUrl,
|
||||
"running",
|
||||
],
|
||||
);
|
||||
}
|
||||
// NOTE: We deliberately do NOT seed a `state='running'` dev-server row here.
|
||||
// The container boots to standby (`sleep infinity`) and the dev server is
|
||||
// started lazily and exclusively by the managed flow (the preview pane's
|
||||
// auto-ensure or the AI's `dev_server_start`). Seeding a fake "running" row
|
||||
// pointed at a server that isn't actually listening produced 502s, and it
|
||||
// competed with the managed start for port 3000. `startDevServer` +
|
||||
// `probeDevServerReadiness` now own the row's lifecycle and only mark it
|
||||
// `running` once the port truly answers.
|
||||
|
||||
// Bookkeeping link so apps_list / projects_get see the dev container
|
||||
// under the right Vibn project.
|
||||
@@ -805,6 +791,41 @@ export function ensurePreviewListenAllInterfaces(command: string): string {
|
||||
return universalEnv + cmd;
|
||||
}
|
||||
|
||||
/**
 * Fast one-shot liveness check: is *something* answering HTTP on `port` inside
 * the dev container right now? Any HTTP status (even 404/500) counts as alive;
 * only a refused/timed-out connection (curl reports `000`) means dead. Two
 * probes (localhost, then 0.0.0.0) are capped at 2s each, so worst case ~4s.
 *
 * This exists because a `state='running'` row in fs_dev_servers is only a record
 * of intent — the actual process can die out from under it (container idle-stop,
 * OOM-kill, crash, host restart) with nothing to update the row. Trusting the
 * flag blindly makes the preview embed a dead URL → 502. Callers use this to
 * verify-then-resurrect instead.
 *
 * @param projectId - Project whose dev container is probed.
 * @param port - TCP port to probe inside the container (1-65535).
 * @returns `true` when either probe received an HTTP status; `false` when both
 *   probes failed, the port is not a valid integer port, or the exec itself threw.
 */
export async function isDevServerListening(
  projectId: string,
  port: number,
): Promise<boolean> {
  // The port is interpolated into a shell command; refuse anything that is not
  // a plain TCP port number rather than probing a nonsensical URL.
  if (!Number.isInteger(port) || port < 1 || port > 65535) {
    return false;
  }

  // Each host is probed in its OWN command substitution. curl prints the
  // `-w '%{http_code}'` value ("000") even when the connection fails, so
  // chaining probes with `||` inside a single `$(...)` concatenates the outputs
  // ("000000…"), which no longer equals "000" and would make a dead server look
  // alive. `|| true` keeps a failed probe from aborting under `set -e`.
  const command =
    `status=DEAD; ` +
    `for host in localhost 0.0.0.0; do ` +
    `code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 2 --connect-timeout 2 ` +
    `"http://$host:${port}/" 2>/dev/null || true); ` +
    `if [ -n "$code" ] && [ "$code" != "000" ]; then status=LIVE; break; fi; ` +
    `done; echo "$status"`;

  try {
    const r = await execInDevContainer({
      projectId,
      command,
      timeoutMs: 8_000,
    });
    return /LIVE/.test(r.stdout);
  } catch {
    // Container itself is unreachable (down/provisioning). Report not-listening
    // so the caller takes the (re)start path rather than embedding a dead iframe.
    return false;
  }
}
|
||||
|
||||
/**
|
||||
* Poll localhost inside the container until the dev server answers or time out.
|
||||
* Promotes `starting` → `running` / `failed` in fs_dev_servers. Intended to be
|
||||
|
||||
Reference in New Issue
Block a user