Files
vibn-agent-runner/vibn-frontend/lib/dev-container.ts
2026-06-15 11:37:29 -07:00

1343 lines
48 KiB
TypeScript

/**
* Per-project AI dev container ("vibn-dev").
*
* One Coolify Service per Vibn project, running the `vibn-dev` image.
* The AI agent drives it via:
* - shell.exec → docker exec into the container (via existing SSH path)
* - fs.* → file ops (implemented as `cat` / `tee` / `rm` etc.
* inside the container, on top of shell.exec)
* - dev_server.* → start long-running processes (week 2)
* - ship → git push to Gitea + trigger Coolify deploy (week 2)
*
* Lifecycle states:
* - Not provisioned → ensureDevContainer() creates the Coolify service
* - Suspended → Coolify-stopped (saves money). resume() starts it.
* - Running → docker exec works.
*
* Tenant safety: every helper takes a workspace and the caller must have
* already verified that the projectId belongs to that workspace via
* fs_projects. The exec primitive ALSO verifies the resolved container
* UUID is in the workspace's owned Coolify-project set, so a hijacked
* projectId can't reach unrelated containers.
*
* See: AI_PATH_B_EXECUTION_PLAN.md §3.
*/
import { createHash, randomBytes } from "node:crypto";
import { query, queryOne } from "@/lib/db-postgres";
import {
  createDockerComposeApp,
  startService,
  stopService,
  getService,
} from "@/lib/coolify";
import { execInCoolifyApp, type ExecInAppResult } from "@/lib/coolify-exec";
import { isCoolifySshConfigured, runOnCoolifyHost } from "@/lib/coolify-ssh";
import {
  ensureProjectCoolifyProject,
  getProjectCoolifyUuid,
  linkResourceToProject,
} from "@/lib/projects";
import type { VibnWorkspace } from "@/lib/workspaces";
import { assertDevContainerQuota } from "@/lib/quotas";
import { sortDevPreviewsFrontendFirst } from "@/lib/dev-preview-priority";
// ── Configuration ────────────────────────────────────────────────────
/**
 * Image reference for the vibn-dev container. It is interpolated into the
 * compose `image:` field and recorded in the `image` column when a dev
 * container row is written.
 * Built and pushed from /vibn-dev/Dockerfile.
 * Override per-environment with VIBN_DEV_IMAGE for staging/canary tags.
 */
export const VIBN_DEV_IMAGE = process.env.VIBN_DEV_IMAGE ?? "vibn-dev:latest";
/**
 * Resource caps per dev container. Tweak in env per-tier later.
 * The values are strings because they are pasted verbatim into the compose
 * `deploy.resources.limits` section.
 */
// CPU cap, used for `cpus:` (default: 1 vCPU).
const DEFAULT_CPU_LIMIT = process.env.VIBN_DEV_CPU_LIMIT ?? "1"; // 1 vCPU
// Memory cap, used for `memory:`.
const DEFAULT_MEM_LIMIT = process.env.VIBN_DEV_MEM_LIMIT ?? "2g"; // 2 GiB — a single Next dev (Turbopack) + npm install OOM-kills at 1 GiB
// NOTE(review): not referenced by the compose template in the visible part of this file.
const DEFAULT_DISK_LIMIT = process.env.VIBN_DEV_DISK_LIMIT ?? "10g"; // soft hint, not enforced by compose
// ── Schema ───────────────────────────────────────────────────────────
/** Process-local memo: true once the DDL below has completed in this process. */
let devContainersTableReady = false;

/**
 * Make sure the `fs_project_dev_containers` table and its two indexes exist.
 *
 * The DDL uses `IF NOT EXISTS` throughout, so re-running it is harmless; the
 * module-level flag merely skips the round trip after the first success.
 * If the query rejects, the flag stays false and the next call retries.
 */
export async function ensureDevContainersTable(): Promise<void> {
  if (!devContainersTableReady) {
    const ddl = `CREATE TABLE IF NOT EXISTS fs_project_dev_containers (
project_id TEXT PRIMARY KEY,
workspace TEXT NOT NULL,
service_uuid TEXT NOT NULL,
image TEXT NOT NULL,
state TEXT NOT NULL DEFAULT 'provisioning',
last_active_at TIMESTAMPTZ NOT NULL DEFAULT now(),
suspended_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS fs_project_dev_containers_ws_idx
ON fs_project_dev_containers (workspace);
CREATE INDEX IF NOT EXISTS fs_project_dev_containers_active_idx
ON fs_project_dev_containers (last_active_at);`;
    await query(ddl, []);
    devContainersTableReady = true;
  }
}
/** One row of `fs_project_dev_containers`: the dev container owned by a Vibn project. */
export interface DevContainerRow {
  /** Vibn project id; primary key, so at most one dev container per project. */
  project_id: string;
  /** Slug of the owning workspace (written from `workspace.slug` at creation). */
  workspace: string;
  /** UUID of the Coolify service backing this container. */
  service_uuid: string;
  /** Image reference recorded when the row was written. */
  image: string;
  /** Lifecycle state as tracked by this module. */
  state: "provisioning" | "running" | "suspended" | "failed";
  /** Bumped to `now()` after each exec and on resume. */
  last_active_at: Date;
  /** Set when the container is suspended; cleared on resume / first activity. */
  suspended_at: Date | null;
  /** Row creation time; used to compute how long provisioning has been running. */
  created_at: Date;
}
/**
 * Fetch the dev-container row for a project.
 *
 * Ensures the backing table exists first, so this is safe to call before any
 * container has ever been provisioned.
 *
 * @param projectId Vibn project id (primary key of the table).
 * @returns Whatever `queryOne` yields for the lookup (typed as the row or `null`).
 */
export async function getDevContainerRow(
  projectId: string,
): Promise<DevContainerRow | null> {
  await ensureDevContainersTable();
  const lookupSql = `SELECT * FROM fs_project_dev_containers WHERE project_id = $1 LIMIT 1`;
  const found = await queryOne<DevContainerRow>(lookupSql, [projectId]);
  return found;
}
// ── Compose template ─────────────────────────────────────────────────
/**
* Render the docker-compose.yml that backs a single vibn-dev service.
*
* Two named volumes are intentional:
* - workspace : everything in /workspace (the user's source tree).
* Persists across suspends. Backed up to Gitea every
* 5 min via the auto-push autosave loop (week 2).
* - cache : language-toolchain caches (mise, npm, pip, cargo).
* Persists across suspends; per-project (never shared).
*
* The container has NO Vibn-internal network access. We rely on the
* default Coolify-bridge network being isolated from the vibn-postgres
* / vibn-frontend bridge. (Network policy hardening lands in week 1
* day 2 alongside the auto-push job.)
*/
/**
* Pre-allocated preview-port slots. We bake Traefik labels for
* ports 3000..3000+PREVIEW_PORT_COUNT-1 directly into the compose,
* so `dev_server.start` doesn't have to mutate the compose at runtime
* (which would require a Coolify redeploy and ~30s of latency).
*
* The first slot is the project's "primary" preview; additional slots
* cover the few-times-a-session case where the AI runs both a Vite
* frontend and a separate API. Cap is intentionally low (10) so a
* single user can't stand up dozens of public URLs.
*
* Subdomain shape: preview-{slot}-{projectSlug}-{token}.preview.vibnai.com
* - slot is 0..9, used to disambiguate when one project runs >1 server
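* - token is a per-project suffix computed by projectPreviewToken() at
* compose-render time. NOTE: it is derived deterministically from the
* project id (it is not random), so it only hides URLs from parties
* who do not know that id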
*/
/** Container port behind preview slot 0; slot `i` maps to `PREVIEW_BASE_PORT + i`. */
export const PREVIEW_BASE_PORT = 3000;
/** Number of preview slots that get a pre-allocated Traefik router in the compose file. */
export const PREVIEW_PORT_COUNT = 10;
/**
 * Derive the preview-subdomain token for a project.
 *
 * The token is the hex encoding of the first four UTF-8 bytes of `projectId`
 * (eight hex characters; fewer when the id is shorter than four bytes). It is
 * a pure function of the id: nothing random is generated and nothing is
 * persisted, which is why the same subdomains survive container restarts.
 *
 * NOTE(review): because it can be recomputed from the project id, the token is
 * not a secret. Changing the derivation would change every preview hostname
 * rendered from it.
 */
function projectPreviewToken(projectId: string): string {
  const idAsHex = Buffer.from(projectId, "utf8").toString("hex");
  return idAsHex.substring(0, 8);
}
/**
 * Per-project auth secret for the dev/preview container.
 *
 * Computed as the hex SHA-256 of `"<salt>:<projectId>"`, where the salt comes
 * from `VIBN_DEV_AUTH_SALT` (falling back to a built-in constant). Being a
 * pure function of salt + project id, it is stable across container restarts
 * without needing a DB column. It is injected so scaffolded NextAuth apps do
 * not crash with `[auth][error] MissingSecret` in the preview; production
 * deployments are expected to use their own AUTH_SECRET, never this one.
 */
function devAuthSecret(projectId: string): string {
  const configuredSalt = process.env.VIBN_DEV_AUTH_SALT;
  const hasher = createHash("sha256");
  hasher.update(`${configuredSalt ?? "vibn-dev-auth-v1"}:${projectId}`);
  return hasher.digest("hex");
}
/**
 * Remove dead containers that still occupy this service's container name.
 *
 * Coolify suffixes container names with the resource uuid (e.g.
 * `vibn-dev-<uuid>`). A previous suspend/deploy can leave an exited container
 * under that name, which makes the next start fail with "Conflict. The
 * container name … is already in use" and leaves Traefik without a backend.
 *
 * Only containers whose status is exited/created/dead are selected, so a live
 * container is never removed. The whole step is best-effort: it is skipped when
 * there is no uuid or SSH is not configured, and a rejected command is ignored.
 *
 * @param serviceUuid Coolify service uuid used as the container-name suffix.
 */
async function reconcileDevContainerOrphans(
  serviceUuid: string,
): Promise<void> {
  const canRun = Boolean(serviceUuid) && isCoolifySshConfigured();
  if (!canRun) return;

  // `docker ps --filter name=` matches names ending in "-<uuid>"; the ids of
  // the matches are piped to `docker rm -f` (xargs -r: do nothing when empty).
  const pipeline = [
    `docker ps -a --filter 'name=-${serviceUuid}$'`,
    "--filter status=exited --filter status=created --filter status=dead -q",
    "| xargs -r docker rm -f",
  ].join(" ");

  await runOnCoolifyHost(pipeline, { timeoutMs: 15_000 }).catch(() => {});
}
/**
 * Render the docker-compose document for one project's `vibn-dev` service.
 *
 * The returned text declares a single service using VIBN_DEV_IMAGE, two named
 * volumes (`workspace` mounted at /workspace, `cache` at /home/vibn/.cache),
 * project-identifying environment variables plus dev-only auth secrets, CPU
 * and memory limits, and one set of Traefik router/service labels per preview
 * slot.
 *
 * NOTE(review): everything between the backticks of the returned template is
 * emitted verbatim, including its whitespace and the `#` comment lines, so
 * edits there change the generated compose file.
 *
 * @param projectSlug Used in router names, preview hostnames and the network name.
 * @param projectId   Used for the preview token, VIBN_PROJECT_ID and the auth secret.
 * @returns The compose file contents as a string.
 */
function renderDevCompose(projectSlug: string, projectId: string): string {
  // Image distribution: we build vibn-dev on the Coolify host once
  // (see /vibn-dev/setup-on-coolify.sh) and reference it locally.
  // pull_policy: never tells Docker not to attempt a registry pull.
  //
  // Networking: the service joins a per-project bridge network
  // (`vibn-dev-net-${slug}`) AND the external `coolify` network. The
  // `traefik.docker.network=coolify` label tells Traefik which of the two
  // to use when routing to the container.
  //
  // Traefik labels: pre-allocated routers for ports 3000..3009. Each
  // router uses a distinct subdomain.
  // NOTE(review): the original comment states routes only "activate" when a
  // process is actually listening on the port; that cannot be confirmed
  // from this file.
  const token = projectPreviewToken(projectId);
  const traefikLabels: string[] = [
    '"traefik.enable=true"',
    '"traefik.docker.network=coolify"',
  ];
  // One router + one service per preview slot; slot i forwards to port base+i.
  for (let i = 0; i < PREVIEW_PORT_COUNT; i++) {
    const port = PREVIEW_BASE_PORT + i;
    const router = `vibn-dev-${projectSlug}-${i}`;
    // Must stay in sync with the URL shape produced by buildPreviewUrl.
    const host = `preview-${i}-${projectSlug}-${token}.${PREVIEW_DOMAIN_BASE_RAW}`;
    traefikLabels.push(
      `"traefik.http.routers.${router}.rule=Host(\`${host}\`)"`,
    );
    traefikLabels.push(`"traefik.http.routers.${router}.entrypoints=https"`);
    traefikLabels.push(`"traefik.http.routers.${router}.tls=true"`);
    traefikLabels.push(
      `"traefik.http.routers.${router}.tls.certresolver=letsencrypt-dns"`,
    );
    traefikLabels.push(
      `"traefik.http.services.${router}.loadbalancer.server.port=${port}"`,
    );
    traefikLabels.push(`"traefik.http.routers.${router}.service=${router}"`);
  }
  // Each label becomes one YAML list item under `labels:`.
  const labelsBlock = traefikLabels.map((l) => ` - ${l}`).join("\n");
  return `services:
vibn-dev:
image: ${VIBN_DEV_IMAGE}
pull_policy: never
restart: unless-stopped
command: ["bash", "-c", "echo 'Booting Vibn Container...'; if [ -f /workspace/package.json ] && [ ! -d /workspace/node_modules ]; then echo 'Installing root dependencies...'; npm install; fi; echo 'Container ready — dev server is managed externally via dev_server_start.'; sleep infinity"]
working_dir: /workspace
volumes:
- workspace:/workspace
- cache:/home/vibn/.cache
environment:
- VIBN_PROJECT_SLUG=${projectSlug}
- VIBN_PROJECT_ID=${projectId}
- VIBN_PREVIEW_TOKEN=${token}
- VIBN_DEV_CONTAINER=1
# Make scaffolded NextAuth apps work in the preview out of the box.
# AUTH_SECRET (NextAuth v5) / NEXTAUTH_SECRET (v4) prevent the
# "[auth][error] MissingSecret" crash; AUTH_TRUST_HOST lets v5 trust the
# Traefik-proxied preview host. Dev/preview only — prod uses its own secret.
- AUTH_SECRET=${devAuthSecret(projectId)}
- NEXTAUTH_SECRET=${devAuthSecret(projectId)}
- AUTH_TRUST_HOST=true
networks:
- vibn-dev-net
- coolify
labels:
${labelsBlock}
deploy:
resources:
limits:
cpus: '${DEFAULT_CPU_LIMIT}'
memory: ${DEFAULT_MEM_LIMIT}
networks:
vibn-dev-net:
name: vibn-dev-net-${projectSlug}
driver: bridge
coolify:
external: true
volumes:
workspace:
cache:
`;
}
/**
 * Parent domain under which preview hostnames are minted
 * (`preview-<slot>-<slug>-<token>.<this value>`). Overridable through
 * VIBN_PREVIEW_DOMAIN_BASE. Declared after renderDevCompose, which is fine
 * because that function only reads it when called.
 */
const PREVIEW_DOMAIN_BASE_RAW =
  process.env.VIBN_PREVIEW_DOMAIN_BASE ?? "preview.vibnai.com";
// ── Provisioning ─────────────────────────────────────────────────────
/** Input for ensureDevContainer(). */
export interface EnsureDevContainerOpts {
  /** Vibn project id; key of the dev-container row. */
  projectId: string;
  /** Project slug; used for the Coolify service name and in the compose file. */
  projectSlug: string;
  /** Human-readable name used in the service description (falls back to the slug). */
  projectName?: string;
  /** Owning workspace; its slug is used for quota checks and stored on the row. */
  workspace: VibnWorkspace;
  /** Skip the initial start (provision-only). Default: start it. */
  noStart?: boolean;
}
/** Outcome of ensureDevContainer(). */
export interface EnsureDevContainerResult {
  /** UUID of the Coolify service backing the dev container. */
  serviceUuid: string;
  /** Container state as reported by this call. */
  state: DevContainerRow["state"];
  /** True only when this call created the Coolify service. */
  created: boolean;
}
/**
 * Idempotently ensure a vibn-dev service exists for the given Vibn project.
 *
 * - Already provisioned → returns the row's state; a suspended container is
 *   resumed first (after a quota check) unless `noStart` is set.
 * - Not provisioned     → ensures the per-project Coolify Project exists,
 *   creates the docker-compose service, persists the row and links the
 *   resource to the Vibn project.
 *
 * Safe to call on every chat turn — the first call creates the service,
 * subsequent calls only read the existing row.
 *
 * @param opts Project identity, owning workspace and the optional `noStart` flag.
 * @returns The service uuid, the state this call left the container in, and
 *          whether the service was created by this call. For a newly created
 *          container the state equals what was persisted: `"provisioning"`
 *          when it was deployed, `"suspended"` when `noStart` skipped the deploy.
 * @throws Error when Coolify SSH is not configured (unless explicitly allowed
 *         via VIBN_ALLOW_DEV_CONTAINER_WITHOUT_SSH) or when no Coolify project
 *         could be provisioned. Quota errors from assertDevContainerQuota and
 *         errors from the Coolify/DB calls propagate unchanged.
 */
export async function ensureDevContainer(
  opts: EnsureDevContainerOpts,
): Promise<EnsureDevContainerResult> {
  await ensureDevContainersTable();
  const existing = await getDevContainerRow(opts.projectId);
  if (existing) {
    if (existing.state === "suspended" && !opts.noStart) {
      // Resume counts as "starting one more" against the quota, since
      // a suspended container is free but a running one isn't.
      await assertDevContainerQuota(opts.workspace.slug);
      await resumeDevContainer(opts.projectId);
      return {
        serviceUuid: existing.service_uuid,
        state: "running",
        created: false,
      };
    }
    return {
      serviceUuid: existing.service_uuid,
      state: existing.state,
      created: false,
    };
  }

  const allowDevContainerWithoutSsh =
    process.env.VIBN_ALLOW_DEV_CONTAINER_WITHOUT_SSH === "true";
  if (!allowDevContainerWithoutSsh && !isCoolifySshConfigured()) {
    throw new Error(
      "Dev workspace unavailable: Coolify SSH is not configured on this server. " +
        "Set COOLIFY_SSH_HOST and COOLIFY_SSH_PRIVATE_KEY_B64 (see lib/coolify-ssh.ts). " +
        "Verify with GET /api/internal/infra-health using INFRA_HEALTH_SECRET. " +
        "Local-only: set VIBN_ALLOW_DEV_CONTAINER_WITHOUT_SSH=true to skip this check.",
    );
  }

  // Net-new container creation hits the quota (skip if noStart=true,
  // since a never-started container costs nothing). The QuotaExceededError
  // bubbles up to the MCP route which surfaces it as a 402 to the AI;
  // the AI's recovery middleware can offer to suspend an idle one.
  if (!opts.noStart) {
    await assertDevContainerQuota(opts.workspace.slug);
  }

  // Need a Coolify project to land the service in.
  let coolifyProjectUuid = await getProjectCoolifyUuid(
    opts.projectId,
    opts.workspace,
  );
  if (!coolifyProjectUuid) {
    coolifyProjectUuid = await ensureProjectCoolifyProject(
      opts.projectId,
      opts.workspace,
      { projectSlug: opts.projectSlug, projectName: opts.projectName },
    );
  }
  if (!coolifyProjectUuid) {
    throw new Error(
      `Could not provision Coolify project for ${opts.projectId}; dev container creation aborted.`,
    );
  }

  const created = await createDockerComposeApp({
    projectUuid: coolifyProjectUuid,
    name: `vibn-dev-${opts.projectSlug}`,
    description: `AI dev container for project ${opts.projectName ?? opts.projectSlug}`,
    composeRaw: renderDevCompose(opts.projectSlug, opts.projectId),
    instantDeploy: !opts.noStart,
  });

  // Single source of truth for the initial state: the value persisted below is
  // also the value returned, so callers never see "provisioning" for a
  // container that was deliberately not deployed.
  const initialState: DevContainerRow["state"] = opts.noStart
    ? "suspended"
    : "provisioning";

  await query(
    `INSERT INTO fs_project_dev_containers
(project_id, workspace, service_uuid, image, state)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (project_id) DO UPDATE
SET service_uuid = EXCLUDED.service_uuid,
image = EXCLUDED.image,
state = EXCLUDED.state`,
    [
      opts.projectId,
      opts.workspace.slug,
      created.uuid,
      VIBN_DEV_IMAGE,
      initialState,
    ],
  );

  // NOTE: We deliberately do NOT seed a `state='running'` dev-server row here.
  // The container boots to standby (`sleep infinity`) and the dev server is
  // started lazily and exclusively by the managed flow (the preview pane's
  // auto-ensure or the AI's `dev_server_start`). Seeding a fake "running" row
  // pointed at a server that isn't actually listening produced 502s, and it
  // competed with the managed start for port 3000. `startDevServer` +
  // `probeDevServerReadiness` now own the row's lifecycle and only mark it
  // `running` once the port truly answers.

  // Bookkeeping link so apps_list / projects_get see the dev container
  // under the right Vibn project. Best-effort: a failure here must not fail
  // provisioning, but it is logged so a missing link can be diagnosed.
  try {
    await linkResourceToProject(
      opts.projectId,
      opts.workspace.slug,
      created.uuid,
      "service",
    );
  } catch (err) {
    console.warn(
      "[dev-container] linkResourceToProject failed (non-fatal)",
      JSON.stringify({
        projectId: opts.projectId,
        serviceUuid: created.uuid,
        err: err instanceof Error ? err.message : String(err),
      }),
    );
  }

  return { serviceUuid: created.uuid, state: initialState, created: true };
}
// ── Lifecycle ────────────────────────────────────────────────────────
/**
 * Stop a project's dev container and record it as suspended.
 *
 * No-op when the project has no container row or the row is already marked
 * suspended. Otherwise the Coolify service is stopped, the row is updated, and
 * every non-stopped dev-server row of the project is flipped to `stopped` so
 * the UI reflects reality. That last update is best-effort (errors ignored).
 *
 * @param projectId Vibn project id.
 */
export async function suspendDevContainer(projectId: string): Promise<void> {
  const container = await getDevContainerRow(projectId);
  if (!container || container.state === "suspended") return;

  await stopService(container.service_uuid);

  const markSuspendedSql = `UPDATE fs_project_dev_containers
SET state = 'suspended', suspended_at = now()
WHERE project_id = $1`;
  await query(markSuspendedSql, [projectId]);

  // Also mark ALL preview servers as stopped so the UI knows
  const stopServersSql = `UPDATE fs_dev_servers SET state = 'stopped' WHERE project_id = $1 AND state != 'stopped'`;
  await query(stopServersSql, [projectId]).catch(() => {});
}
/**
 * Start a project's dev container and record it as running.
 *
 * No-op when the project has no container row or the row already says
 * `running`. Before starting, dead containers still holding the service's name
 * are cleared so the start cannot hit a "container name already in use"
 * conflict. Afterwards the most recently started dev-server row, if it is
 * `stopped`, is moved back to `starting` (best-effort, errors ignored).
 *
 * @param projectId Vibn project id.
 */
export async function resumeDevContainer(projectId: string): Promise<void> {
  const container = await getDevContainerRow(projectId);
  if (!container || container.state === "running") return;

  // Clear any dead orphan holding the container name (which would strand Traefik).
  await reconcileDevContainerOrphans(container.service_uuid);
  await startService(container.service_uuid);

  const markRunningSql = `UPDATE fs_project_dev_containers
SET state = 'running', suspended_at = NULL, last_active_at = now()
WHERE project_id = $1`;
  await query(markRunningSql, [projectId]);

  // Mark the last run preview server as starting again since we may need to boot it
  const reviveLastServerSql = `UPDATE fs_dev_servers SET state = 'starting' WHERE project_id = $1 AND state = 'stopped' AND id = (SELECT id FROM fs_dev_servers WHERE project_id = $1 ORDER BY started_at DESC LIMIT 1)`;
  await query(reviveLastServerSql, [projectId]).catch(() => {});
}
/**
 * Record activity on a project's dev container.
 *
 * Sets `last_active_at` to now, clears `suspended_at`, and promotes a row that
 * is still `provisioning` or `suspended` to `running`; any other state is left
 * untouched.
 *
 * @param projectId Vibn project id whose row is updated.
 */
async function touchActivity(projectId: string): Promise<void> {
  // Also flips state 'provisioning' (or 'suspended') → 'running' on first
  // successful exec.
  // We can't rely on Coolify's deploy webhook alone (it fires before the
  // container's actually accepting docker exec), so the first exec that
  // returns is our authoritative liveness signal.
  await query(
    `UPDATE fs_project_dev_containers
SET last_active_at = now(),
state = CASE WHEN state IN ('provisioning','suspended') THEN 'running' ELSE state END,
suspended_at = NULL
WHERE project_id = $1`,
    [projectId],
  );
}
// ── Exec primitive ───────────────────────────────────────────────────
/** Input for execInDevContainer(). */
export interface DevContainerExecOpts {
  /** Vibn project whose dev container should run the command. */
  projectId: string;
  /** Shell command text; appended unescaped after the `cd` (and optional `env`) prefix. */
  command: string;
  /** Working directory; blank or missing values fall back to /workspace. */
  cwd?: string; // defaults to /workspace
  /** Forwarded as-is to the underlying exec primitive. */
  timeoutMs?: number;
  /** Forwarded as-is to the underlying exec primitive. */
  maxBytes?: number;
  /** Override the user (default: vibn). Use 'root' only when needed. */
  user?: string;
  /**
   * Extra env vars, rendered as shell-escaped KEY=VALUE words after an `env`
   * prefix placed directly in front of `command`.
   */
  env?: Record<string, string>;
}
/**
 * Run a shell command inside the project's vibn-dev service and return its
 * result (stdout / stderr / exit code as provided by the exec primitive).
 *
 * Steps, in order:
 *  1. Refuse when Coolify SSH is not configured, or when the project has no
 *     dev-container row.
 *  2. Resume the container if the row says it is suspended.
 *  3. Best-effort legacy migration: if the project's files live in a nested
 *     `/workspace/<slug>` directory (and no marker file or `/workspace/.git`
 *     exists), copy them up to `/workspace`, rename the nested directory and
 *     drop a marker file. Every failure in this step is ignored.
 *  4. Run `cd <cwd> && [env K=V …] <command>` as the requested user.
 *  5. Record activity on the row.
 *
 * The caller is responsible for verifying that the projectId belongs to the
 * workspace BEFORE calling this.
 *
 * NOTE(review): because the variables are passed through an `env` prefix, by
 * normal shell semantics they apply to the first simple command of a compound
 * `command` only — confirm callers rely on nothing more.
 *
 * @throws Error when SSH is not configured or no dev container exists; errors
 *         from the exec primitive propagate.
 */
export async function execInDevContainer(
  opts: DevContainerExecOpts,
): Promise<ExecInAppResult> {
  if (!isCoolifySshConfigured()) {
    throw new Error(
      "shell.exec requires SSH access to the Coolify host; configure COOLIFY_SSH_* envs.",
    );
  }

  const container = await getDevContainerRow(opts.projectId);
  if (!container) {
    throw new Error(
      `No dev container for project ${opts.projectId}. Call ensureDevContainer() first.`,
    );
  }
  if (container.state === "suspended") {
    await resumeDevContainer(opts.projectId);
  }

  // Self-healing migration hook: move legacy nested repositories to /workspace.
  try {
    const project = await queryOne<{ slug: string }>(
      `SELECT slug FROM fs_projects WHERE id = $1 LIMIT 1`,
      [opts.projectId],
    );
    const slug = project?.slug;
    if (slug) {
      const nestedDir = `/workspace/${slug}`;
      const migrationCmd = [
        `if [ ! -f "/workspace/.vibn-migration-root-fix" ] && [ -d "${nestedDir}" ] && [ ! -d "/workspace/.git" ]; then `,
        `rsync -a "${nestedDir}/" "/workspace/" 2>/dev/null; `,
        `mv "${nestedDir}" "/workspace/.legacy-nested-${slug}-$(date +%s)" 2>/dev/null; `,
        `echo "Migrated nested repo from ${nestedDir}" > /workspace/.vibn-migration-root-fix; `,
        `fi`,
      ].join("");
      await execInCoolifyApp({
        appUuid: container.service_uuid,
        service: "vibn-dev",
        command: migrationCmd,
        user: "vibn",
        timeoutMs: 10000,
      }).catch(() => null);
    }
  } catch {
    // non-fatal best effort
  }

  const workingDir = opts.cwd?.trim() || "/workspace";
  const envAssignments = Object.entries(opts.env ?? {}).map(
    ([key, value]) => `${shellEscape(key)}=${shellEscape(value)}`,
  );
  const envPrefix = envAssignments.join(" ");
  const changeDir = `cd ${shellEscape(workingDir)} && `;
  const wrapped =
    envPrefix === ""
      ? changeDir + opts.command
      : `${changeDir}env ${envPrefix} ${opts.command}`;

  const result = await execInCoolifyApp({
    appUuid: container.service_uuid,
    service: "vibn-dev",
    command: wrapped,
    user: opts.user ?? "vibn",
    timeoutMs: opts.timeoutMs,
    maxBytes: opts.maxBytes,
  });
  await touchActivity(opts.projectId);
  return result;
}
/**
 * Quote a string as one single-quoted POSIX shell word.
 *
 * Each embedded single quote is replaced by `'\''` (close the quote, emit an
 * escaped quote, reopen the quote) and the whole result is wrapped in single
 * quotes. The empty string becomes `''`.
 */
function shellEscape(s: string): string {
  const escapedQuote = "'\\''";
  return "'" + s.split("'").join(escapedQuote) + "'";
}
// ── Health ───────────────────────────────────────────────────────────
/**
 * Liveness check for a project's dev container, used by chat startup to
 * decide whether to show a "spinning up your environment…" banner.
 *
 * - No row                → `{ exists: false, state: "absent" }`.
 * - Row not provisioning  → the stored state is returned as-is.
 * - Row provisioning      → the container is probed with a cheap `true` exec.
 *   A zero exit code records activity (which promotes the row to running) and
 *   is reported with `selfHealed: true`. Otherwise the stored state is
 *   returned, with `likelyFailed` set once the row is older than 120 seconds;
 *   in that case Coolify's own service status is fetched and, if it mentions
 *   a failure/error, surfaced as `blockedReason` / `blockedHint`.
 *   When SSH is not configured the probe is impossible, so the call reports
 *   `blockedReason: "coolify_ssh_not_configured"` immediately.
 *
 * @param projectId Vibn project id.
 */
export async function getDevContainerStatus(projectId: string): Promise<{
  exists: boolean;
  state: DevContainerRow["state"] | "absent";
  serviceUuid: string | null;
  /** Seconds since the row was created; useful for AI to decide whether to keep polling. */
  ageSeconds?: number;
  /** Set when state was just self-healed by this call. */
  selfHealed?: boolean;
  /** Set when state is stuck in provisioning past the grace window (likely failed). */
  likelyFailed?: boolean;
  /** Immediate blocker — no need to wait for provisioning timeout. */
  blockedReason?: "coolify_ssh_not_configured" | "coolify_deploy_failed";
  blockedHint?: string;
  /** Coolify's own view of the service status (only populated when stuck). */
  coolifyStatus?: string | null;
}> {
  const row = await getDevContainerRow(projectId);
  if (!row) {
    return { exists: false, state: "absent", serviceUuid: null };
  }

  const ageSeconds = Math.floor((Date.now() - row.created_at.getTime()) / 1000);

  // Anything other than 'provisioning' is trusted: execInDevContainer's
  // activity tracking keeps those states honest.
  if (row.state !== "provisioning") {
    return {
      exists: true,
      state: row.state,
      serviceUuid: row.service_uuid,
      ageSeconds,
    };
  }

  // From here on the row says 'provisioning'. Returning that blindly would
  // deadlock a poller (the flip to 'running' only happens on an exec), so we
  // probe with an exec ourselves. Coolify's service status is not sufficient:
  // it reports 'running:unknown' for services without a healthcheck/fqdn.
  if (!isCoolifySshConfigured()) {
    return {
      exists: true,
      state: row.state,
      serviceUuid: row.service_uuid,
      ageSeconds,
      likelyFailed: true,
      blockedReason: "coolify_ssh_not_configured",
      blockedHint:
        "Server missing COOLIFY_SSH_HOST / COOLIFY_SSH_PRIVATE_KEY_B64 — docker exec cannot run. Configure on vibn-frontend; validate with GET /api/internal/infra-health (INFRA_HEALTH_SECRET).",
    };
  }

  let containerAnswers = false;
  try {
    const probe = await execInCoolifyApp({
      appUuid: row.service_uuid,
      service: "vibn-dev",
      command: "true",
      user: "vibn",
      timeoutMs: 5_000,
    });
    if (probe.code === 0) {
      await touchActivity(projectId);
      containerAnswers = true;
    }
  } catch {
    // Exec failed — container probably not yet up. Fall through to the
    // age-based heuristic below.
  }
  if (containerAnswers) {
    return {
      exists: true,
      state: "running",
      serviceUuid: row.service_uuid,
      ageSeconds,
      selfHealed: true,
    };
  }

  // More than 120s in 'provisioning' almost certainly means the container is
  // stuck; report that distinctly so the poller can stop and tell the user.
  const likelyFailed = ageSeconds > 120;
  let coolifyStatus: string | null = null;
  let blockedReason: "coolify_deploy_failed" | undefined;
  let blockedHint: string | undefined;
  if (likelyFailed) {
    // Ask Coolify WHY the deployment is stuck; best-effort.
    try {
      const service = await getService(row.service_uuid);
      coolifyStatus = service?.status ?? null;
    } catch {
      coolifyStatus = null;
    }
    if (coolifyStatus && /fail|error/i.test(coolifyStatus)) {
      blockedReason = "coolify_deploy_failed";
      blockedHint = `Coolify reports service status: "${coolifyStatus}". The dev container image may have failed to build or pull. Check the Coolify dashboard for this service (uuid: ${row.service_uuid}) or regenerate the project. Do NOT keep polling — this will not self-heal.`;
    }
  }

  return {
    exists: true,
    state: row.state,
    serviceUuid: row.service_uuid,
    ageSeconds,
    likelyFailed,
    blockedReason,
    blockedHint,
    coolifyStatus,
  };
}
// Re-export getService so route handlers can pull live Coolify status
// without taking a separate dependency on lib/coolify.
export { getService };
// ── Dev servers ──────────────────────────────────────────────────────
//
// Long-running processes (Vite, Next dev, etc.) launched inside the
// dev container. We don't have a real supervisor; we shell out to
// `nohup`, redirect logs to /var/log/vibn-dev/<id>.log, and remember
// the PID + port in fs_dev_servers so subsequent calls can stop or
// list them.
//
// Preview URLs are exposed via Traefik's "host" router using the
// internal Coolify network (the dev container's primary bridge IP is
// reachable from Traefik). Full Traefik wildcard wiring lands in
// /vibn-dev/PREVIEWS.md and a separate Traefik config commit; this
// module just records the URL we WILL serve at, so the caller can
// hand it back to the chat.
/** Process-local memo: true once the setup below has run in this process. */
let devServersTableReady = false;

/**
 * Make sure `fs_dev_servers` exists, is free of duplicate active rows, and
 * carries the partial unique index on active (project_id, port) pairs.
 *
 * The three statements run in this order on purpose: the dedupe must happen
 * before the unique index can be created. The dedupe and the index creation
 * are best-effort — a failure does not abort the caller — but each failure is
 * logged, because without the index nothing prevents two active rows on the
 * same port.
 */
async function ensureDevServersTable(): Promise<void> {
  if (devServersTableReady) return;

  /** Log a swallowed setup failure so a missing backstop is visible. */
  const logSetupFailure =
    (step: string) =>
    (err: unknown): void => {
      console.error(
        "[dev-servers] table setup step failed (non-fatal)",
        JSON.stringify({
          step,
          err: err instanceof Error ? err.message : String(err),
        }),
      );
    };

  await query(
    `CREATE TABLE IF NOT EXISTS fs_dev_servers (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL REFERENCES fs_project_dev_containers(project_id) ON DELETE CASCADE,
workspace TEXT NOT NULL,
name TEXT NOT NULL,
command TEXT NOT NULL,
port INTEGER NOT NULL,
pid INTEGER,
preview_url TEXT NOT NULL,
state TEXT NOT NULL DEFAULT 'starting',
started_at TIMESTAMPTZ NOT NULL DEFAULT now(),
stopped_at TIMESTAMPTZ
);
CREATE INDEX IF NOT EXISTS fs_dev_servers_project_idx ON fs_dev_servers (project_id, state);`,
    [],
  );

  // Before we can add the partial unique index, collapse any pre-existing
  // duplicate active rows on the same (project_id, port) down to the newest one.
  // Older duplicates are marked 'stopped' so the index can be created cleanly.
  await query(
    `UPDATE fs_dev_servers d
SET state = 'stopped', stopped_at = COALESCE(stopped_at, now())
WHERE state IN ('starting','running')
AND EXISTS (
SELECT 1 FROM fs_dev_servers n
WHERE n.project_id = d.project_id
AND n.port = d.port
AND n.state IN ('starting','running')
AND (n.started_at > d.started_at
OR (n.started_at = d.started_at AND n.id > d.id))
)`,
    [],
  ).catch(logSetupFailure("dedupe_active_rows"));

  // Physically forbid two active (starting/running) rows on the same port for a
  // project. This is the hard backstop against the SELECT-then-INSERT race that
  // produced duplicate "Port 3000" previews.
  await query(
    `CREATE UNIQUE INDEX IF NOT EXISTS fs_dev_servers_active_port_uq
ON fs_dev_servers (project_id, port)
WHERE state IN ('starting','running')`,
    [],
  ).catch(logSetupFailure("create_active_port_unique_index"));

  devServersTableReady = true;
}
/** One row of `fs_dev_servers`: a long-running process tracked inside a dev container. */
export interface DevServerRow {
  /** Primary key of the row. */
  id: string;
  /** Owning project; references the dev-container row (deleted with it). */
  project_id: string;
  /** Workspace the row belongs to. */
  workspace: string;
  /** Display name of the server. */
  name: string;
  /** Command line the server was started with. */
  command: string;
  /** Port the server listens on inside the container. */
  port: number;
  /** Process id, when known. */
  pid: number | null;
  /** Public preview URL recorded for this server. */
  preview_url: string;
  /** Tracked state; at most one starting/running row per (project, port). */
  state: "starting" | "running" | "stopped" | "failed";
  /** When the row was created / the server was started. */
  started_at: Date;
  /** When the row was marked stopped, if recorded. */
  stopped_at: Date | null;
}
/**
 * Generate a random lowercase-hex token.
 *
 * Uses the platform CSPRNG (`crypto.randomBytes`) rather than `Math.random`,
 * whose output is predictable and therefore unsuitable for anything that
 * should be hard to guess.
 *
 * @param bytes Number of random bytes; the result has `2 * bytes` hex characters.
 * @returns Hex string; empty when `bytes` is 0.
 */
function randomToken(bytes = 4): string {
  return randomBytes(bytes).toString("hex");
}
/**
 * Compute the public preview URL for a port of a project's dev container.
 *
 * The hostname must stay identical to the `Host()` rules that
 * renderDevCompose bakes into the compose labels. The slot index is
 * `port - PREVIEW_BASE_PORT`.
 *
 * @returns The https URL, or `null` when the port is below the first slot or
 *          at/after the last one.
 */
function buildPreviewUrl(
  projectId: string,
  projectSlug: string,
  port: number,
): string | null {
  const slot = port - PREVIEW_BASE_PORT;
  const outsideSlots = slot < 0 || slot >= PREVIEW_PORT_COUNT;
  if (outsideSlots) {
    return null;
  }
  const hostname = `preview-${slot}-${projectSlug}-${projectPreviewToken(projectId)}.${PREVIEW_DOMAIN_BASE_RAW}`;
  return `https://${hostname}`;
}
/** Input for startDevServer(). */
export interface StartDevServerOpts {
  /** Vibn project whose dev container hosts the server. */
  projectId: string;
  /** Project slug. */
  projectSlug: string;
  /** Command line that launches the server. */
  command: string;
  /** Port to listen on; must fall inside the preview slot range or the call is rejected. */
  port: number;
  /** Optional display name. */
  name?: string;
  /** Owning workspace. */
  workspace: VibnWorkspace;
}
/**
 * Raised when the requested port already has a listener.
 * Carries the port plus what is known about the process holding it.
 */
export class PortBusyError extends Error {
  /** Port that was requested. */
  public readonly port: number;
  /** Pid of the process holding the port, if it could be determined. */
  public readonly listenerPid: number | null;
  /** Command line of the process holding the port. */
  public readonly listenerCmd: string;

  constructor(port: number, listenerPid: number | null, listenerCmd: string) {
    super(
      [
        `Port ${port} is already in use by pid ${listenerPid ?? "?"} (${listenerCmd}). `,
        `Stop it first, or pick another port from ${PREVIEW_BASE_PORT}-${PREVIEW_BASE_PORT + PREVIEW_PORT_COUNT - 1}.`,
      ].join(""),
    );
    this.port = port;
    this.listenerPid = listenerPid;
    this.listenerCmd = listenerCmd;
    this.name = "PortBusyError";
  }
}
/**
 * Raised when a dev server is requested on a port that has no pre-allocated
 * preview slot.
 */
export class PortOutOfRangeError extends Error {
  /** Port that was requested. */
  public readonly port: number;

  constructor(port: number) {
    super(
      [
        `Port ${port} is outside the preview slot range ${PREVIEW_BASE_PORT}-${PREVIEW_BASE_PORT + PREVIEW_PORT_COUNT - 1}. `,
        `Pick a port in that range so the preview URL is reachable through Traefik.`,
      ].join(""),
    );
    this.port = port;
    this.name = "PortOutOfRangeError";
  }
}
/**
 * Rewrite a dev-server command so it listens on all interfaces.
 *
 * Traefik reaches the dev container over the Docker `coolify` network, so a
 * server bound to loopback only results in a 502. Two measures are applied:
 *  - a set of HOST-style environment variables is exported in front of the
 *    command (honoured by Vite, webpack-dev-server and others);
 *  - `next dev` additionally gets `-H 0.0.0.0`, because Next.js ignores those
 *    variables — unless the command already carries its own `-H` /
 *    `--hostname` flag, in which case it is left alone.
 *
 * @param command Raw command line; surrounding whitespace is trimmed.
 * @returns The rewritten command, or the empty string when `command` is blank.
 */
export function ensurePreviewListenAllInterfaces(command: string): string {
  let cmd = command.trim();
  if (!cmd) return cmd;
  const universalEnv =
    "export HOST=0.0.0.0 HOSTNAME=0.0.0.0 VITE_DEV_SERVER_HOST=0.0.0.0 WEBPACK_DEV_SERVER_HOST=0.0.0.0; ";
  // A flag starts after whitespace (or at the very beginning). `\b` must not
  // be used in front of `-`: there is no word boundary between a space and a
  // dash, so such a pattern can never match a normally written flag.
  const hasHostFlag = /(?:^|\s)(?:-H|--hostname(?=[\s=]|$))/.test(cmd);
  if (/\bnext\s+dev\b/.test(cmd) && !hasHostFlag) {
    cmd = cmd.replace(/\bnext\s+dev\b/, "next dev -H 0.0.0.0");
  }
  return universalEnv + cmd;
}
/**
 * One-shot liveness check: is anything bound to `port` inside the project's
 * dev container right now?
 *
 * A `state='running'` row in fs_dev_servers is only a record of intent — the
 * process can die (idle-stop, OOM-kill, crash, host restart) without the row
 * being updated — so callers verify with this before trusting the row.
 *
 * The verdict is based on curl's EXIT CODE, not on the response or its speed.
 * Exit code 7 (could not connect) is the only outcome treated as "nothing is
 * listening", and only when BOTH `localhost` and `0.0.0.0` produce it. Every
 * other outcome — including a timeout while a dev server is busy compiling —
 * counts as alive, so a healthy-but-slow server is never restarted needlessly.
 *
 * @returns `true` when the in-container check prints LIVE; `false` when it
 *          prints DEAD or when the container itself cannot be reached.
 */
export async function isDevServerListening(
  projectId: string,
  port: number,
): Promise<boolean> {
  // Same curl invocation for both hosts; only the exit status is of interest.
  const curlAgainst = (host: string): string =>
    `curl -s -o /dev/null --connect-timeout 2 --max-time 4 "http://${host}:${port}/" 2>/dev/null`;
  const command =
    `${curlAgainst("localhost")}; a=$?; ` +
    `${curlAgainst("0.0.0.0")}; b=$?; ` +
    `if [ "$a" = "7" ] && [ "$b" = "7" ]; then echo DEAD; else echo LIVE; fi`;

  try {
    const outcome = await execInDevContainer({
      projectId,
      command,
      timeoutMs: 12_000,
    });
    return /LIVE/.test(outcome.stdout);
  } catch {
    // Container itself is unreachable (down/provisioning). Report not-listening
    // so the caller takes the (re)start path rather than embedding a dead iframe.
    return false;
  }
}
/**
 * Poll inside the container until the dev server answers HTTP on `port`, then
 * promote its fs_dev_servers row from `starting` to `running`; mark it
 * `failed` when it never answers. Rows already marked `stopped` are left
 * untouched. Intended to be fired asynchronously after dev_server.start
 * returns so MCP latency stays low.
 *
 * Readiness rule: any HTTP status (including 404/500) counts as "answering";
 * curl reports `000` when it obtained no HTTP response (connection refused or
 * no reply within the 2s limit), and that means "not ready yet".
 *
 * @param projectId Project whose dev container is probed.
 * @param serverId  fs_dev_servers row id; also names the log file that is tailed on failure.
 * @param port      Port the dev server is expected to listen on.
 * @throws Error when the probe gives up or the exec itself fails (after the
 *         row has been marked `failed`).
 */
export async function probeDevServerReadiness(
  projectId: string,
  serverId: string,
  port: number,
): Promise<void> {
  await ensureDevServersTable();
  // Up to 300 rounds with a 1s pause: Next/Vite cold compile + potential npm
  // installs can take a while.
  //
  // Each URL is fetched with exactly ONE curl per command substitution. curl
  // prints the `-w` output even when the transfer fails, so chaining fallbacks
  // inside a single `$(… || … || printf '000')` concatenates the outputs
  // ("000000000"), which no longer equals "000" and would be mistaken for a
  // real HTTP status. Trying localhost and 0.0.0.0 in separate substitutions
  // keeps `code` a single three-digit value.
  const probeCmd =
    `last_code=000; ` +
    `for i in $(seq 1 300); do ` +
    `for path in / ''; do ` +
    `for host in localhost 0.0.0.0; do ` +
    `code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 2 --connect-timeout 2 ` +
    `"http://$host:${port}$path" 2>/dev/null) || true; ` +
    `[ -n "$code" ] || code=000; ` +
    `last_code=$code; ` +
    `[ "$code" != "000" ] && exit 0; ` +
    `done; ` +
    `done; ` +
    `sleep 1; done; ` +
    `echo "PROBE_FAIL last_code=$last_code port=${port}"; ` +
    `echo "PROBE_FAIL ps=$(ps aux | grep -E 'node|npm|next|vite' | grep -v grep | head -3 | tr '\\n' '|')"; ` +
    `echo "PROBE_FAIL log_tail=$(tail -20 /var/log/vibn-dev/${serverId}.log 2>/dev/null | tr '\\n' '|' | head -c 2000)"; ` +
    `exit 1`;
  try {
    const r = await execInDevContainer({
      projectId,
      command: probeCmd,
      timeoutMs: 310_000,
    });
    if (r.code === 0) {
      await query(
        `UPDATE fs_dev_servers SET state = 'running' WHERE id = $1 AND project_id = $2 AND state != 'stopped'`,
        [serverId, projectId],
      );
    } else {
      console.error(
        "[probe] FAILED",
        JSON.stringify({
          projectId,
          serverId,
          port,
          exitCode: r.code,
          stdout: (r.stdout || "").slice(0, 600),
        }),
      );
      // The catch block below marks the row 'failed' and rethrows.
      throw new Error(`Probe failed with exit code ${r.code}: ${r.stdout}`);
    }
  } catch (err) {
    console.error(
      "[probe] ERROR",
      JSON.stringify({
        projectId,
        serverId,
        port,
        err: err instanceof Error ? err.message : String(err),
      }),
    );
    await query(
      `UPDATE fs_dev_servers SET state = 'failed' WHERE id = $1 AND project_id = $2 AND state != 'stopped'`,
      [serverId, projectId],
    );
    throw err;
  }
}
/**
 * Start a preview dev server for a project, replacing whatever was tracked
 * before.
 *
 * Flow: validate the port, return the existing row if the same command is
 * already starting/running on the same port, otherwise stop every tracked
 * row for the project, reap any process still bound to the affected ports,
 * launch the command under `nohup`, and record a `starting` row.
 *
 * @param opts Start options; reads `projectId`, `projectSlug`, `workspace.slug`,
 *             `port`, `command` and the optional `name`.
 * @returns The row describing the server (either the adopted existing row or
 *          the freshly inserted `starting` row).
 * @throws PortOutOfRangeError when `opts.port` is not an integer inside the
 *         preview port range.
 */
export async function startDevServer(
  opts: StartDevServerOpts,
): Promise<DevServerRow> {
  await ensureDevServersTable();

  // 1. Validate slot range — outside this range we couldn't expose
  //    the preview through Traefik anyway (no router pre-allocated).
  //    The integer check matters because the port is interpolated into shell
  //    commands below: NaN (or a non-numeric value that slipped past the
  //    types at runtime) makes both range comparisons false and would
  //    otherwise be accepted.
  if (
    !Number.isInteger(opts.port) ||
    opts.port < PREVIEW_BASE_PORT ||
    opts.port >= PREVIEW_BASE_PORT + PREVIEW_PORT_COUNT
  ) {
    throw new PortOutOfRangeError(opts.port);
  }

  // 2. Stop ALL tracked rows for this project on ALL preview ports, so the
  //    AI never sprawls across multiple ports and the dashboard stays clean.
  const existingRows = await query<DevServerRow>(
    `SELECT * FROM fs_dev_servers
      WHERE project_id = $1 AND state IN ('starting','running','failed')`,
    [opts.projectId],
  );

  // IDEMPOTENCY: If the exact same command is already starting or running on the same port,
  // do not kill it! Just return the existing record. This prevents the AI from accidentally
  // bouncing the server and dropping the cache after every file edit, which leads to 502s.
  const alreadyRunning = existingRows.find(
    (r) =>
      r.port === opts.port &&
      r.command === opts.command &&
      (r.state === "starting" || r.state === "running"),
  );
  if (alreadyRunning) {
    return alreadyRunning;
  }

  // Every port that must be free before launch: the requested one plus the
  // ports of the rows we are about to stop (deduplicated).
  const portsToKill = Array.from(
    new Set([...existingRows.map((r) => r.port), opts.port]),
  );

  // In-container reaper: map listening ports → socket inodes via /proc/net/tcp
  // (and /proc/net/tcp6 when present, so IPv6 listeners are found too), then
  // SIGKILL every process holding one of those sockets. Falls back to `fuser`
  // if /proc/net/tcp cannot be read. The script is wrapped in single quotes
  // for the shell, so it must not contain any single quote itself.
  const killPortNodeCmd =
    `node -e '` +
    `const fs = require("fs"); ` +
    `const portsToKill = [${portsToKill.join(",")}]; ` +
    `try { ` +
    `let tcp = fs.readFileSync("/proc/net/tcp", "utf8"); ` +
    `try { tcp += "\\n" + fs.readFileSync("/proc/net/tcp6", "utf8"); } catch (e) {} ` +
    `const inodes = []; ` +
    `tcp.split("\\n").forEach(line => { ` +
    `const parts = line.trim().split(/\\s+/); ` +
    `if (parts.length > 9) { ` +
    `const local = parts[1]; ` +
    `for (const p of portsToKill) { ` +
    `const hexPort = p.toString(16).toUpperCase().padStart(4, "0"); ` +
    `if (local.endsWith(":" + hexPort)) { inodes.push(parts[9]); } ` +
    `} ` +
    `} ` +
    `}); ` +
    `if (inodes.length > 0) { ` +
    `fs.readdirSync("/proc").forEach(file => { ` +
    `if (/^\\d+$/.test(file)) { ` +
    `try { ` +
    `const fds = fs.readdirSync("/proc/" + file + "/fd"); ` +
    `for (const fd of fds) { ` +
    `const link = fs.readlinkSync("/proc/" + file + "/fd/" + fd); ` +
    `for (const inode of inodes) { ` +
    `if (link.includes("socket:[" + inode + "]")) { ` +
    `process.kill(parseInt(file, 10), 9); ` +
    `break; ` +
    `} ` +
    `} ` +
    `} ` +
    `} catch (e) {} ` +
    `} ` +
    `}); ` +
    `} ` +
    `} catch (e) { ` +
    // fuser takes one `port/tcp` argument per port; a comma-separated list
    // would be parsed as `local_port,remote_host,remote_port`.
    `try { require("child_process").execSync("fuser -k -9 " + portsToKill.map(p => p + "/tcp").join(" ") + " 2>/dev/null || true"); } catch (err) {} ` +
    `}'`;

  for (const row of existingRows) {
    if (row.pid) {
      // Best effort: the recorded pid may already be gone.
      await execInDevContainer({
        projectId: opts.projectId,
        command: `kill -9 ${row.pid} 2>/dev/null || true`,
        timeoutMs: 3_000,
      }).catch(() => {});
    }
    await query(
      `UPDATE fs_dev_servers SET state='stopped', stopped_at=now() WHERE id = $1`,
      [row.id],
    );
  }

  // 3. Force-kill ANY process currently listening on the port inside the container
  //    (including untracked orphans or processes from other runs).
  await execInDevContainer({
    projectId: opts.projectId,
    command: killPortNodeCmd,
    timeoutMs: 5_000,
  }).catch(() => {});

  // 4. Launch.
  const id = `ds_${randomToken(6)}`;
  const name = opts.name ?? `port-${opts.port}`;
  const previewUrl =
    buildPreviewUrl(opts.projectId, opts.projectSlug, opts.port) ??
    `https://localhost-only:${opts.port}`;
  const logFile = `/var/log/vibn-dev/${id}.log`;
  const listenSafeCommand = ensurePreviewListenAllInterfaces(opts.command);
  const secret = devAuthSecret(opts.projectId);
  // Backgrounded under nohup; `echo $!` reports the pid of the launched job
  // so it can be stored on the row.
  const launch =
    `mkdir -p /var/log/vibn-dev && ` +
    `cd /workspace && ` +
    `nohup env PORT=${opts.port} VIBN_DEV_SERVER_ID=${id} ` +
    `AUTH_SECRET=${secret} NEXTAUTH_SECRET=${secret} AUTH_TRUST_HOST=true ` +
    `bash -lc ${shellEscape(listenSafeCommand)} > ${logFile} 2>&1 & ` +
    `echo $!`;
  const result = await execInDevContainer({
    projectId: opts.projectId,
    command: launch,
    timeoutMs: 5_000,
  });
  const pid = parseInt(result.stdout.trim(), 10);

  try {
    await query(
      `INSERT INTO fs_dev_servers
         (id, project_id, workspace, name, command, port, pid, preview_url, state)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
      [
        id,
        opts.projectId,
        opts.workspace.slug,
        name,
        opts.command,
        opts.port,
        Number.isFinite(pid) ? pid : null,
        previewUrl,
        "starting",
      ],
    );
  } catch (err) {
    // The partial unique index (project_id, port WHERE state IN active) rejected
    // this insert because a concurrent start already claimed the slot. That's the
    // race we deliberately want the DB to arbitrate: just adopt the winning row
    // instead of creating a duplicate "Port 3000".
    const isUniqueViolation =
      err instanceof Error &&
      /duplicate key value|fs_dev_servers_active_port_uq|unique constraint/i.test(
        err.message,
      );
    if (!isUniqueViolation) throw err;
    const winner = await queryOne<DevServerRow>(
      `SELECT * FROM fs_dev_servers
        WHERE project_id = $1 AND port = $2 AND state IN ('starting','running')
        ORDER BY started_at DESC LIMIT 1`,
      [opts.projectId, opts.port],
    );
    if (winner) return winner;
    // Extremely unlikely (winner vanished between insert + select): rethrow.
    throw err;
  }

  return {
    id,
    project_id: opts.projectId,
    workspace: opts.workspace.slug,
    name,
    command: opts.command,
    port: opts.port,
    pid: Number.isFinite(pid) ? pid : null,
    preview_url: previewUrl,
    state: "starting",
    started_at: new Date(),
    stopped_at: null,
  };
}
/**
 * Return every dev-server row of a project that is not in the `stopped`
 * state, ordered by `sortDevPreviewsFrontendFirst`.
 *
 * @param projectId Project whose dev servers are listed.
 */
export async function listDevServers(
  projectId: string,
): Promise<DevServerRow[]> {
  await ensureDevServersTable();
  const sql = `SELECT * FROM fs_dev_servers WHERE project_id = $1 AND state != 'stopped'`;
  const activeServers = await query<DevServerRow>(sql, [projectId]);
  return sortDevPreviewsFrontendFirst(activeServers);
}
/**
 * Stop a tracked dev server: signal its recorded pid inside the container
 * (best effort) and mark the row `stopped`.
 *
 * Calling this on a row that is already `stopped` is a no-op, so a pid that
 * has since been reused by another process is never signalled and the
 * original `stopped_at` is preserved.
 *
 * @param projectId Project that must own the row.
 * @param id        fs_dev_servers row id.
 * @throws Error when no row with this id exists for the project.
 */
export async function stopDevServer(
  projectId: string,
  id: string,
): Promise<void> {
  await ensureDevServersTable();
  const row = await queryOne<DevServerRow>(
    `SELECT * FROM fs_dev_servers WHERE id = $1 AND project_id = $2 LIMIT 1`,
    [id, projectId],
  );
  if (!row) throw new Error(`Dev server ${id} not found`);

  // Already stopped: the stored pid may belong to an unrelated process by now.
  if (row.state === "stopped") return;

  if (row.pid) {
    try {
      await execInDevContainer({
        projectId,
        command: `kill ${row.pid} 2>/dev/null || true`,
        timeoutMs: 3_000,
      });
    } catch {
      // Best effort: an unreachable container must not prevent the row from
      // being marked stopped.
    }
  }

  // Scoped by project_id as well, matching the lookup above.
  await query(
    `UPDATE fs_dev_servers SET state = 'stopped', stopped_at = now() WHERE id = $1 AND project_id = $2`,
    [id, projectId],
  );
}
/**
 * Return the last lines of a dev server's log file from inside the project's
 * dev container, or `(no log yet)` when `tail` fails (e.g. the file is missing).
 *
 * @param projectId Project whose dev container is read.
 * @param id        Dev server id; the log is `/var/log/vibn-dev/<id>.log`.
 * @param lines     Number of trailing lines, clamped to 1..2000 and truncated
 *                  to an integer (NaN falls back to the default of 200).
 * @returns The stdout of the `tail` command.
 */
export async function tailDevServerLog(
  projectId: string,
  id: string,
  lines = 200,
): Promise<string> {
  // `tail -n` only accepts an integer; NaN or a fraction would make it fail
  // and silently report "(no log yet)".
  const requested = Number.isNaN(lines) ? 200 : lines;
  const lineCount = Math.trunc(Math.max(1, Math.min(2000, requested)));
  // Quote the path so an unexpected `id` cannot inject shell syntax.
  const logPath = shellEscape(`/var/log/vibn-dev/${id}.log`);
  const r = await execInDevContainer({
    projectId,
    command: `tail -n ${lineCount} ${logPath} 2>/dev/null || echo '(no log yet)'`,
    timeoutMs: 5_000,
  });
  return r.stdout;
}
// ── Auto-push autosave ───────────────────────────────────────────────
//
// Treats Gitea as the canonical store; the container disk is ephemeral.
// On every chat turn (or every 5 min, whichever comes first) we push
// /workspace/<projectSlug> to a `vibn-autosave/main` branch in the
// project's repo.
//
// We don't try to be clever about what changed — just `git add -A`, a
// `git commit -m "autosave <timestamp>"` when anything is staged, and a
// force push. If the repo doesn't exist yet (fresh project, no `git init`
// done), we skip silently — the AI is responsible for `git init` + first
// push when it scaffolds.
/** Options accepted by `autosaveWorkspace`. */
export interface AutosaveOpts {
/** Project whose dev container is autosaved; also keys the throttle lookup. */
projectId: string;
/** Project slug; the autosave runs in `/workspace/<projectSlug>` inside the container. */
projectSlug: string;
/** Owning workspace; its `slug` is recorded on each autosave row. */
workspace: VibnWorkspace;
/**
 * Repo name in the workspace's Gitea org. Defaults to projectSlug.
 * NOTE(review): in this file the value is only recorded on the autosave row;
 * the push itself targets the checkout's existing `origin` remote.
 */
repo?: string;
/** Min interval between autosaves (default 5 min). */
minIntervalMs?: number;
}
/**
 * Commit whatever changed in `/workspace/<projectSlug>` and force-push it to
 * the `vibn-autosave/main` branch, at most once per `minIntervalMs`.
 *
 * Every attempt that reaches the container is recorded in fs_dev_autosaves
 * (output + exit code), which is also what the throttle reads.
 *
 * @param opts See {@link AutosaveOpts}.
 * @returns `ran: true` with `pushedAt` only when the push actually succeeded.
 *          Otherwise `ran: false` with a `reason`: no container, container not
 *          running, throttled, no `.git` in the workspace, a non-zero exit
 *          code from the script, or the exec error message.
 */
export async function autosaveWorkspace(opts: AutosaveOpts): Promise<{
  ran: boolean;
  reason: string;
  pushedAt?: Date;
}> {
  const row = await getDevContainerRow(opts.projectId);
  if (!row) return { ran: false, reason: "no dev container" };
  if (row.state !== "running")
    return { ran: false, reason: `state=${row.state}` };

  // Make sure the table exists before the throttle lookup reads from it.
  await ensureAutosavesTable();

  // Throttle: don't autosave more than once per minIntervalMs. A failed
  // lookup is treated as "never saved" rather than blocking the autosave.
  const minInterval = opts.minIntervalMs ?? 5 * 60_000;
  const last = await queryOne<{ pushed_at: Date }>(
    `SELECT pushed_at FROM fs_dev_autosaves WHERE project_id = $1 ORDER BY pushed_at DESC LIMIT 1`,
    [opts.projectId],
  ).catch(() => null);
  if (last && Date.now() - new Date(last.pushed_at).getTime() < minInterval) {
    return { ran: false, reason: "throttled" };
  }

  const repo = opts.repo ?? opts.projectSlug;
  // Quoted so an unusual slug cannot break out of the `cd` argument.
  const workdir = shellEscape(`/workspace/${opts.projectSlug}`);
  // Printed by the script when there is nothing to push from; checked below.
  const skippedMarker = "(no .git, skipping autosave)";
  // The git identity config is idempotent. The push status is captured
  // explicitly because in `git push | tail` only tail's exit status survives,
  // which would hide a failed push.
  const cmd = `set -e
cd ${workdir}
if [ ! -d .git ]; then
echo '${skippedMarker}'
exit 0
fi
git config user.email vibn-bot@vibnai.com
git config user.name 'Vibn Autosave'
# Force push to the autosave branch — never collides with main.
git checkout -B vibn-autosave/main 2>&1 | tail -1
git add -A
if git diff --cached --quiet; then
echo '(no changes)'
else
git commit -m "autosave $(date -Is)" --quiet
fi
push_out=$(git push -f origin vibn-autosave/main 2>&1) && push_rc=0 || push_rc=$?
printf '%s\\n' "$push_out" | tail -3
exit $push_rc`;

  try {
    const r = await execInDevContainer({
      projectId: opts.projectId,
      command: cmd,
      timeoutMs: 30_000,
    });
    // Record the attempt regardless of outcome: it is the audit trail and
    // keeps a persistently failing autosave throttled.
    await query(
      `INSERT INTO fs_dev_autosaves (project_id, workspace, repo, output, code)
       VALUES ($1, $2, $3, $4, $5)`,
      [
        opts.projectId,
        opts.workspace.slug,
        repo,
        (r.stdout + r.stderr).slice(0, 4000),
        r.code,
      ],
    );
    if (r.code !== 0) {
      return { ran: false, reason: `autosave exited with code ${r.code}` };
    }
    if (r.stdout.includes(skippedMarker)) {
      return { ran: false, reason: "no .git in workspace" };
    }
    return { ran: true, reason: "pushed", pushedAt: new Date() };
  } catch (err) {
    return {
      ran: false,
      reason: err instanceof Error ? err.message : String(err),
    };
  }
}
/** Set once the autosave table and its index have been created by this process. */
let autosavesTableReady = false;

/**
 * Create the fs_dev_autosaves table and its (project_id, pushed_at) index if
 * they do not exist yet. The DDL is issued only until it first succeeds;
 * later calls return immediately.
 */
async function ensureAutosavesTable(): Promise<void> {
  if (!autosavesTableReady) {
    await query(
      `CREATE TABLE IF NOT EXISTS fs_dev_autosaves (
id BIGSERIAL PRIMARY KEY,
project_id TEXT NOT NULL,
workspace TEXT NOT NULL,
repo TEXT NOT NULL,
output TEXT,
code INTEGER,
pushed_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS fs_dev_autosaves_project_idx ON fs_dev_autosaves (project_id, pushed_at DESC);`,
      [],
    );
    autosavesTableReady = true;
  }
}
// ── Idle suspend ─────────────────────────────────────────────────────
/** Summary returned by one run of `suspendIdleContainers`. */
export interface IdleSweepResult {
/** Number of running containers whose last activity was before the idle cutoff. */
scanned: number;
/** Containers suspended successfully, with the whole minutes of idleness measured right after suspension. */
suspended: Array<{ projectId: string; idleMin: number }>;
/** Containers whose suspension threw, with the error message. */
errors: Array<{ projectId: string; error: string }>;
}
/**
 * Sweep for running dev containers whose `last_active_at` is older than
 * `idleMinutes` minutes and suspend each of them in turn. Designed to be
 * called from a cron roughly every 5 minutes. Containers that are not in the
 * `running` state are never selected, so repeated runs leave them alone.
 * A failure on one container is collected in `errors` and does not stop the
 * sweep.
 *
 * @param idleMinutes Idle threshold in minutes (default 30).
 * @returns Counts and per-project outcomes of the sweep.
 */
export async function suspendIdleContainers(
  idleMinutes = 30,
): Promise<IdleSweepResult> {
  await ensureDevContainersTable();

  const idleSince = new Date(Date.now() - idleMinutes * 60_000);
  const candidates = await query<DevContainerRow>(
    `SELECT * FROM fs_project_dev_containers
WHERE state = 'running' AND last_active_at < $1`,
    [idleSince],
  );

  const suspended: IdleSweepResult["suspended"] = [];
  const errors: IdleSweepResult["errors"] = [];

  for (const container of candidates) {
    const projectId = container.project_id;
    try {
      await suspendDevContainer(projectId);
      // Idle time is measured after the suspend call has completed.
      const idleMs = Date.now() - new Date(container.last_active_at).getTime();
      suspended.push({ projectId, idleMin: Math.floor(idleMs / 60_000) });
    } catch (err) {
      const error = err instanceof Error ? err.message : String(err);
      errors.push({ projectId, error });
    }
  }

  return { scanned: candidates.length, suspended, errors };
}