From 62cb77b5a7b39465ff35a6fc33a0c8c78adc56b1 Mon Sep 17 00:00:00 2001 From: Mark Henderson Date: Thu, 23 Apr 2026 18:41:42 -0700 Subject: [PATCH] feat(mcp v2.4.1): apps.containers.{up,ps} + auto-fallback for queued-start Coolify's POST /services/{uuid}/start writes the rendered compose files but its Laravel queue worker routinely fails to actually invoke `docker compose up -d`. Until now agents had to SSH to recover. For an MVP that promises "tell vibn what app you want, get a URL", that's unacceptable. - lib/coolify-compose.ts: composeUp/composeDown/composePs over SSH via a one-shot docker:cli container that bind-mounts the rendered compose dir (works around vibn-logs being in docker group but not having read access to /data/coolify/services). - apps.create (template + composeRaw pathways) now uses ensureServiceUp which probes whether Coolify's queue actually spawned containers and falls back to direct docker compose up -d if not. Result includes startMethod for visibility. - apps.containers.up / apps.containers.ps exposed as MCP tools for recovery scenarios and post-env-change recreations. - Tenant safety: resolveAppOrService validates uuid against the caller's project before touching anything on the host. 
Made-with: Cursor --- app/api/mcp/route.ts | 211 +++++++++++++++++++++++++++++++++++++---- lib/coolify-compose.ts | 126 ++++++++++++++++++++++++ 2 files changed, 320 insertions(+), 17 deletions(-) create mode 100644 lib/coolify-compose.ts diff --git a/app/api/mcp/route.ts b/app/api/mcp/route.ts index 4029703..c4dea23 100644 --- a/app/api/mcp/route.ts +++ b/app/api/mcp/route.ts @@ -29,6 +29,7 @@ import { VIBN_GCS_LOCATION } from '@/lib/gcp/storage'; import { getApplicationRuntimeLogs } from '@/lib/coolify-logs'; import { execInCoolifyApp } from '@/lib/coolify-exec'; import { isCoolifySshConfigured, runOnCoolifyHost } from '@/lib/coolify-ssh'; +import { composeUp, composePs, type ResourceKind } from '@/lib/coolify-compose'; import { listContainersForApp } from '@/lib/coolify-containers'; import { deployApplication, @@ -85,7 +86,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com'; export async function GET() { return NextResponse.json({ name: 'vibn-mcp', - version: '2.4.0', + version: '2.4.1', authentication: { scheme: 'Bearer', tokenPrefix: 'vibn_sk_', @@ -115,6 +116,8 @@ export async function GET() { 'apps.exec', 'apps.volumes.list', 'apps.volumes.wipe', + 'apps.containers.up', + 'apps.containers.ps', 'apps.templates.list', 'apps.templates.search', 'apps.envs.list', @@ -217,6 +220,10 @@ export async function POST(request: Request) { return await toolAppsVolumesList(principal, params); case 'apps.volumes.wipe': return await toolAppsVolumesWipe(principal, params); + case 'apps.containers.up': + return await toolAppsContainersUp(principal, params); + case 'apps.containers.ps': + return await toolAppsContainersPs(principal, params); case 'apps.templates.list': return await toolAppsTemplatesList(params); case 'apps.templates.search': @@ -867,13 +874,10 @@ async function toolAppsCreate(principal: Principal, params: Record) } let started = false; + let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed'; + let startDiag = ''; if 
(params.instantDeploy !== false) { - try { - await startService(created.uuid); - started = true; - } catch (e) { - console.warn('[mcp apps.create/template] service start failed', e); - } + ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid)); } return NextResponse.json({ @@ -886,9 +890,11 @@ async function toolAppsCreate(principal: Principal, params: Record) template: templateSlug, urlsApplied, started, + startMethod, + ...(startDiag ? { startDiag } : {}), note: started - ? 'Service start was queued. First boot may take 1-5 min while Coolify pulls images and runs migrations. Use apps.logs to monitor.' - : 'Service created but not yet started. Call apps.deploy to start it.', + ? 'Containers are up. First boot may take 1-5 min while images finish pulling and migrations run. Use apps.logs to monitor.' + : 'Service created but containers did not start. Call apps.containers.up to retry, or apps.logs to diagnose.', }, }); } @@ -945,15 +951,11 @@ async function toolAppsCreate(principal: Principal, params: Record) } } - // Optionally start the service let started = false; + let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed'; + let startDiag = ''; if (params.instantDeploy !== false) { - try { - await startService(created.uuid); - started = true; - } catch (e) { - console.warn('[mcp apps.create/composeRaw] service start failed', e); - } + ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid)); } return NextResponse.json({ @@ -964,7 +966,9 @@ async function toolAppsCreate(principal: Principal, params: Record) url: `https://${fqdn}`, resourceType: 'service', started, - note: 'Domain routing for compose services must be configured in Coolify after initial startup — set SERVER_URL env to the desired URL.', + startMethod, + ...(startDiag ? 
{ startDiag } : {}), + note: 'Domain routing for compose services must be configured after initial startup — set SERVER_URL env to the desired URL, then call apps.containers.up to apply.', }, }); } @@ -1036,6 +1040,113 @@ async function toolAppsCreate(principal: Principal, params: Record) }); } +// ────────────────────────────────────────────────── +// apps.containers.* — direct lifecycle for compose stacks +// ────────────────────────────────────────────────── +// +// These bypass Coolify's queued-start worker (which is unreliable for +// compose Services) and run `docker compose up -d` / `ps` against the +// rendered compose dir on the Coolify host. Used as the recovery +// path when Coolify's start API returns "queued" but no containers +// materialise. +// +// Tenant safety: the uuid is resolved via getApplicationInProject / +// getServiceInProject, so a workspace can't drive containers it +// doesn't own. + +/** Resolve a uuid to either an Application or a compose Service in the + * caller's project. Returns the canonical resource kind for + * coolify-compose helpers. NextResponse on policy error / not found. 
*/ +async function resolveAppOrService( + principal: Principal, + uuid: string, +): Promise<{ uuid: string; kind: ResourceKind } | NextResponse> { + const projectUuid = requireCoolifyProject(principal); + if (projectUuid instanceof NextResponse) return projectUuid; + try { + await getApplicationInProject(uuid, projectUuid); + return { uuid, kind: 'application' }; + } catch (e) { + if (!(e instanceof Error && /404|not found/i.test(e.message))) { + // Tenant errors and other unexpected ones — surface them + if (e instanceof TenantError) return NextResponse.json({ error: e.message }, { status: 403 }); + throw e; + } + } + try { + await getServiceInProject(uuid, projectUuid); + return { uuid, kind: 'service' }; + } catch (e) { + if (e instanceof TenantError) { + return NextResponse.json({ error: e.message }, { status: 403 }); + } + return NextResponse.json({ error: `App or service ${uuid} not found in this workspace` }, { status: 404 }); + } +} + +/** + * apps.containers.up — `docker compose up -d` against the rendered + * compose dir on the Coolify host. + * + * Use when Coolify's queued-start left the stack in "Created" or + * "no containers" state, or after editing env vars / domains to + * apply the changes (compose env file is regenerated; containers + * need to be recreated to pick it up). + * + * Idempotent — already-running containers are no-op'd. Returns + * `{ ok, code, stdout, stderr, durationMs }` so agents can show the + * user what happened. + */ +async function toolAppsContainersUp(principal: Principal, params: Record) { + const uuid = String(params.uuid ?? params.appUuid ?? 
'').trim(); + if (!uuid) return NextResponse.json({ error: 'Param "uuid" is required' }, { status: 400 }); + if (!isCoolifySshConfigured()) { + return NextResponse.json({ error: 'apps.containers.up requires SSH to the Coolify host' }, { status: 501 }); + } + const resolved = await resolveAppOrService(principal, uuid); + if (resolved instanceof NextResponse) return resolved; + + const t0 = Date.now(); + const r = await composeUp(resolved.kind, resolved.uuid, { timeoutMs: 600_000 }); + return NextResponse.json({ + result: { + ok: r.code === 0, + code: r.code, + stdout: r.stdout.slice(-4000), + stderr: r.stderr.slice(-4000), + truncated: r.truncated, + durationMs: Date.now() - t0, + }, + }); +} + +/** + * apps.containers.ps — `docker compose ps -a` for diagnostics. + * + * Returns a one-line-per-container summary including names, image, + * state, and exit codes. Use to check whether containers are stuck + * in `Created` (Coolify queued-start failure) vs `Exited` (app crash) + * vs `Restarting` (boot loop). + */ +async function toolAppsContainersPs(principal: Principal, params: Record<string, unknown>) { + const uuid = String(params.uuid ?? params.appUuid ?? 
'').trim(); + if (!uuid) return NextResponse.json({ error: 'Param "uuid" is required' }, { status: 400 }); + if (!isCoolifySshConfigured()) { + return NextResponse.json({ error: 'apps.containers.ps requires SSH to the Coolify host' }, { status: 501 }); + } + const resolved = await resolveAppOrService(principal, uuid); + if (resolved instanceof NextResponse) return resolved; + + const r = await composePs(resolved.kind, resolved.uuid); + return NextResponse.json({ + result: { + ok: r.code === 0, + stdout: r.stdout.slice(-4000), + stderr: r.stderr.slice(-2000), + }, + }); +} + // ────────────────────────────────────────────────── // apps.templates.* — Coolify one-click catalog browse // ────────────────────────────────────────────────── @@ -1106,6 +1217,72 @@ async function toolAppsTemplatesSearch(params: Record) { return NextResponse.json({ result: { items } }); } +/** + * Ensure a Coolify Service is actually running (containers exist and + * are healthy/starting), with a fallback path for Coolify's flaky + * queued-start worker. + * + * Strategy: + * 1. Call POST /services/{uuid}/start so Coolify's records show + * "starting" and any internal hooks fire. + * 2. Wait briefly, then probe the host for any container belonging + * to this service via `docker ps --filter name={uuid}`. + * 3. If no containers materialised, run `docker compose up -d` + * directly via SSH against the rendered compose dir. This is + * the same command Coolify's worker would run; we just bypass + * the unreliable queue. + * + * Returns: + * started true if at least one container is running for this service + * startMethod which path got us there + * diag human-readable note for failures (truncated stderr) + */ +async function ensureServiceUp(uuid: string): Promise<{ + started: boolean; + startMethod: 'coolify-queue' | 'compose-up' | 'failed'; + diag: string; +}> { + // 1. 
Ask Coolify nicely + try { + await startService(uuid); + } catch (e) { + console.warn('[ensureServiceUp] startService failed (will fall back)', e); + } + + // 2. Probe — has the queue actually started anything? + if (!isCoolifySshConfigured()) { + return { started: true, startMethod: 'coolify-queue', diag: '' }; + } + // Allow up to ~12s for the worker to wake up; checking every 3s. + for (let i = 0; i < 4; i++) { + await new Promise(r => setTimeout(r, 3_000)); + try { + const probe = await runOnCoolifyHost( + `docker ps --filter name=${uuid} --format '{{.Names}}'`, + { timeoutMs: 8_000 }, + ); + if (probe.stdout.trim().length > 0) { + return { started: true, startMethod: 'coolify-queue', diag: '' }; + } + } catch (e) { + console.warn('[ensureServiceUp] probe failed', e); + } + } + + // 3. Fallback — run docker compose up -d ourselves + try { + const r = await composeUp('service', uuid, { timeoutMs: 600_000 }); + if (r.code === 0) { + return { started: true, startMethod: 'compose-up', diag: '' }; + } + // Non-zero exit but compose ran — capture the tail for diagnosis + const tail = (r.stderr || r.stdout).trim().slice(-400); + return { started: false, startMethod: 'failed', diag: tail }; + } catch (e) { + return { started: false, startMethod: 'failed', diag: e instanceof Error ? e.message : String(e) }; + } +} + /** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */ function resolveFqdn(domainParam: unknown, slug: string, appName: string): string | NextResponse { const fqdn = String(domainParam ?? '').trim() diff --git a/lib/coolify-compose.ts b/lib/coolify-compose.ts new file mode 100644 index 0000000..ced12c9 --- /dev/null +++ b/lib/coolify-compose.ts @@ -0,0 +1,126 @@ +/** + * Bring a Coolify Service or compose Application up via raw + * `docker compose up -d`. 
+ * + * Why this exists + * --------------- + * Coolify's `POST /services/{uuid}/start` and `POST /deploy` endpoints + * write the rendered docker-compose.yml + .env to + * `/data/coolify/services/{uuid}/` (or `applications/{uuid}/` for + * compose apps), then enqueue a Laravel job to run + * `docker compose up -d`. In practice that worker queue is unreliable: + * it routinely returns "Service starting request queued" and then + * never actually invokes docker compose. The user's stack just sits + * there with rendered files and no containers. + * + * For a hands-off SaaS we can't ship that experience. This helper + * does the work directly via SSH, so a single MCP `apps.create` call + * really does leave a running app. + * + * Permissions model + * ----------------- + * The `vibn-logs` SSH user (created by deploy/setup-coolify-ssh.sh) + * is in the `docker` group but has no shell sudo. It also can't read + * `/data/coolify/services/` directly because Coolify chmods that to + * 700 root. We work around both constraints by running the docker + * CLI inside a one-shot container that bind-mounts the path. The + * docker daemon runs as root so it can read the directory; the + * `vibn-logs` user only needs `docker` socket access. + */ + +import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh'; + +/** Slug for the Coolify-managed compose dir. */ +export type ResourceKind = 'service' | 'application'; + +function composeDir(kind: ResourceKind, uuid: string): string { + // Coolify v4 path layout — these are stable across the v4 line. + return kind === 'service' + ? `/data/coolify/services/${uuid}` + : `/data/coolify/applications/${uuid}`; +} + +/** Shell-quote a single argument as a POSIX single-quoted string. */ +function sq(s: string): string { + return `'${String(s).replace(/'/g, `'\\''`)}'`; +} + +/** + * Run a `docker compose` subcommand inside the rendered compose + * directory using a one-shot `docker:cli` container. 
Falls back to + * pulling the image on the first call. + * + * The `docker:cli` image (~50MB) is the official Docker CLI without + * the daemon. By bind-mounting the host docker socket it talks to + * the host's daemon, so containers it creates are first-class + * children of the same Docker engine — exactly what we want. + */ +async function composeRun( + kind: ResourceKind, + uuid: string, + args: string[], + opts: { timeoutMs?: number } = {}, +): Promise { + const dir = composeDir(kind, uuid); + // Use --workdir + bind-mount so docker compose finds compose.yml + .env + // automatically. The `--rm` cleans the helper container after each call. + const cmd = [ + 'docker', 'run', '--rm', + '-v', sq(`${dir}:/work`), + '-w', '/work', + '-v', '/var/run/docker.sock:/var/run/docker.sock', + '--network', 'host', + 'docker:cli', + 'compose', ...args.map(sq), + ].join(' '); + return runOnCoolifyHost(cmd, { timeoutMs: opts.timeoutMs ?? 600_000, maxBytes: 2_000_000 }); +} + +/** + * `docker compose up -d` for a Coolify service or compose app. + * + * Idempotent — Compose already-running containers are no-op'd. + * Returns the raw SSH result so callers can surface diagnostics on + * failure (most common: image-pull errors, port conflicts). + */ +export async function composeUp( + kind: ResourceKind, + uuid: string, + opts: { timeoutMs?: number } = {}, +): Promise { + return composeRun(kind, uuid, ['up', '-d', '--remove-orphans'], opts); +} + +/** `docker compose down` — stops + removes containers; volumes preserved. */ +export async function composeDown( + kind: ResourceKind, + uuid: string, + opts: { timeoutMs?: number } = {}, +): Promise { + return composeRun(kind, uuid, ['down'], opts); +} + +/** `docker compose ps -a` — useful for diagnosing why up didn't yield healthy containers. 
*/ +export async function composePs( + kind: ResourceKind, + uuid: string, +): Promise<CoolifySshResult> { + return composeRun(kind, uuid, ['ps', '-a', '--format', 'table'], { timeoutMs: 30_000 }); +} + +/** + * Verify the rendered compose dir exists before trying to run docker + * compose against it. Returns a friendly false-on-missing instead of + * an opaque ENOENT. + */ +export async function composeDirExists( + kind: ResourceKind, + uuid: string, +): Promise<boolean> { + // We can't `ls` the dir directly (perm denied), but a docker bind-mount + // probe will fail-closed if the path is missing. + const dir = composeDir(kind, uuid); + const cmd = `docker run --rm -v ${sq(`${dir}:/w`)} alpine sh -c 'test -f /w/docker-compose.yml && echo OK || echo MISSING'`; + const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 }); + return r.stdout.trim().endsWith('OK'); +}