feat(mcp v2.4.1): apps.containers.{up,ps} + auto-fallback for queued-start

Coolify's POST /services/{uuid}/start writes the rendered compose
files but its Laravel queue worker routinely fails to actually
invoke `docker compose up -d`. Until now agents had to SSH to
recover. For an MVP that promises "tell vibn what app you want,
get a URL", that's unacceptable.

- lib/coolify-compose.ts: composeUp/composeDown/composePs over SSH
  via a one-shot docker:cli container that bind-mounts the rendered
  compose dir (works around vibn-logs being in docker group but not
  having read access to /data/coolify/services).
- apps.create (template + composeRaw pathways) now uses
  ensureServiceUp which probes whether Coolify's queue actually
  spawned containers and falls back to direct docker compose up -d
  if not. Result includes startMethod for visibility.
- apps.containers.up / apps.containers.ps exposed as MCP tools for
  recovery scenarios and post-env-change recreations.
- Tenant safety: resolveAppOrService validates uuid against the
  caller's project before touching anything on the host.

Made-with: Cursor
This commit is contained in:
2026-04-23 18:41:42 -07:00
parent e453e780cc
commit 62cb77b5a7
2 changed files with 320 additions and 17 deletions

View File

@@ -29,6 +29,7 @@ import { VIBN_GCS_LOCATION } from '@/lib/gcp/storage';
import { getApplicationRuntimeLogs } from '@/lib/coolify-logs'; import { getApplicationRuntimeLogs } from '@/lib/coolify-logs';
import { execInCoolifyApp } from '@/lib/coolify-exec'; import { execInCoolifyApp } from '@/lib/coolify-exec';
import { isCoolifySshConfigured, runOnCoolifyHost } from '@/lib/coolify-ssh'; import { isCoolifySshConfigured, runOnCoolifyHost } from '@/lib/coolify-ssh';
import { composeUp, composePs, type ResourceKind } from '@/lib/coolify-compose';
import { listContainersForApp } from '@/lib/coolify-containers'; import { listContainersForApp } from '@/lib/coolify-containers';
import { import {
deployApplication, deployApplication,
@@ -85,7 +86,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
export async function GET() { export async function GET() {
return NextResponse.json({ return NextResponse.json({
name: 'vibn-mcp', name: 'vibn-mcp',
version: '2.4.0', version: '2.4.1',
authentication: { authentication: {
scheme: 'Bearer', scheme: 'Bearer',
tokenPrefix: 'vibn_sk_', tokenPrefix: 'vibn_sk_',
@@ -115,6 +116,8 @@ export async function GET() {
'apps.exec', 'apps.exec',
'apps.volumes.list', 'apps.volumes.list',
'apps.volumes.wipe', 'apps.volumes.wipe',
'apps.containers.up',
'apps.containers.ps',
'apps.templates.list', 'apps.templates.list',
'apps.templates.search', 'apps.templates.search',
'apps.envs.list', 'apps.envs.list',
@@ -217,6 +220,10 @@ export async function POST(request: Request) {
return await toolAppsVolumesList(principal, params); return await toolAppsVolumesList(principal, params);
case 'apps.volumes.wipe': case 'apps.volumes.wipe':
return await toolAppsVolumesWipe(principal, params); return await toolAppsVolumesWipe(principal, params);
case 'apps.containers.up':
return await toolAppsContainersUp(principal, params);
case 'apps.containers.ps':
return await toolAppsContainersPs(principal, params);
case 'apps.templates.list': case 'apps.templates.list':
return await toolAppsTemplatesList(params); return await toolAppsTemplatesList(params);
case 'apps.templates.search': case 'apps.templates.search':
@@ -867,13 +874,10 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
} }
let started = false; let started = false;
let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed';
let startDiag = '';
if (params.instantDeploy !== false) { if (params.instantDeploy !== false) {
try { ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid));
await startService(created.uuid);
started = true;
} catch (e) {
console.warn('[mcp apps.create/template] service start failed', e);
}
} }
return NextResponse.json({ return NextResponse.json({
@@ -886,9 +890,11 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
template: templateSlug, template: templateSlug,
urlsApplied, urlsApplied,
started, started,
startMethod,
...(startDiag ? { startDiag } : {}),
note: started note: started
? 'Service start was queued. First boot may take 1-5 min while Coolify pulls images and runs migrations. Use apps.logs to monitor.' ? 'Containers are up. First boot may take 1-5 min while images finish pulling and migrations run. Use apps.logs to monitor.'
: 'Service created but not yet started. Call apps.deploy to start it.', : 'Service created but containers did not start. Call apps.containers.up to retry, or apps.logs to diagnose.',
}, },
}); });
} }
@@ -945,15 +951,11 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
} }
} }
// Optionally start the service
let started = false; let started = false;
let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed';
let startDiag = '';
if (params.instantDeploy !== false) { if (params.instantDeploy !== false) {
try { ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid));
await startService(created.uuid);
started = true;
} catch (e) {
console.warn('[mcp apps.create/composeRaw] service start failed', e);
}
} }
return NextResponse.json({ return NextResponse.json({
@@ -964,7 +966,9 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
url: `https://${fqdn}`, url: `https://${fqdn}`,
resourceType: 'service', resourceType: 'service',
started, started,
note: 'Domain routing for compose services must be configured in Coolify after initial startup — set SERVER_URL env to the desired URL.', startMethod,
...(startDiag ? { startDiag } : {}),
note: 'Domain routing for compose services must be configured after initial startup — set SERVER_URL env to the desired URL, then call apps.containers.up to apply.',
}, },
}); });
} }
@@ -1036,6 +1040,113 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
}); });
} }
// ──────────────────────────────────────────────────
// apps.containers.* — direct lifecycle for compose stacks
// ──────────────────────────────────────────────────
//
// These bypass Coolify's queued-start worker (which is unreliable for
// compose Services) and run `docker compose up -d` / `ps` against the
// rendered compose dir on the Coolify host. Used as the recovery
// path when Coolify's start API returns "queued" but no containers
// materialise.
//
// Tenant safety: the uuid is resolved via getApplicationInProject /
// getServiceInProject, so a workspace can't drive containers it
// doesn't own.
/** Resolve a uuid to either an Application or a compose Service in the
 * caller's project. Returns the canonical resource kind for
 * coolify-compose helpers. NextResponse on policy error / not found.
 *
 * Tenant safety: both lookups are scoped to the caller's Coolify
 * project, so a workspace can never resolve (and therefore never
 * drive containers for) a resource it doesn't own. */
async function resolveAppOrService(
  principal: Principal,
  uuid: string,
): Promise<{ uuid: string; kind: ResourceKind } | NextResponse> {
  const projectUuid = requireCoolifyProject(principal);
  if (projectUuid instanceof NextResponse) return projectUuid;

  // Try Application first (the common case for vibn-created apps).
  try {
    await getApplicationInProject(uuid, projectUuid);
    return { uuid, kind: 'application' };
  } catch (e) {
    if (!(e instanceof Error && /404|not found/i.test(e.message))) {
      // Tenant errors and other unexpected ones — surface them
      if (e instanceof TenantError) return NextResponse.json({ error: e.message }, { status: 403 });
      throw e;
    }
  }

  // Fall through to compose Service.
  try {
    await getServiceInProject(uuid, projectUuid);
    return { uuid, kind: 'service' };
  } catch (e) {
    if (e instanceof TenantError) {
      return NextResponse.json({ error: e.message }, { status: 403 });
    }
    // BUG FIX: previously ANY error here (network failure, Coolify 500)
    // was reported to the agent as a 404 "not found", hiding real
    // infrastructure problems. Only genuine not-found errors become
    // 404; everything else propagates, matching the first catch above.
    if (e instanceof Error && /404|not found/i.test(e.message)) {
      return NextResponse.json({ error: `App or service ${uuid} not found in this workspace` }, { status: 404 });
    }
    throw e;
  }
}
/**
 * apps.containers.up — runs `docker compose up -d` against the
 * rendered compose dir on the Coolify host.
 *
 * Intended for two situations: Coolify's queued-start left the stack
 * in "Created" / "no containers" state, or env vars / domains were
 * edited and the containers must be recreated to pick up the
 * regenerated compose env file.
 *
 * Idempotent — containers already in the desired state are left
 * alone. Returns `{ ok, code, stdout, stderr, durationMs }` so agents
 * can show the user exactly what happened.
 */
async function toolAppsContainersUp(principal: Principal, params: Record<string, any>) {
  const uuid = String(params.uuid ?? params.appUuid ?? '').trim();
  if (!uuid) {
    return NextResponse.json({ error: 'Param "uuid" is required' }, { status: 400 });
  }
  if (!isCoolifySshConfigured()) {
    return NextResponse.json({ error: 'apps.containers.up requires SSH to the Coolify host' }, { status: 501 });
  }

  // Tenant check: resolves only within the caller's project.
  const resolved = await resolveAppOrService(principal, uuid);
  if (resolved instanceof NextResponse) return resolved;

  const startedAt = Date.now();
  const run = await composeUp(resolved.kind, resolved.uuid, { timeoutMs: 600_000 });
  const result = {
    ok: run.code === 0,
    code: run.code,
    // Tail-truncate so huge pull logs don't blow up the MCP payload.
    stdout: run.stdout.slice(-4000),
    stderr: run.stderr.slice(-4000),
    truncated: run.truncated,
    durationMs: Date.now() - startedAt,
  };
  return NextResponse.json({ result });
}
/**
 * apps.containers.ps — `docker compose ps -a` for diagnostics.
 *
 * Returns a one-line-per-container summary including names, image,
 * state, and exit codes. Use to check whether containers are stuck
 * in `Created` (Coolify queued-start failure) vs `Exited` (app crash)
 * vs `Restarting` (boot loop).
 */
async function toolAppsContainersPs(principal: Principal, params: Record<string, any>) {
  const uuid = String(params.uuid ?? params.appUuid ?? '').trim();
  if (!uuid) return NextResponse.json({ error: 'Param "uuid" is required' }, { status: 400 });
  if (!isCoolifySshConfigured()) {
    return NextResponse.json({ error: 'apps.containers.ps requires SSH to the Coolify host' }, { status: 501 });
  }
  // Tenant check: resolves only within the caller's project.
  const resolved = await resolveAppOrService(principal, uuid);
  if (resolved instanceof NextResponse) return resolved;
  const r = await composePs(resolved.kind, resolved.uuid);
  return NextResponse.json({
    result: {
      ok: r.code === 0,
      // `code` and `truncated` added for consistency with
      // apps.containers.up, so agents can use one result shape.
      code: r.code,
      stdout: r.stdout.slice(-4000),
      stderr: r.stderr.slice(-2000),
      truncated: r.truncated,
    },
  });
}
// ────────────────────────────────────────────────── // ──────────────────────────────────────────────────
// apps.templates.* — Coolify one-click catalog browse // apps.templates.* — Coolify one-click catalog browse
// ────────────────────────────────────────────────── // ──────────────────────────────────────────────────
@@ -1106,6 +1217,72 @@ async function toolAppsTemplatesSearch(params: Record<string, any>) {
return NextResponse.json({ result: { items } }); return NextResponse.json({ result: { items } });
} }
/**
 * Ensure a Coolify Service is actually running (containers exist),
 * with a fallback path for Coolify's flaky queued-start worker.
 *
 * Strategy:
 *   1. Call POST /services/{uuid}/start so Coolify's records show
 *      "starting" and any internal hooks fire.
 *   2. Wait briefly, then probe the host for any container belonging
 *      to this service via `docker ps --filter name={uuid}`.
 *   3. If no containers materialised, run `docker compose up -d`
 *      directly via SSH against the rendered compose dir. This is
 *      the same command Coolify's worker would run; we just bypass
 *      the unreliable queue.
 *
 * Returns:
 *   started      true if at least one container is running. When SSH
 *                is unavailable we cannot probe: the start-API result
 *                is reported optimistically instead.
 *   startMethod  which path got us there
 *   diag         human-readable note for failures (truncated stderr)
 */
async function ensureServiceUp(uuid: string): Promise<{
  started: boolean;
  startMethod: 'coolify-queue' | 'compose-up' | 'failed';
  diag: string;
}> {
  // 1. Ask Coolify nicely
  let queueAccepted = true;
  try {
    await startService(uuid);
  } catch (e) {
    queueAccepted = false;
    console.warn('[ensureServiceUp] startService failed (will fall back)', e);
  }

  // 2. Probe — has the queue actually started anything?
  if (!isCoolifySshConfigured()) {
    // No SSH: we can neither verify nor fall back.
    // BUG FIX: previously this returned started:true unconditionally,
    // even when the start API call itself had thrown above.
    return queueAccepted
      ? { started: true, startMethod: 'coolify-queue', diag: '' }
      : {
          started: false,
          startMethod: 'failed',
          diag: 'Coolify start API failed and no SSH fallback is configured',
        };
  }

  // Allow up to ~12s for the worker to wake up; checking every 3s.
  for (let attempt = 0; attempt < 4; attempt++) {
    await new Promise(r => setTimeout(r, 3_000));
    try {
      const probe = await runOnCoolifyHost(
        `docker ps --filter name=${uuid} --format '{{.Names}}'`,
        { timeoutMs: 8_000 },
      );
      if (probe.stdout.trim().length > 0) {
        return { started: true, startMethod: 'coolify-queue', diag: '' };
      }
    } catch (e) {
      console.warn('[ensureServiceUp] probe failed', e);
    }
  }

  // 3. Fallback — run docker compose up -d ourselves
  try {
    const r = await composeUp('service', uuid, { timeoutMs: 600_000 });
    if (r.code === 0) {
      return { started: true, startMethod: 'compose-up', diag: '' };
    }
    // Non-zero exit but compose ran — capture the tail for diagnosis
    const tail = (r.stderr || r.stdout).trim().slice(-400);
    return { started: false, startMethod: 'failed', diag: tail };
  } catch (e) {
    return { started: false, startMethod: 'failed', diag: e instanceof Error ? e.message : String(e) };
  }
}
/** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */ /** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */
function resolveFqdn(domainParam: unknown, slug: string, appName: string): string | NextResponse { function resolveFqdn(domainParam: unknown, slug: string, appName: string): string | NextResponse {
const fqdn = String(domainParam ?? '').trim() const fqdn = String(domainParam ?? '').trim()

126
lib/coolify-compose.ts Normal file
View File

@@ -0,0 +1,126 @@
/**
* Bring a Coolify Service or compose Application up via raw
* `docker compose up -d`.
*
* Why this exists
* ---------------
* Coolify's `POST /services/{uuid}/start` and `POST /deploy` endpoints
* write the rendered docker-compose.yml + .env to
* `/data/coolify/services/{uuid}/` (or `applications/{uuid}/` for
* compose apps), then enqueue a Laravel job to run
* `docker compose up -d`. In practice that worker queue is unreliable:
* it routinely returns "Service starting request queued" and then
* never actually invokes docker compose. The user's stack just sits
* there with rendered files and no containers.
*
* For a hands-off SaaS we can't ship that experience. This helper
* does the work directly via SSH, so a single MCP `apps.create` call
* really does leave a running app.
*
* Permissions model
* -----------------
* The `vibn-logs` SSH user (created by deploy/setup-coolify-ssh.sh)
* is in the `docker` group but has no shell sudo. It also can't read
* `/data/coolify/services/` directly because Coolify chmods that to
* 700 root. We work around both constraints by running the docker
* CLI inside a one-shot container that bind-mounts the path. The
* docker daemon runs as root so it can read the directory; the
* `vibn-logs` user only needs `docker` socket access.
*/
import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh';
/** Slug for the Coolify-managed compose dir. */
export type ResourceKind = 'service' | 'application';

/** Absolute path of the rendered compose dir on the Coolify host.
 * Coolify v4 path layout — these are stable across the v4 line. */
function composeDir(kind: ResourceKind, uuid: string): string {
  const bucket = kind === 'service' ? 'services' : 'applications';
  return `/data/coolify/${bucket}/${uuid}`;
}
/** Shell-quote a single argument as a POSIX single-quoted string.
 * Embedded single quotes are closed, backslash-escaped, and reopened
 * (the standard `'\''` dance). */
function sq(s: string): string {
  const escaped = String(s).split(`'`).join(`'\\''`);
  return `'${escaped}'`;
}
/**
 * Run a `docker compose` subcommand inside the rendered compose
 * directory using a one-shot `docker:cli` container. Falls back to
 * pulling the image on the first call.
 *
 * The `docker:cli` image (~50MB) is the official Docker CLI without
 * the daemon. By bind-mounting the host docker socket it talks to
 * the host's daemon, so containers it creates are first-class
 * children of the same Docker engine — exactly what we want.
 */
async function composeRun(
  kind: ResourceKind,
  uuid: string,
  args: string[],
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const dir = composeDir(kind, uuid);
  // Bind-mount the rendered dir at /work and make it the workdir so
  // docker compose finds compose.yml + .env automatically; `--rm`
  // cleans the helper container after each call.
  const pieces: string[] = ['docker', 'run', '--rm'];
  pieces.push('-v', sq(`${dir}:/work`));
  pieces.push('-w', '/work');
  pieces.push('-v', '/var/run/docker.sock:/var/run/docker.sock');
  pieces.push('--network', 'host');
  pieces.push('docker:cli', 'compose');
  for (const arg of args) pieces.push(sq(arg));
  return runOnCoolifyHost(pieces.join(' '), {
    timeoutMs: opts.timeoutMs ?? 600_000,
    maxBytes: 2_000_000,
  });
}
/**
 * `docker compose up -d --remove-orphans` for a Coolify service or
 * compose app.
 *
 * Safe to repeat: containers already in the desired state are left
 * untouched by Compose. The raw SSH result is returned so callers can
 * surface diagnostics on failure (most common: image-pull errors,
 * port conflicts).
 */
export async function composeUp(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const upArgs = ['up', '-d', '--remove-orphans'];
  return composeRun(kind, uuid, upArgs, opts);
}
/** `docker compose down` — stops + removes containers; named volumes
 * are preserved (no `-v` flag). */
export async function composeDown(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  return composeRun(kind, uuid, ['down'], opts);
}
/** `docker compose ps -a` — lists all containers in the stack,
 * including stopped ones. Useful for diagnosing why `up` didn't
 * yield healthy containers. */
export async function composePs(
  kind: ResourceKind,
  uuid: string,
): Promise<CoolifySshResult> {
  const psArgs = ['ps', '-a', '--format', 'table'];
  return composeRun(kind, uuid, psArgs, { timeoutMs: 30_000 });
}
/**
 * Verify the rendered compose dir exists and contains a compose file
 * before trying to run docker compose against it. Returns a boolean
 * (false on missing) instead of letting callers hit an opaque error.
 *
 * We can't `ls` the dir directly (perm denied for the SSH user), so
 * we probe from inside a one-shot container that bind-mounts the
 * path read-only. CAVEAT: if the host path is missing entirely, the
 * docker daemon auto-creates it as an empty root-owned dir as a side
 * effect of the bind-mount — the probe still correctly reports
 * MISSING (no compose file inside), but the empty dir is left behind.
 */
export async function composeDirExists(
  kind: ResourceKind,
  uuid: string,
): Promise<boolean> {
  const dir = composeDir(kind, uuid);
  // Coolify renders docker-compose.yml, but accept every filename the
  // Compose spec recognises, for robustness across Coolify versions.
  const probe =
    'test -f /w/docker-compose.yml || test -f /w/docker-compose.yaml || ' +
    'test -f /w/compose.yml || test -f /w/compose.yaml';
  const cmd = `docker run --rm -v ${sq(`${dir}:/w:ro`)} alpine sh -c ${sq(`${probe} && echo OK || echo MISSING`)}`;
  const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 });
  // Require a clean exit too: an SSH/docker failure must not be
  // mistaken for a definitive answer.
  return r.code === 0 && r.stdout.trim().endsWith('OK');
}