fix(mcp v2.4.2): apps.create reports started=true on partial sidecar failure

Coolify's `compose up -d` returns non-zero whenever any sidecar container
hits a `depends_on: condition: service_healthy` timeout. For slow-booting
apps like Twenty (where the worker waits ~3 min for twenty's healthcheck),
this caused apps.create to return started=false even when the primary
stack was running fine.

Now ensureServiceUp probes the host with `docker ps` after a non-zero
compose exit (or a thrown composeUp error) and returns started=true
whenever any container whose name matches the service UUID is running,
surfacing the tail of the compose output (stderr, else stdout) or the
error message in startDiag so agents can decide whether to retry
apps.containers.up later.

Made-with: Cursor
This commit is contained in:
2026-04-23 20:12:03 -07:00
parent 62cb77b5a7
commit efb2082400

View File

@@ -86,7 +86,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
export async function GET() { export async function GET() {
return NextResponse.json({ return NextResponse.json({
name: 'vibn-mcp', name: 'vibn-mcp',
version: '2.4.1', version: '2.4.2',
authentication: { authentication: {
scheme: 'Bearer', scheme: 'Bearer',
tokenPrefix: 'vibn_sk_', tokenPrefix: 'vibn_sk_',
@@ -893,8 +893,8 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
startMethod, startMethod,
...(startDiag ? { startDiag } : {}), ...(startDiag ? { startDiag } : {}),
note: started note: started
? 'Containers are up. First boot may take 1-5 min while images finish pulling and migrations run. Use apps.logs to monitor.' ? 'Primary containers are up. First boot may take 1-5 min while images finish pulling and migrations run; use apps.logs to monitor. If startDiag mentions a sidecar dependency timeout (workers, schedulers), call apps.containers.up again once the primary is healthy to bring those up.'
: 'Service created but containers did not start. Call apps.containers.up to retry, or apps.logs to diagnose.', : 'Service created but no containers started. Call apps.containers.up to retry; check apps.containers.ps and apps.logs to diagnose.',
}, },
}); });
} }
@@ -1270,17 +1270,39 @@ async function ensureServiceUp(uuid: string): Promise<{
} }
// 3. Fallback — run docker compose up -d ourselves // 3. Fallback — run docker compose up -d ourselves
let composeDiag = '';
try { try {
const r = await composeUp('service', uuid, { timeoutMs: 600_000 }); const r = await composeUp('service', uuid, { timeoutMs: 600_000 });
composeDiag = (r.stderr || r.stdout).trim().slice(-400);
if (r.code === 0) { if (r.code === 0) {
return { started: true, startMethod: 'compose-up', diag: '' }; return { started: true, startMethod: 'compose-up', diag: '' };
} }
// Non-zero exit but compose ran — capture the tail for diagnosis // Non-zero exit DOES NOT mean nothing started. Compose returns
const tail = (r.stderr || r.stdout).trim().slice(-400); // non-zero whenever any service hits a `depends_on:
return { started: false, startMethod: 'failed', diag: tail }; // condition: service_healthy` timeout — common for sidecar
// containers (workers, schedulers) of apps with slow-booting
// primary services (Twenty's worker waits on twenty's healthcheck,
// which takes 2-5 min). Probe the host to see what's actually
// running before declaring failure.
} catch (e) { } catch (e) {
return { started: false, startMethod: 'failed', diag: e instanceof Error ? e.message : String(e) }; composeDiag = e instanceof Error ? e.message : String(e);
} }
try {
const probe = await runOnCoolifyHost(
`docker ps --filter name=${uuid} --format '{{.Names}}'`,
{ timeoutMs: 8_000 },
);
if (probe.stdout.trim().length > 0) {
// Something IS running — partial success. Surface the diag so
// agents see WHY compose returned non-zero (usually a sidecar
// depends_on timeout) but report started=true so happy-path
// workflows don't panic.
return { started: true, startMethod: 'compose-up', diag: composeDiag };
}
} catch { /* fall through */ }
return { started: false, startMethod: 'failed', diag: composeDiag };
} }
/** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */ /** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */