fix(mcp v2.4.2): apps.create reports started=true on partial sidecar failure
Coolify's `compose up -d` returns a non-zero exit code whenever any sidecar container hits a `depends_on: condition: service_healthy` timeout. For slow-booting apps like Twenty (whose worker waits ~3 min for twenty's healthcheck), this caused apps.create to report started=false even when the primary stack was running fine. ensureServiceUp now probes the host with `docker ps` after a non-zero compose exit and returns started=true whenever any container is running, while surfacing the compose stderr tail in startDiag so agents can decide whether to retry apps.containers.up later. Made-with: Cursor
This commit is contained in:
@@ -86,7 +86,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
|
||||
export async function GET() {
|
||||
return NextResponse.json({
|
||||
name: 'vibn-mcp',
|
||||
version: '2.4.1',
|
||||
version: '2.4.2',
|
||||
authentication: {
|
||||
scheme: 'Bearer',
|
||||
tokenPrefix: 'vibn_sk_',
|
||||
@@ -893,8 +893,8 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
|
||||
startMethod,
|
||||
...(startDiag ? { startDiag } : {}),
|
||||
note: started
|
||||
? 'Containers are up. First boot may take 1-5 min while images finish pulling and migrations run. Use apps.logs to monitor.'
|
||||
: 'Service created but containers did not start. Call apps.containers.up to retry, or apps.logs to diagnose.',
|
||||
? 'Primary containers are up. First boot may take 1-5 min while images finish pulling and migrations run; use apps.logs to monitor. If startDiag mentions a sidecar dependency timeout (workers, schedulers), call apps.containers.up again once the primary is healthy to bring those up.'
|
||||
: 'Service created but no containers started. Call apps.containers.up to retry; check apps.containers.ps and apps.logs to diagnose.',
|
||||
},
|
||||
});
|
||||
}
|
||||
@@ -1270,17 +1270,39 @@ async function ensureServiceUp(uuid: string): Promise<{
|
||||
}
|
||||
|
||||
// 3. Fallback — run docker compose up -d ourselves
|
||||
let composeDiag = '';
|
||||
try {
|
||||
const r = await composeUp('service', uuid, { timeoutMs: 600_000 });
|
||||
composeDiag = (r.stderr || r.stdout).trim().slice(-400);
|
||||
if (r.code === 0) {
|
||||
return { started: true, startMethod: 'compose-up', diag: '' };
|
||||
}
|
||||
// Non-zero exit but compose ran — capture the tail for diagnosis
|
||||
const tail = (r.stderr || r.stdout).trim().slice(-400);
|
||||
return { started: false, startMethod: 'failed', diag: tail };
|
||||
// Non-zero exit DOES NOT mean nothing started. Compose returns
|
||||
// non-zero whenever any service hits a `depends_on:
|
||||
// condition: service_healthy` timeout — common for sidecar
|
||||
// containers (workers, schedulers) of apps with slow-booting
|
||||
// primary services (Twenty's worker waits on twenty's healthcheck,
|
||||
// which takes 2-5 min). Probe the host to see what's actually
|
||||
// running before declaring failure.
|
||||
} catch (e) {
|
||||
return { started: false, startMethod: 'failed', diag: e instanceof Error ? e.message : String(e) };
|
||||
composeDiag = e instanceof Error ? e.message : String(e);
|
||||
}
|
||||
|
||||
try {
|
||||
const probe = await runOnCoolifyHost(
|
||||
`docker ps --filter name=${uuid} --format '{{.Names}}'`,
|
||||
{ timeoutMs: 8_000 },
|
||||
);
|
||||
if (probe.stdout.trim().length > 0) {
|
||||
// Something IS running — partial success. Surface the diag so
|
||||
// agents see WHY compose returned non-zero (usually a sidecar
|
||||
// depends_on timeout) but report started=true so happy-path
|
||||
// workflows don't panic.
|
||||
return { started: true, startMethod: 'compose-up', diag: composeDiag };
|
||||
}
|
||||
} catch { /* fall through */ }
|
||||
|
||||
return { started: false, startMethod: 'failed', diag: composeDiag };
|
||||
}
|
||||
|
||||
/** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */
|
||||
|
||||
Reference in New Issue
Block a user