fix(mcp v2.4.5): post-deploy fixes replace SSH compose-up fallback

apps.create for service templates now lets Coolify's queue do the full deploy (compose generation, volumes, internal networking, healthchecks) and applies three surgical post-deploy fixes that Coolify's REST API does NOT expose:

1. Rewrites SERVICE_FQDN_* / SERVICE_URL_* in the rendered .env so frontends that bake their backend URL into the SPA bundle (Twenty's SERVER_URL, n8n, etc.) point at the real custom domain instead of the auto-generated sslip.io URL. Without this fix Twenty's frontend loads on the real HTTPS domain but fires XHRs at insecure sslip.io, blocking everything as Mixed Content.

2. Injects the missing traefik.http.services.<svc>.loadbalancer.server.port label. Coolify generates the routing rules but forgets the port, so Traefik logs "error: port is missing" and returns 503 forever.

3. Connects coolify-proxy to the project network (Coolify writes a caddy_ingress_network=<uuid> hint label but never actually runs docker network connect), then force-recreates ONLY the public-facing container so the new env and label apply, and restarts the proxy so Traefik re-discovers the labels.

Polling switches from service.status (which routinely misreports "starting:unknown" while containers are actually healthy) to the truthful per-application service.applications[*].status field.

Removes the SSH "docker compose up -d" fallback that v2.4.1-2.4.4 used. That fallback bypassed Coolify's full pipeline, causing internal services like Postgres/Redis to land on the shared coolify network where DNS aliases collided with coolify-db/coolify-redis, producing the "password authentication failed" regression we saw on Twenty deploys. With v2.4.5 internal services stay on their isolated project network — only the public app crosses to the proxy.

Response shape gains: reachable (boolean for HTTPS 2xx/3xx), appStatus (truthful per-app status from Coolify), postDeploy (step-by-step diagnostic for each of the three fixes). Existing started/startDiag fields kept for back-compat.

apps.containers.up / apps.containers.ps remain unchanged for manual user recovery.

Made-with: Cursor
2026-04-27 14:04:18 -07:00
parent d6b8ba4d67
commit 247b31bf2f
2 changed files with 459 additions and 188 deletions
--- a/app/api/mcp/route.ts
+++ b/app/api/mcp/route.ts
@@ -32,7 +32,8 @@ import { isCoolifySshConfigured, runOnCoolifyHost } from '@/lib/coolify-ssh';
 import {
  composeUp,
  composePs,
-  attachToCoolifyProxyNetwork,
+  applyCoolifyPostDeployFixes,
+  type CoolifyPostDeployResult,
  type ResourceKind,
 } from '@/lib/coolify-compose';
 import { listContainersForApp } from '@/lib/coolify-containers';
@@ -91,7 +92,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
 export async function GET() {
  return NextResponse.json({
    name: 'vibn-mcp',
-    version: '2.4.4',
+    version: '2.4.5',
    authentication: {
      scheme: 'Bearer',
      tokenPrefix: 'vibn_sk_',
@@ -879,10 +880,16 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
    }

    let started = false;
-    let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed';
+    let reachable = false;
+    let appStatus = 'unknown';
+    let postDeploy: CoolifyPostDeployResult | null = null;
    let startDiag = '';
    if (params.instantDeploy !== false) {
-      ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid));
+      ({ started, reachable, appStatus, postDeploy, diag: startDiag } = await ensureServiceReachable({
+        uuid: created.uuid,
+        fqdn,
+        publicAppName: templateSlug,
+      }));
    }

    return NextResponse.json({
@@ -895,11 +902,15 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
        template: templateSlug,
        urlsApplied,
        started,
-        startMethod,
+        reachable,
+        appStatus,
+        ...(postDeploy ? { postDeploy } : {}),
        ...(startDiag ? { startDiag } : {}),
-        note: started
-          ? 'Primary containers are up. First boot may take 1-5 min while images finish pulling and migrations run; use apps.logs to monitor. If startDiag mentions a sidecar dependency timeout (workers, schedulers), call apps.containers.up again once the primary is healthy to bring those up.'
-          : 'Service created but no containers started. Call apps.containers.up to retry; check apps.containers.ps and apps.logs to diagnose.',
+        note: reachable
+          ? `Reachable on https://${fqdn}. First boot may continue migrations in the background — check apps.logs if any feature seems missing.`
+          : started
+            ? `Containers are healthy but https://${fqdn} did not return 2xx/3xx yet. Wait 30-60s for Traefik to fully discover labels, then retry. If still failing, inspect postDeploy.steps for which fix didn't apply, then call apps.logs and apps.containers.ps.`
+            : `Public app did not become healthy. Use apps.containers.ps and apps.logs to diagnose. Most common cause: image pull is still in progress (first deploy can take 5-10 min for large images like twentycrm/twenty).`,
      },
    });
  }
@@ -956,11 +967,23 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
      }
    }

+    // composeRaw is user-supplied — we can't reliably guess the public
+    // app name (the user may have any compose service layout). Best
+    // effort: honor params.publicAppName when given, else fall back to
+    // the app name, which works for single-container composes.
    let started = false;
-    let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed';
+    let reachable = false;
+    let appStatus = 'unknown';
+    let postDeploy: CoolifyPostDeployResult | null = null;
    let startDiag = '';
    if (params.instantDeploy !== false) {
-      ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid));
+      const publicAppName = String(params.publicAppName ?? appName);
+      ({ started, reachable, appStatus, postDeploy, diag: startDiag } = await ensureServiceReachable({
+        uuid: created.uuid,
+        fqdn,
+        publicAppName,
+        port: params.port ? Number(params.port) : undefined,
+      }));
    }

    return NextResponse.json({
@@ -971,9 +994,13 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
        url: `https://${fqdn}`,
        resourceType: 'service',
        started,
-        startMethod,
+        reachable,
+        appStatus,
+        ...(postDeploy ? { postDeploy } : {}),
        ...(startDiag ? { startDiag } : {}),
-        note: 'Domain routing for compose services must be configured after initial startup — set SERVER_URL env to the desired URL, then call apps.containers.up to apply.',
+        note: reachable
+          ? `Reachable on https://${fqdn}.`
+          : `Domain routing for custom compose services depends on knowing which docker-compose service is the public-facing one. Pass publicAppName=<service> and port=<port> on apps.create to enable post-deploy patching, or set them manually.`,
      },
    });
  }
@@ -1223,106 +1250,138 @@ async function toolAppsTemplatesSearch(params: Record<string, any>) {
 }

 /**
- * Ensure a Coolify Service is actually running (containers exist and
- * are healthy/starting), with a fallback path for Coolify's flaky
- * queued-start worker.
+ * Bring a Coolify Service to a publicly-reachable state.
 *
- * Strategy:
- *   1. Call POST /services/{uuid}/start so Coolify's records show
- *      "starting" and any internal hooks fire.
- *   2. Wait briefly, then probe the host for any container belonging
- *      to this service via `docker ps --filter name={uuid}`.
- *   3. If no containers materialised, run `docker compose up -d`
- *      directly via SSH against the rendered compose dir. This is
- *      the same command Coolify's worker would run; we just bypass
- *      the unreliable queue.
+ * v2.4.5 architecture
+ * --------------------
+ * Earlier versions ran `docker compose up -d` over SSH as a fallback
+ * when Coolify's queue stalled. That worked for "containers running"
+ * but caused two cascading bugs because it bypassed Coolify's full
+ * deploy pipeline:
+ *   - Internal services (Postgres, Redis) ended up on the shared
+ *     `coolify` Docker network, where DNS aliases for `postgres`/
+ *     `redis` collide with Coolify's own `coolify-db`/`coolify-redis`
+ *     containers — Twenty's `postgres://postgres:5432/twenty-db`
+ *     resolves to the wrong DB and fails auth.
+ *   - The proxy-network attach we did in our SSH path attached EVERY
+ *     container, magnifying the same DNS collision.
 *
- * Returns:
- *   started      true if at least one container is running for this service
- *   startMethod  which path got us there
- *   diag         human-readable note for failures (truncated stderr)
+ * The right model is: let Coolify's queue do the heavy lifting (it
+ * handles compose generation, volumes, internal networking, env-var
+ * substitution, healthchecks, etc.) and patch the three things its
+ * REST API does NOT expose:
+ *   1. SERVICE_FQDN_* / SERVICE_URL_* env vars in the rendered .env
+ *   2. The missing traefik loadbalancer.server.port label
+ *   3. coolify-proxy → project network attachment + Traefik nudge
+ *
+ * Steps:
+ *   1. POST /services/{uuid}/start — Coolify's queue does its thing.
+ *   2. Poll service.applications[*].status (the per-application
+ *      status is truthful; service.status is not). Wait until the
+ *      public app reports running:healthy or we time out.
+ *   3. apply post-deploy fixes: rewrite .env, inject port label,
+ *      attach proxy to project net, recreate ONLY the public app,
+ *      restart proxy so Traefik re-discovers.
+ *   4. Best-effort probe of https://<fqdn>; any 2xx/3xx response
+ *      confirms end-to-end reachability.
 */
-async function ensureServiceUp(uuid: string): Promise<{
+async function ensureServiceReachable(opts: {
+  uuid: string;
+  fqdn: string;
+  publicAppName: string;
+  port?: number;
+  /** Max wall-clock time to wait for Coolify to bring containers healthy. */
+  healthTimeoutMs?: number;
+}): Promise<{
  started: boolean;
-  startMethod: 'coolify-queue' | 'compose-up' | 'failed';
+  reachable: boolean;
+  appStatus: string;
+  postDeploy: CoolifyPostDeployResult | null;
  diag: string;
 }> {
-  // 1. Ask Coolify nicely
+  const { uuid, fqdn, publicAppName, port, healthTimeoutMs = 8 * 60_000 } = opts;
+
  try {
    await startService(uuid);
  } catch (e) {
-    console.warn('[ensureServiceUp] startService failed (will fall back)', e);
+    console.warn('[ensureServiceReachable] startService failed', e);
  }

-  // 2. Probe — has the queue actually started anything?
-  if (!isCoolifySshConfigured()) {
-    return { started: true, startMethod: 'coolify-queue', diag: '' };
-  }
-  // Allow up to ~12s for the worker to wake up; checking every 3s.
-  for (let i = 0; i < 4; i++) {
-    await new Promise(r => setTimeout(r, 3_000));
+  // Poll service.applications[*].status until the public app is
+  // running:healthy. This field is truthful, unlike service.status
+  // which routinely lies as "starting:unknown" while containers are
+  // actually healthy.
+  const startedAt = Date.now();
+  let appStatus = 'unknown';
+  while (Date.now() - startedAt < healthTimeoutMs) {
    try {
-      const probe = await runOnCoolifyHost(
-        `docker ps --filter name=${uuid} --format '{{.Names}}'`,
-        { timeoutMs: 8_000 },
-      );
-      if (probe.stdout.trim().length > 0) {
-        // Coolify started the stack. Even on this happy path we still
-        // need to ensure the proxy-network attachment ran, since
-        // Coolify only attaches at the end of its full deploy
-        // pipeline (which can be skipped if a sidecar fails to come
-        // up). Idempotent — already-attached containers are no-ops.
-        await attachToCoolifyProxyNetwork(uuid).catch(() => { /* swallow */ });
-        return { started: true, startMethod: 'coolify-queue', diag: '' };
+      const svc = (await getService(uuid)) as unknown as {
+        applications?: Array<{ name?: string; status?: string }>;
+      };
+      const apps = svc.applications ?? [];
+      const target = apps.find(a => a.name === publicAppName) ?? apps[0];
+      appStatus = target?.status ?? 'unknown';
+      if (/^running:healthy/i.test(appStatus)) break;
+      // Failure modes Coolify reports as terminal: exited (compose
+      // never ran), restarting (boot loop). Only "exited" is cut
+      // short below; "restarting" still waits out the full timeout.
+      if (/^exited/i.test(appStatus) && Date.now() - startedAt > 90_000) {
+        // Give it 90s to transition out of "exited" before declaring failure
+        break;
      }
    } catch (e) {
-      console.warn('[ensureServiceUp] probe failed', e);
+      console.warn('[ensureServiceReachable] status probe failed', e);
+    }
+    await new Promise(r => setTimeout(r, 8_000));
+  }
+
+  const started = /^running/i.test(appStatus);
+  if (!started) {
+    return {
+      started: false,
+      reachable: false,
+      appStatus,
+      postDeploy: null,
+      diag: `Public app "${publicAppName}" did not become healthy within ${Math.round(healthTimeoutMs/1000)}s (status=${appStatus}). Use apps.containers.ps and apps.logs to diagnose.`,
+    };
+  }
+
+  // Apply post-deploy fixes. Only meaningful when SSH is configured —
+  // without it we can't rewrite the .env or attach proxy networks.
+  let postDeploy: CoolifyPostDeployResult | null = null;
+  if (isCoolifySshConfigured()) {
+    try {
+      postDeploy = await applyCoolifyPostDeployFixes({ uuid, fqdn, publicAppName, port });
+    } catch (e) {
+      console.warn('[ensureServiceReachable] post-deploy fix failed', e);
    }
  }

-  // 3. Fallback — run docker compose up -d ourselves
-  let composeDiag = '';
+  // Best-effort reachability probe. Public DNS for the workspace
+  // wildcard may not have propagated yet (esp. on first deploy in a
+  // brand-new workspace), so a non-200 here doesn't mean failure —
+  // it just means "agents should retry the URL in a few seconds".
+  let reachable = false;
+  let probeDiag = '';
  try {
-    const r = await composeUp('service', uuid, { timeoutMs: 600_000 });
-    // Strip ANSI / control chars (compose progress output uses \r and
-    // ANSI escapes) so the diag survives JSON serialization cleanly.
-    composeDiag = (r.stderr || r.stdout)
-      .replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '')
-      .replace(/[\x00-\x08\x0B-\x1F]/g, '')
-      .trim()
-      .slice(-400);
-    if (r.code === 0) {
-      return { started: true, startMethod: 'compose-up', diag: '' };
-    }
-    // Non-zero exit DOES NOT mean nothing started. Compose returns
-    // non-zero whenever any service hits a `depends_on:
-    // condition: service_healthy` timeout — common for sidecar
-    // containers (workers, schedulers) of apps with slow-booting
-    // primary services (Twenty's worker waits on twenty's healthcheck,
-    // which takes 2-5 min). Probe the host to see what's actually
-    // running before declaring failure.
+    const url = `https://${fqdn}`;
+    const ctrl = new AbortController();
+    const t = setTimeout(() => ctrl.abort(), 12_000);
+    const res = await fetch(url, { signal: ctrl.signal, redirect: 'manual' });
+    clearTimeout(t);
+    reachable = res.status >= 200 && res.status < 400;
+    probeDiag = `GET ${url} → ${res.status}`;
  } catch (e) {
-    composeDiag = e instanceof Error ? e.message : String(e);
+    probeDiag = `GET probe failed: ${e instanceof Error ? e.message : String(e)}`;
  }

-  try {
-    const probe = await runOnCoolifyHost(
-      `docker ps --filter name=${uuid} --format '{{.Names}}'`,
-      { timeoutMs: 8_000 },
-    );
-    if (probe.stdout.trim().length > 0) {
-      // Something IS running — partial success. Surface the diag so
-      // agents see WHY compose returned non-zero (usually a sidecar
-      // depends_on timeout) but report started=true so happy-path
-      // workflows don't panic. composeUp already attached the proxy
-      // network, but call once more to cover any container that came
-      // up after the initial attach pass.
-      await attachToCoolifyProxyNetwork(uuid).catch(() => { /* swallow */ });
-      return { started: true, startMethod: 'compose-up', diag: composeDiag };
-    }
-  } catch { /* fall through */ }
-
-  return { started: false, startMethod: 'failed', diag: composeDiag };
+  return {
+    started: true,
+    reachable,
+    appStatus,
+    postDeploy,
+    diag: probeDiag,
+  };
 }

 /** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */