fix(mcp v2.4.6): poll robustness + apps.repair recovery tool

Two changes prompted by transient Coolify queue lag observed while smoke-testing v2.4.5:

1. ensureServiceReachable no longer false-fails on an early `exited` status. Coolify's queue worker can take 60-120s to dequeue a `start` request; during that window service.applications[*].status returns the stale `exited` (= "never started") state. Previously the polling loop treated that as a terminal failure after 90s, returning started:false on stacks that were about to come up healthy. The new logic requires "evidence of activity" (status starting:* or running:* seen at least once) before treating subsequent `exited` reports as terminal, and even then only after 30s of consecutive `exited`. Until activity is observed, the loop just keeps polling up to the 8-min health timeout.

2. apps.repair (new tool). Re-runs the three post-deploy patches (env rewrite, traefik port label, coolify-proxy network attach + force-recreate + proxy restart) against an existing service without recreating it. Useful when:
   - apps.create returned started:false but the containers eventually came up (the polling fix above should now make this rare)
   - a deploy succeeded mechanically but is serving Traefik 503 or Mixed Content
   - a user rotates a custom domain on an existing app

   Params: { uuid, fqdn, publicAppName, port? }
   Returns: { reachable, postDeploy: { steps }, probe }

Version bumped to 2.4.6.

Made-with: Cursor
2026-04-27 14:30:27 -07:00
parent 247b31bf2f
commit d1f8c3d34b
1 changed files with 95 additions and 7 deletions
--- a/app/api/mcp/route.ts
+++ b/app/api/mcp/route.ts
@@ -92,7 +92,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
 export async function GET() {
  return NextResponse.json({
    name: 'vibn-mcp',
-    version: '2.4.5',
+    version: '2.4.6',
    authentication: {
      scheme: 'Bearer',
      tokenPrefix: 'vibn_sk_',
@@ -124,6 +124,7 @@ export async function GET() {
          'apps.volumes.wipe',
          'apps.containers.up',
          'apps.containers.ps',
+          'apps.repair',
          'apps.templates.list',
          'apps.templates.search',
          'apps.envs.list',
@@ -230,6 +231,8 @@ export async function POST(request: Request) {
        return await toolAppsContainersUp(principal, params);
      case 'apps.containers.ps':
        return await toolAppsContainersPs(principal, params);
+      case 'apps.repair':
+        return await toolAppsRepair(principal, params);
      case 'apps.templates.list':
        return await toolAppsTemplatesList(params);
      case 'apps.templates.search':
@@ -1179,6 +1182,78 @@ async function toolAppsContainersPs(principal: Principal, params: Record<string,
  });
 }

+/**
+ * apps.repair — re-run post-deploy patches against an existing service.
+ *
+ * Use this when a service is running but unreachable on its custom
+ * domain (typical Traefik 503 / Mixed Content symptoms). It re-applies
+ * the post-deploy fixes apps.create runs on a fresh deploy, in five steps:
+ *   1. Rewrite SERVICE_FQDN_* / SERVICE_URL_* in the service .env so
+ *      Coolify regen no longer overwrites them with sslip.io defaults.
+ *   2. Inject the missing traefik.http.services.<svc>.loadbalancer.
+ *      server.port label into docker-compose.yml.
+ *   3. Connect coolify-proxy to the service's project network.
+ *   4. Force-recreate the public-facing app container.
+ *   5. Restart coolify-proxy so Traefik re-discovers labels.
+ *
+ * Params:
+ *   uuid           required — service uuid (the resource, not a single container)
+ *   fqdn           required — the public hostname (e.g. "crm.mark.vibnai.com")
+ *   publicAppName  required — docker-compose service name of the public app
+ *                  (usually equals the template slug: "twenty", "n8n", …)
+ *   port           optional — internal port (default: derived per template)
+ * Returns 400 when a required param is blank and 501 without Coolify SSH;
+ * otherwise { result: { uuid, fqdn, publicAppName, reachable, postDeploy, probe, note } }.
+ * NOTE(review): _principal is never read — confirm the caller authorizes uuid;
+ * a non-numeric port becomes NaN and is passed through unvalidated.
+ */
+async function toolAppsRepair(_principal: Principal, params: Record<string, any>) {
+  const uuid = String(params.uuid ?? '').trim();
+  const fqdn = String(params.fqdn ?? '').trim();
+  const publicAppName = String(params.publicAppName ?? '').trim();
+  const port = params.port != null ? Number(params.port) : undefined;
+  if (!uuid || !fqdn || !publicAppName) {
+    return NextResponse.json(
+      { error: 'apps.repair requires { uuid, fqdn, publicAppName }' },
+      { status: 400 }
+    );
+  }
+  if (!isCoolifySshConfigured()) {
+    return NextResponse.json(
+      { error: 'apps.repair requires SSH to the Coolify host (set COOLIFY_SSH_*)' },
+      { status: 501 }
+    );
+  }
+  const postDeploy = await applyCoolifyPostDeployFixes({ uuid, fqdn, publicAppName, port });
+
+  let reachable = false;
+  let probeDiag = '';
+  try {
+    const ctrl = new AbortController();
+    const t = setTimeout(() => ctrl.abort(), 12_000);
+    const res = await fetch(`https://${fqdn}`, { signal: ctrl.signal, redirect: 'manual' });
+    clearTimeout(t);
+    reachable = res.status >= 200 && res.status < 400;
+    probeDiag = `GET https://${fqdn} → ${res.status}`;
+  } catch (e) {
+    probeDiag = `probe failed: ${e instanceof Error ? e.message : String(e)}`;
+  }
+
+  return NextResponse.json({
+    result: {
+      uuid,
+      fqdn,
+      publicAppName,
+      reachable,
+      postDeploy,
+      probe: probeDiag,
+      note: reachable
+        ? `Repaired and reachable on https://${fqdn}.`
+        : `Repair steps applied but probe still failed. Check postDeploy.steps for any "ok: false" entries; otherwise wait 30s and retry the probe.`,
+    },
+  });
+}
+
 // ──────────────────────────────────────────────────
 // apps.templates.* — Coolify one-click catalog browse
 // ──────────────────────────────────────────────────
@@ -1311,8 +1386,15 @@ async function ensureServiceReachable(opts: {
  // running:healthy. This field is truthful, unlike service.status
  // which routinely lies as "starting:unknown" while containers are
  // actually healthy.
+  // Coolify's queue worker can take 60-120s to dequeue a start
+  // request, during which time service.applications[*].status still
+  // reports the stale `exited` state (= "never started"). We only
+  // treat `exited` as terminal AFTER we've seen evidence of activity
+  // (`starting:*` or `running:*`) — otherwise it's just queue lag.
  const startedAt = Date.now();
  let appStatus = 'unknown';
+  let sawActivity = false;
+  let lastExitObservedAt = 0;
  while (Date.now() - startedAt < healthTimeoutMs) {
    try {
      const svc = (await getService(uuid)) as unknown as {
@@ -1322,12 +1404,18 @@ async function ensureServiceReachable(opts: {
      const target = apps.find(a => a.name === publicAppName) ?? apps[0];
      appStatus = target?.status ?? 'unknown';
      if (/^running:healthy/i.test(appStatus)) break;
-      // Failure modes Coolify reports as terminal: exited (compose
-      // never ran), restarting (boot loop). We don't want to wait
-      // the full timeout in those cases.
-      if (/^exited/i.test(appStatus) && Date.now() - startedAt > 90_000) {
-        // Give it 90s to transition out of "exited" before declaring failure
-        break;
+      if (/^starting|^running/i.test(appStatus)) {
+        sawActivity = true;
+        lastExitObservedAt = 0;
+      }
+      // Once we've seen activity, an exited status is terminal —
+      // boot loop or compose failure. Wait 30s of consecutive
+      // `exited` to be sure it's not a Compose recreate cycle.
+      if (sawActivity && /^exited/i.test(appStatus)) {
+        if (lastExitObservedAt === 0) lastExitObservedAt = Date.now();
+        if (Date.now() - lastExitObservedAt > 30_000) break;
+      } else if (!/^exited/i.test(appStatus)) {
+        lastExitObservedAt = 0;
      }
    } catch (e) {
      console.warn('[ensureServiceReachable] status probe failed', e);