From d1f8c3d34bd3b5fd0b25abf1b375e86a602c1f08 Mon Sep 17 00:00:00 2001
From: Mark Henderson <mark@getacquired.com>
Date: Mon, 27 Apr 2026 14:30:27 -0700
Subject: [PATCH] fix(mcp v2.4.6): poll robustness + apps.repair recovery tool

Two fixes for transient Coolify queue lag observed when smoke-testing
v2.4.5:

1. ensureServiceReachable no longer false-fails on early `exited` status.
   Coolify's queue worker can take 60-120s to dequeue a `start` request;
   during that window service.applications[*].status returns the stale
   `exited` (= "never started") state. Previously the polling loop
   treated that as terminal failure after 90s, returning started:false
   on stacks that were about to come up healthy.

   The new logic requires "evidence of activity" (status starting:* or
   running:* seen at least once) before treating subsequent `exited`
   reports as terminal. Until activity is observed, the loop just keeps
   polling up to the 8-min health timeout.

2. apps.repair (new tool). Re-runs the three post-deploy patches
   (env rewrite, traefik port label, coolify-proxy network attach +
   force-recreate + proxy restart) against an existing service without
   recreating it. Useful when:
     - apps.create returned started:false but containers eventually
       came up (now the polling fix should make this rare)
     - a deploy succeeded mechanically but is serving Traefik 503 or
       Mixed Content
     - a user rotates a custom domain on an existing app

   Params: { uuid, fqdn, publicAppName, port? }
   Returns: { result: { uuid, fqdn, publicAppName, reachable,
             postDeploy: { steps }, probe, note } }

Version bumped to 2.4.6.

Made-with: Cursor
---
 app/api/mcp/route.ts | 102 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 7 deletions(-)
diff --git a/app/api/mcp/route.ts b/app/api/mcp/route.ts
index 79c2935c..426c2767 100644
--- a/app/api/mcp/route.ts
+++ b/app/api/mcp/route.ts
@@ -92,7 +92,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
 export async function GET() {
   return NextResponse.json({
     name: 'vibn-mcp',
-    version: '2.4.5',
+    version: '2.4.6',
     authentication: {
       scheme: 'Bearer',
       tokenPrefix: 'vibn_sk_',
@@ -124,6 +124,7 @@ export async function GET() {
           'apps.volumes.wipe',
           'apps.containers.up',
           'apps.containers.ps',
+          'apps.repair',
           'apps.templates.list',
           'apps.templates.search',
           'apps.envs.list',
@@ -230,6 +231,8 @@ export async function POST(request: Request) {
         return await toolAppsContainersUp(principal, params);
       case 'apps.containers.ps':
         return await toolAppsContainersPs(principal, params);
+      case 'apps.repair':
+        return await toolAppsRepair(principal, params);
       case 'apps.templates.list':
         return await toolAppsTemplatesList(params);
       case 'apps.templates.search':
@@ -1179,6 +1182,78 @@ async function toolAppsContainersPs(principal: Principal, params: Record<string,
   });
 }
 
+/**
+ * apps.repair — re-run post-deploy patches against an existing service.
+ *
+ * Use this when a service is running but unreachable on its custom
+ * domain (typical Traefik 503 / Mixed Content symptoms). It re-runs the
+ * post-deploy block from apps.create via applyCoolifyPostDeployFixes:
+ *   1. Rewrite SERVICE_FQDN_* / SERVICE_URL_* in the service .env.
+ *   2. Inject the traefik loadbalancer.server.port label into compose.
+ *   3. Connect coolify-proxy to the service's project network.
+ *   4. Force-recreate the public-facing app container.
+ *   5. Restart coolify-proxy so Traefik re-discovers labels.
+ *
+ * Params: uuid, fqdn (public hostname) and publicAppName (compose service
+ * name of the public app) are required; port is optional and must be an
+ * integer in 1-65535 — Number() turns junk into NaN, so it is rejected.
+ * Responds 400 on bad params, 501 without Coolify SSH, otherwise
+ * { result: { uuid, fqdn, publicAppName, reachable, postDeploy, probe, note } }.
+ */
+async function toolAppsRepair(_principal: Principal, params: Record<string, any>) {
+  const uuid = String(params.uuid ?? '').trim();
+  const fqdn = String(params.fqdn ?? '').trim();
+  const publicAppName = String(params.publicAppName ?? '').trim();
+  const port = params.port != null ? Number(params.port) : undefined;
+  if (!uuid || !fqdn || !publicAppName) {
+    return NextResponse.json(
+      { error: 'apps.repair requires { uuid, fqdn, publicAppName }' },
+      { status: 400 }
+    );
+  }
+  if (port !== undefined && !(Number.isInteger(port) && port >= 1 && port <= 65_535)) {
+    return NextResponse.json(
+      { error: 'apps.repair: port must be an integer between 1 and 65535' },
+      { status: 400 }
+    );
+  }
+  if (!isCoolifySshConfigured()) {
+    return NextResponse.json(
+      { error: 'apps.repair requires SSH to the Coolify host (set COOLIFY_SSH_*)' },
+      { status: 501 }
+    );
+  }
+  const postDeploy = await applyCoolifyPostDeployFixes({ uuid, fqdn, publicAppName, port });
+
+  let reachable = false;
+  let probeDiag = '';
+  const ctrl = new AbortController();
+  const t = setTimeout(() => ctrl.abort(), 12_000);
+  try {
+    const res = await fetch(`https://${fqdn}`, { signal: ctrl.signal, redirect: 'manual' });
+    reachable = res.status >= 200 && res.status < 400;
+    probeDiag = `GET https://${fqdn} → ${res.status}`;
+  } catch (e) {
+    probeDiag = `probe failed: ${e instanceof Error ? e.message : String(e)}`;
+  } finally {
+    clearTimeout(t); // also on fetch failure, so the abort timer never outlives the probe
+  }
+
+  return NextResponse.json({
+    result: {
+      uuid,
+      fqdn,
+      publicAppName,
+      reachable,
+      postDeploy,
+      probe: probeDiag,
+      note: reachable
+        ? `Repaired and reachable on https://${fqdn}.`
+        : `Repair steps applied but probe still failed. Check postDeploy.steps for any "ok: false" entries; otherwise wait 30s and retry the probe.`,
+    },
+  });
+}
+
 // ──────────────────────────────────────────────────
 // apps.templates.* — Coolify one-click catalog browse
 // ──────────────────────────────────────────────────
@@ -1311,8 +1386,15 @@ async function ensureServiceReachable(opts: {
   // running:healthy. This field is truthful, unlike service.status
   // which routinely lies as "starting:unknown" while containers are
   // actually healthy.
+  // Coolify's queue worker can take 60-120s to dequeue a start
+  // request, during which time service.applications[*].status still
+  // reports the stale `exited` state (= "never started"). We only
+  // treat `exited` as terminal AFTER we've seen evidence of activity
+  // (`starting:*` or `running:*`) — otherwise it's just queue lag.
   const startedAt = Date.now();
   let appStatus = 'unknown';
+  let sawActivity = false;
+  let lastExitObservedAt = 0;
   while (Date.now() - startedAt < healthTimeoutMs) {
     try {
       const svc = (await getService(uuid)) as unknown as {
@@ -1322,12 +1404,18 @@ async function ensureServiceReachable(opts: {
       const target = apps.find(a => a.name === publicAppName) ?? apps[0];
       appStatus = target?.status ?? 'unknown';
       if (/^running:healthy/i.test(appStatus)) break;
-      // Failure modes Coolify reports as terminal: exited (compose
-      // never ran), restarting (boot loop). We don't want to wait
-      // the full timeout in those cases.
-      if (/^exited/i.test(appStatus) && Date.now() - startedAt > 90_000) {
-        // Give it 90s to transition out of "exited" before declaring failure
-        break;
+      if (/^starting|^running/i.test(appStatus)) {
+        sawActivity = true;
+        lastExitObservedAt = 0;
+      }
+      // Once we've seen activity, an exited status is terminal —
+      // boot loop or compose failure. Wait 30s of consecutive
+      // `exited` to be sure it's not a Compose recreate cycle.
+      if (sawActivity && /^exited/i.test(appStatus)) {
+        if (lastExitObservedAt === 0) lastExitObservedAt = Date.now();
+        if (Date.now() - lastExitObservedAt > 30_000) break;
+      } else if (!/^exited/i.test(appStatus)) {
+        lastExitObservedAt = 0;
       }
     } catch (e) {
       console.warn('[ensureServiceReachable] status probe failed', e);