From d1f8c3d34bd3b5fd0b25abf1b375e86a602c1f08 Mon Sep 17 00:00:00 2001 From: Mark Henderson Date: Mon, 27 Apr 2026 14:30:27 -0700 Subject: [PATCH] fix(mcp v2.4.6): poll robustness + apps.repair recovery tool Two changes prompted by transient Coolify queue lag observed when smoke-testing v2.4.5: 1. ensureServiceReachable no longer false-fails on early `exited` status. Coolify's queue worker can take 60-120s to dequeue a `start` request; during that window service.applications[*].status returns the stale `exited` (= "never started") state. Previously the polling loop treated that as terminal failure after 90s, returning started:false on stacks that were about to come up healthy. The new logic requires "evidence of activity" (status starting:* or running:* seen at least once) before treating subsequent `exited` reports as terminal. Even then, `exited` must be reported continuously for 30s before the loop gives up, so a Compose recreate cycle is not mistaken for a crash. Until activity is observed, the loop just keeps polling up to the 8-min health timeout. 2. apps.repair (new tool). Re-runs the three post-deploy patches (env rewrite, traefik port label, coolify-proxy network attach + force-recreate + proxy restart) against an existing service without recreating it, then probes https://<fqdn> once. Useful when: - apps.create returned started:false but containers eventually came up (now the polling fix should make this rare) - a deploy succeeded mechanically but is serving Traefik 503 or Mixed Content - a user rotates a custom domain on an existing app Params: { uuid, fqdn, publicAppName, port? } Returns (under `result`): { uuid, fqdn, publicAppName, reachable, postDeploy: { ok, steps }, probe, note } Version bumped to 2.4.6. Made-with: Cursor --- app/api/mcp/route.ts | 102 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 7 deletions(-) diff --git a/app/api/mcp/route.ts b/app/api/mcp/route.ts index 79c2935c..426c2767 100644 --- a/app/api/mcp/route.ts +++ b/app/api/mcp/route.ts @@ -92,7 +92,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 
'https://git.vibnai.com'; export async function GET() { return NextResponse.json({ name: 'vibn-mcp', - version: '2.4.5', + version: '2.4.6', authentication: { scheme: 'Bearer', tokenPrefix: 'vibn_sk_', @@ -124,6 +124,7 @@ export async function GET() { 'apps.volumes.wipe', 'apps.containers.up', 'apps.containers.ps', + 'apps.repair', 'apps.templates.list', 'apps.templates.search', 'apps.envs.list', @@ -230,6 +231,8 @@ export async function POST(request: Request) { return await toolAppsContainersUp(principal, params); case 'apps.containers.ps': return await toolAppsContainersPs(principal, params); + case 'apps.repair': + return await toolAppsRepair(principal, params); case 'apps.templates.list': return await toolAppsTemplatesList(params); case 'apps.templates.search': @@ -1179,6 +1182,78 @@ async function toolAppsContainersPs(principal: Principal, params: Record.loadbalancer. + * server.port label into docker-compose.yml. + * 3. Connect coolify-proxy to the service's project network. + * 4. Force-recreate the public-facing app container. + * 5. Restart coolify-proxy so Traefik re-discovers labels. + * + * Params: + * uuid required — service uuid (the resource, not a single container) + * fqdn required — the public hostname (e.g. "crm.mark.vibnai.com") + * publicAppName required — docker-compose service name of the public app + * (usually equals the template slug: "twenty", "n8n", …) + * port optional — internal port (default: derived per template) + * + * Returns the same { ok, steps } shape as the post-deploy block in + * apps.create plus a final reachability probe. + */ +async function toolAppsRepair(_principal: Principal, params: Record) { + const uuid = String(params.uuid ?? '').trim(); + const fqdn = String(params.fqdn ?? '').trim(); + const publicAppName = String(params.publicAppName ?? '').trim(); + const port = params.port != null ? 
Number(params.port) : undefined; + if (!uuid || !fqdn || !publicAppName) { + return NextResponse.json( + { error: 'apps.repair requires { uuid, fqdn, publicAppName }' }, + { status: 400 } + ); + } + if (!isCoolifySshConfigured()) { + return NextResponse.json( + { error: 'apps.repair requires SSH to the Coolify host (set COOLIFY_SSH_*)' }, + { status: 501 } + ); + } + const postDeploy = await applyCoolifyPostDeployFixes({ uuid, fqdn, publicAppName, port }); + + let reachable = false; + let probeDiag = ''; + try { + const ctrl = new AbortController(); + const t = setTimeout(() => ctrl.abort(), 12_000); + const res = await fetch(`https://${fqdn}`, { signal: ctrl.signal, redirect: 'manual' }); + clearTimeout(t); + reachable = res.status >= 200 && res.status < 400; + probeDiag = `GET https://${fqdn} → ${res.status}`; + } catch (e) { + probeDiag = `probe failed: ${e instanceof Error ? e.message : String(e)}`; + } + + return NextResponse.json({ + result: { + uuid, + fqdn, + publicAppName, + reachable, + postDeploy, + probe: probeDiag, + note: reachable + ? `Repaired and reachable on https://${fqdn}.` + : `Repair steps applied but probe still failed. Check postDeploy.steps for any "ok: false" entries; otherwise wait 30s and retry the probe.`, + }, + }); +} + // ────────────────────────────────────────────────── // apps.templates.* — Coolify one-click catalog browse // ────────────────────────────────────────────────── @@ -1311,8 +1386,15 @@ async function ensureServiceReachable(opts: { // running:healthy. This field is truthful, unlike service.status // which routinely lies as "starting:unknown" while containers are // actually healthy. + // Coolify's queue worker can take 60-120s to dequeue a start + // request, during which time service.applications[*].status still + // reports the stale `exited` state (= "never started"). We only + // treat `exited` as terminal AFTER we've seen evidence of activity + // (`starting:*` or `running:*`) — otherwise it's just queue lag. 
const startedAt = Date.now(); let appStatus = 'unknown'; + let sawActivity = false; + let lastExitObservedAt = 0; while (Date.now() - startedAt < healthTimeoutMs) { try { const svc = (await getService(uuid)) as unknown as { @@ -1322,12 +1404,18 @@ async function ensureServiceReachable(opts: { const target = apps.find(a => a.name === publicAppName) ?? apps[0]; appStatus = target?.status ?? 'unknown'; if (/^running:healthy/i.test(appStatus)) break; - // Failure modes Coolify reports as terminal: exited (compose - // never ran), restarting (boot loop). We don't want to wait - // the full timeout in those cases. - if (/^exited/i.test(appStatus) && Date.now() - startedAt > 90_000) { - // Give it 90s to transition out of "exited" before declaring failure - break; + if (/^starting|^running/i.test(appStatus)) { + sawActivity = true; + lastExitObservedAt = 0; + } + // Once we've seen activity, an exited status is terminal — + // boot loop or compose failure. Wait 30s of consecutive + // `exited` to be sure it's not a Compose recreate cycle. + if (sawActivity && /^exited/i.test(appStatus)) { + if (lastExitObservedAt === 0) lastExitObservedAt = Date.now(); + if (Date.now() - lastExitObservedAt > 30_000) break; + } else if (!/^exited/i.test(appStatus)) { + lastExitObservedAt = 0; } } catch (e) { console.warn('[ensureServiceReachable] status probe failed', e);