fix(mcp v2.4.6): poll robustness + apps.repair recovery tool
Two fixes for transient Coolify queue lag observed when smoke-testing
v2.4.5:
1. ensureServiceReachable no longer false-fails on early `exited` status.
Coolify's queue worker can take 60-120s to dequeue a `start` request;
during that window service.applications[*].status returns the stale
`exited` (= "never started") state. Previously the polling loop
treated that as terminal failure after 90s, returning started:false
on stacks that were about to come up healthy.
The new logic requires "evidence of activity" (status starting:* or
running:* seen at least once) before treating subsequent `exited`
reports as terminal. Until activity is observed, the loop just keeps
polling up to the 8-min health timeout.
2. apps.repair (new tool). Re-runs the three post-deploy patches
(env rewrite, traefik port label, coolify-proxy network attach +
force-recreate + proxy restart) against an existing service without
recreating it. Useful when:
- apps.create returned started:false but containers eventually
came up (now the polling fix should make this rare)
- a deploy succeeded mechanically but is serving Traefik 503 or
Mixed Content
- a user rotates a custom domain on an existing app
Params: { uuid, fqdn, publicAppName, port? }
Returns: { uuid, fqdn, publicAppName, reachable, postDeploy: { steps }, probe, note }
Version bumped to 2.4.6.
Made-with: Cursor
This commit is contained in:
@@ -92,7 +92,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
|
||||
export async function GET() {
|
||||
return NextResponse.json({
|
||||
name: 'vibn-mcp',
|
||||
version: '2.4.5',
|
||||
version: '2.4.6',
|
||||
authentication: {
|
||||
scheme: 'Bearer',
|
||||
tokenPrefix: 'vibn_sk_',
|
||||
@@ -124,6 +124,7 @@ export async function GET() {
|
||||
'apps.volumes.wipe',
|
||||
'apps.containers.up',
|
||||
'apps.containers.ps',
|
||||
'apps.repair',
|
||||
'apps.templates.list',
|
||||
'apps.templates.search',
|
||||
'apps.envs.list',
|
||||
@@ -230,6 +231,8 @@ export async function POST(request: Request) {
|
||||
return await toolAppsContainersUp(principal, params);
|
||||
case 'apps.containers.ps':
|
||||
return await toolAppsContainersPs(principal, params);
|
||||
case 'apps.repair':
|
||||
return await toolAppsRepair(principal, params);
|
||||
case 'apps.templates.list':
|
||||
return await toolAppsTemplatesList(params);
|
||||
case 'apps.templates.search':
|
||||
@@ -1179,6 +1182,78 @@ async function toolAppsContainersPs(principal: Principal, params: Record<string,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * apps.repair — re-run post-deploy patches against an existing service.
 *
 * Use this when a service is running but unreachable on its custom
 * domain (typical Traefik 503 / Mixed Content symptoms). It applies
 * the same post-deploy sequence apps.create runs on a fresh deploy:
 *
 *   1. Rewrite SERVICE_FQDN_* / SERVICE_URL_* in the service .env so
 *      Coolify regen no longer overwrites them with sslip.io defaults.
 *   2. Inject the missing traefik.http.services.<svc>.loadbalancer.
 *      server.port label into docker-compose.yml.
 *   3. Connect coolify-proxy to the service's project network.
 *   4. Force-recreate the public-facing app container.
 *   5. Restart coolify-proxy so Traefik re-discovers labels.
 *
 * Params:
 *   uuid           required — service uuid (the resource, not a single container)
 *   fqdn           required — the public hostname (e.g. "crm.mark.vibnai.com");
 *                  must be a bare hostname: no scheme, path, port or userinfo
 *   publicAppName  required — docker-compose service name of the public app
 *                  (usually equals the template slug: "twenty", "n8n", …)
 *   port           optional — internal port (default: derived per template);
 *                  when given it must be an integer in 1..65535
 *
 * Responses:
 *   400 — a required param is missing, or fqdn / port is malformed
 *   501 — SSH access to the Coolify host is not configured
 *   200 — { result: { uuid, fqdn, publicAppName, reachable, postDeploy,
 *         probe, note } } where `postDeploy` is whatever
 *         applyCoolifyPostDeployFixes returned and `probe` is a one-line
 *         diagnostic of the final HTTPS reachability check.
 *
 * NOTE(review): `_principal` is not consulted here, so nothing visible in
 * this function verifies that `uuid` belongs to the caller — confirm that
 * ownership is enforced upstream or add a check.
 */
async function toolAppsRepair(_principal: Principal, params: Record<string, any>) {
  const uuid = String(params.uuid ?? '').trim();
  const fqdn = String(params.fqdn ?? '').trim();
  const publicAppName = String(params.publicAppName ?? '').trim();

  if (!uuid || !fqdn || !publicAppName) {
    return NextResponse.json(
      { error: 'apps.repair requires { uuid, fqdn, publicAppName }' },
      { status: 400 }
    );
  }

  // fqdn is caller-supplied and ends up both in the probe URL below and in
  // the post-deploy helper, so only accept a plain DNS hostname. This keeps
  // values like "host/path", "user@host" or "https://host" from redirecting
  // the probe or leaking into downstream commands.
  const hostnamePattern =
    /^(?=.{1,253}$)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)*[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?$/i;
  if (!hostnamePattern.test(fqdn)) {
    return NextResponse.json(
      { error: 'apps.repair: fqdn must be a bare hostname (no scheme, path or port)' },
      { status: 400 }
    );
  }

  // Number() yields NaN for garbage input; reject it here instead of
  // forwarding an unusable port to the post-deploy fixes.
  let port: number | undefined;
  if (params.port != null) {
    port = Number(params.port);
    if (!Number.isInteger(port) || port < 1 || port > 65535) {
      return NextResponse.json(
        { error: 'apps.repair: port must be an integer between 1 and 65535' },
        { status: 400 }
      );
    }
  }

  if (!isCoolifySshConfigured()) {
    return NextResponse.json(
      { error: 'apps.repair requires SSH to the Coolify host (set COOLIFY_SSH_*)' },
      { status: 501 }
    );
  }

  const postDeploy = await applyCoolifyPostDeployFixes({ uuid, fqdn, publicAppName, port });

  // Final reachability probe. Redirects are not followed, so a 3xx from the
  // app itself still counts as reachable.
  let reachable = false;
  let probeDiag = '';
  const ctrl = new AbortController();
  const abortTimer = setTimeout(() => ctrl.abort(), 12_000);
  try {
    const res = await fetch(`https://${fqdn}`, { signal: ctrl.signal, redirect: 'manual' });
    reachable = res.status >= 200 && res.status < 400;
    probeDiag = `GET https://${fqdn} → ${res.status}`;
  } catch (e: unknown) {
    probeDiag = `probe failed: ${e instanceof Error ? e.message : String(e)}`;
  } finally {
    // Always disarm the abort timer, including when fetch rejects, so it
    // does not stay pending after the response has been sent.
    clearTimeout(abortTimer);
  }

  return NextResponse.json({
    result: {
      uuid,
      fqdn,
      publicAppName,
      reachable,
      postDeploy,
      probe: probeDiag,
      note: reachable
        ? `Repaired and reachable on https://${fqdn}.`
        : `Repair steps applied but probe still failed. Check postDeploy.steps for any "ok: false" entries; otherwise wait 30s and retry the probe.`,
    },
  });
}
|
||||
|
||||
// ──────────────────────────────────────────────────
|
||||
// apps.templates.* — Coolify one-click catalog browse
|
||||
// ──────────────────────────────────────────────────
|
||||
@@ -1311,8 +1386,15 @@ async function ensureServiceReachable(opts: {
|
||||
// running:healthy. This field is truthful, unlike service.status
|
||||
// which routinely lies as "starting:unknown" while containers are
|
||||
// actually healthy.
|
||||
// Coolify's queue worker can take 60-120s to dequeue a start
|
||||
// request, during which time service.applications[*].status still
|
||||
// reports the stale `exited` state (= "never started"). We only
|
||||
// treat `exited` as terminal AFTER we've seen evidence of activity
|
||||
// (`starting:*` or `running:*`) — otherwise it's just queue lag.
|
||||
const startedAt = Date.now();
|
||||
let appStatus = 'unknown';
|
||||
let sawActivity = false;
|
||||
let lastExitObservedAt = 0;
|
||||
while (Date.now() - startedAt < healthTimeoutMs) {
|
||||
try {
|
||||
const svc = (await getService(uuid)) as unknown as {
|
||||
@@ -1322,12 +1404,18 @@ async function ensureServiceReachable(opts: {
|
||||
const target = apps.find(a => a.name === publicAppName) ?? apps[0];
|
||||
appStatus = target?.status ?? 'unknown';
|
||||
if (/^running:healthy/i.test(appStatus)) break;
|
||||
// Failure modes Coolify reports as terminal: exited (compose
|
||||
// never ran), restarting (boot loop). We don't want to wait
|
||||
// the full timeout in those cases.
|
||||
if (/^exited/i.test(appStatus) && Date.now() - startedAt > 90_000) {
|
||||
// Give it 90s to transition out of "exited" before declaring failure
|
||||
break;
|
||||
if (/^starting|^running/i.test(appStatus)) {
|
||||
sawActivity = true;
|
||||
lastExitObservedAt = 0;
|
||||
}
|
||||
// Once we've seen activity, an exited status is terminal —
|
||||
// boot loop or compose failure. Wait 30s of consecutive
|
||||
// `exited` to be sure it's not a Compose recreate cycle.
|
||||
if (sawActivity && /^exited/i.test(appStatus)) {
|
||||
if (lastExitObservedAt === 0) lastExitObservedAt = Date.now();
|
||||
if (Date.now() - lastExitObservedAt > 30_000) break;
|
||||
} else if (!/^exited/i.test(appStatus)) {
|
||||
lastExitObservedAt = 0;
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('[ensureServiceReachable] status probe failed', e);
|
||||
|
||||
Reference in New Issue
Block a user