From 3d525afdf728467e58257c74efb107064476892c Mon Sep 17 00:00:00 2001 From: Mark Henderson Date: Wed, 29 Apr 2026 20:27:52 -0700 Subject: [PATCH] fix(ai): stop the AI from forking duplicate services to escape errors Three changes that compound to fix the "4 orphan twenty-* services" problem we just hit: 1. apps_create is now idempotent within a project. If a service from the same template already exists in the same Vibn projectId, return it with alreadyExisted: true instead of creating a clone. Pass { force: true } to opt out for legitimate dev/staging duplicates. 2. New apps_unstick tool. SSH-cleans orphan Docker containers matching the resource UUID so a deploy that hit "Conflict. The container name X is already in use" can recover without deleting the entire service. 3. System prompt hardened with two new hard rules: - ALWAYS apps_list before apps_create (idempotency in spirit, not just at the API boundary) - NEVER delete-and-recreate a service to escape an error. The recovery for container conflicts is apps_unstick + apps_deploy. Already cleaned the 3 duplicate twenty-* services from prod (kept twenty-live, freshest healthy). Frees ~9 GB RAM on the host. Made-with: Cursor --- app/api/chat/route.ts | 3 + app/api/mcp/route.ts | 148 ++++++++++++++++++++++++++++++++++++++++++ lib/ai/vibn-tools.ts | 21 ++++++ 3 files changed, 172 insertions(+) diff --git a/app/api/chat/route.ts b/app/api/chat/route.ts index 251d72e2..42e8eb72 100644 --- a/app/api/chat/route.ts +++ b/app/api/chat/route.ts @@ -233,7 +233,10 @@ When you write to Plan, the user does NOT need a long acknowledgment. "Logged th ## Hard rules (non-negotiable) - ALWAYS pass \`projectId\` to \`apps_create\` and \`databases_create\`. If the user didn't say which project, infer from context (active project, last-mentioned, only one in workspace) — only ask if genuinely ambiguous. +- ALWAYS call \`apps_list { projectId }\` BEFORE \`apps_create\` to check if the thing already exists. 
\`apps_create\` is idempotent within a project (returns \`alreadyExisted: true\` for duplicate templates), but you should check first so the user sees you being thoughtful — not "deploy stuff and hope." - ALWAYS call \`apps_templates_search\` BEFORE \`apps_create\` when the user names a known third-party app. Hand-rolling a Dockerfile when a maintained template exists is how supply-chain bugs ship. +- **NEVER delete-and-recreate a service to escape an error.** When a deploy fails with "Conflict. The container name … is already in use" or any orphan-container symptom, the recovery is: \`apps_unstick { uuid }\` → \`apps_deploy { uuid }\`. Deleting the service to side-step the conflict creates a new uuid with new container names AND leaves the orphan running AND forks a duplicate stack. We've shipped 4 orphan twenty-* services this way before. Don't repeat it. +- **If a deploy fails twice in a row with the same error, STOP.** Don't loop. Surface the error and the two recovery attempts you've already tried, and ask the user how to proceed. - Destructive ops (\`*_delete\`, \`*_volumes_wipe\`) require \`confirm\` equal to the resource's exact name. Always fetch the name first with a \`*_get\` call. Confirm with the user before executing irreversible deletes unless they explicitly said "delete X". - Long-running ops (deploys, DNS provisioning, db provisioning) take 1–5 min. Tell the user up front so they don't think you're stuck. Don't poll in a tight loop — it wastes tool rounds. - After a \`ship\` or \`apps.deploy\`, the result is authoritative. Don't call gitea_*, shell_exec, or apps_* to "verify" — read the response and report. 
diff --git a/app/api/mcp/route.ts b/app/api/mcp/route.ts index 51a95595..e4afcabb 100644 --- a/app/api/mcp/route.ts +++ b/app/api/mcp/route.ts @@ -293,6 +293,8 @@ export async function POST(request: Request) { return await toolAppsContainersPs(principal, params); case 'apps.repair': return await toolAppsRepair(principal, params); + case 'apps.unstick': + return await toolAppsUnstick(principal, params); case 'apps.templates.list': return await toolAppsTemplatesList(params); case 'apps.templates.search': @@ -1298,6 +1300,34 @@ async function toolAppsCreate(principal: Principal, params: Record) }, { status: 404 }); } + // ── Idempotency: don't fan out duplicate services into the same + // Coolify project. If a service with the same template already + // exists, return it instead of creating a 4th twenty-* clone. + // Use force=true to bypass dedup only when the caller really wants + // multiple instances (e.g. dev/staging copies of the same template). + const force = params.force === true || params.force === 'true'; + if (params.projectId && !force) { + const existing = await findExistingTemplateService( + targetCoolifyProjectUuid, + templateSlug, + ); + if (existing) { + await linkIfRequested(existing.uuid, 'service'); + return NextResponse.json({ + result: { + uuid: existing.uuid, + name: existing.name, + template: templateSlug, + alreadyExisted: true, + summaryHint: + `A "${templateSlug}" service already exists in this project as "${existing.name}" (uuid ${existing.uuid}). Returning it instead of creating a duplicate. ` + + `If the user wanted a SECOND independent instance, re-call apps_create with { force: true }. ` + + `If the existing one is broken, call apps_unstick { uuid } and then apps_deploy { uuid } — DO NOT delete-and-recreate.`, + }, + }); + } + } + const appName = slugify(String(params.name ?? 
templateSlug)); const fqdn = resolveFqdn(params.domain, ws.slug, appName); if (fqdn instanceof NextResponse) return fqdn; @@ -3858,3 +3888,121 @@ async function toolPlanDecisionLog(principal: Principal, params: Record { + try { + const services = await listServicesInProject(coolifyProjectUuid); + for (const s of services) { + // Coolify stores the template slug as `service_type` on the row. + // Some older services may have it under `type`. Match either. + const type = + (s as any).service_type || + (s as any).type || + (typeof (s as any).docker_compose_raw === 'string' && + (s as any).docker_compose_raw.includes(templateSlug) + ? templateSlug + : null); + if (type === templateSlug) { + return { uuid: s.uuid, name: s.name }; + } + } + } catch (e) { + console.warn('[findExistingTemplateService] failed', e); + } + return null; +} + +/** + * apps.unstick — recover a service stuck on a "container name already + * in use" Docker conflict. Force-removes the orphan containers (and + * optionally their volumes), then returns. Caller should then re-call + * apps.deploy to bring the stack back up. + * + * This is the RIGHT recovery path. The WRONG one (and what the AI was + * doing before the system-prompt update) is to delete the service and + * recreate a new one with a fresh UUID — which side-steps the conflict + * by creating new container names but leaves the orphan running and + * forks a duplicate copy of the stack. + */ +async function toolAppsUnstick(principal: Principal, params: Record) { + const uuid = String(params.uuid ?? '').trim(); + if (!uuid) return NextResponse.json({ error: 'uuid required' }, { status: 400 }); + if (!isCoolifySshConfigured()) { + return NextResponse.json({ + error: + 'Coolify SSH is not configured on this deploy. Cannot reach the host to clean orphan containers.', + }, { status: 503 }); + } + + // Resolve the resource to confirm tenancy + grab its name. 
+ let resourceName = ''; + let kind: 'application' | 'service' | 'database' = 'application'; + try { + const app = await getApplicationInWorkspace(uuid, principal.workspace); + if (app) { resourceName = app.name; kind = 'application'; } + } catch {} + if (!resourceName) { + try { + const svc = await getServiceInWorkspace(uuid, principal.workspace); + if (svc) { resourceName = svc.name; kind = 'service'; } + } catch {} + } + if (!resourceName) { + try { + const db = await getDatabaseInWorkspace(uuid, principal.workspace); + if (db) { resourceName = db.name; kind = 'database'; } + } catch {} + } + if (!resourceName) { + return NextResponse.json({ error: 'Resource not found in this workspace' }, { status: 404 }); + } + + const wipeVolumes = params.wipeVolumes === true || params.wipeVolumes === 'true'; + // All Coolify-managed containers for a resource carry its UUID as a + // suffix on the container name (e.g. postgres-<uuid>, twenty-<uuid>, + // worker-<uuid>). One docker rm -f against any name ending in -<uuid> + // catches every container in the stack. + const filter = `name=-${uuid}$`; + const cmd = wipeVolumes + ? `docker ps -a --filter '${filter}' -q | xargs -r docker rm -f -v` + : `docker ps -a --filter '${filter}' -q | xargs -r docker rm -f`; + + let removed: string[] = []; + let stderr = ''; + try { + const result = await runOnCoolifyHost( + `docker ps -a --filter '${filter}' --format '{{.Names}}' | tee /tmp/unstick-${uuid}.txt; ` + cmd + ); + removed = (result.stdout || '').split('\n').filter(Boolean).filter((l) => l.includes(`-${uuid}`)); + stderr = result.stderr || ''; + } catch (e) { + return NextResponse.json({ + error: `Failed to clean orphan containers: ${e instanceof Error ? e.message : String(e)}`, + }, { status: 500 }); + } + + return NextResponse.json({ + result: { + uuid, + name: resourceName, + kind, + removedContainers: removed, + wipeVolumes, + stderr: stderr || undefined, + summaryHint: + removed.length === 0 + ? 
`No orphan containers found for ${resourceName} (uuid ${uuid}). The conflict may be elsewhere — check apps_logs.` + : `Cleaned ${removed.length} orphan container(s) for ${resourceName}: ${removed.join(', ')}. Now call apps_deploy { uuid: "${uuid}" } to bring the stack back up. Do NOT delete the service.`, + }, + }); +} diff --git a/lib/ai/vibn-tools.ts b/lib/ai/vibn-tools.ts index 896c1ee9..7eebc045 100644 --- a/lib/ai/vibn-tools.ts +++ b/lib/ai/vibn-tools.ts @@ -244,6 +244,27 @@ Auto-domain {name}.{workspace}.vibnai.com is assigned automatically.`, required: ['uuid', 'fqdn', 'publicAppName'], }, }, + { + name: 'apps_unstick', + description: `Recover a service stuck on a Docker "container name already in use" conflict. Force-removes orphan containers (everything matching name suffix -<uuid>) so the next apps_deploy can boot clean. + +USE THIS — DO NOT delete-and-recreate the service. Deleting and re-creating produces a NEW uuid + NEW container names, which side-steps the conflict but leaves the orphan running AND forks a duplicate copy of the stack. We've burned ourselves on this before (4 orphan twenty-* services, 12GB RAM eaten). + +Recipe when a deploy fails with "Conflict. The container name X is already in use": + 1. apps_unstick { uuid: "<uuid>" } + 2. apps_deploy { uuid: "<uuid>" } + 3. apps_get { uuid: "<uuid>" } to confirm fqdn / status. + +Pass wipeVolumes: true ONLY if the user explicitly said "nuke the data".`, + parameters: { + type: 'OBJECT', + properties: { + uuid: { type: 'STRING', description: 'The Coolify service / app / database UUID.' }, + wipeVolumes: { type: 'BOOLEAN', description: 'If true, also remove anonymous volumes (data loss). Default false.' }, + }, + required: ['uuid'], + }, + }, { name: 'apps_templates_list', description: 'Browse the Coolify one-click template catalog (320+ apps: CRMs, AI tools, CMSes, dashboards, databases). Each is deployable via apps_create with { template: slug }.',