diff --git a/app/api/mcp/route.ts b/app/api/mcp/route.ts index 96f1a699..79c2935c 100644 --- a/app/api/mcp/route.ts +++ b/app/api/mcp/route.ts @@ -32,7 +32,8 @@ import { isCoolifySshConfigured, runOnCoolifyHost } from '@/lib/coolify-ssh'; import { composeUp, composePs, - attachToCoolifyProxyNetwork, + applyCoolifyPostDeployFixes, + type CoolifyPostDeployResult, type ResourceKind, } from '@/lib/coolify-compose'; import { listContainersForApp } from '@/lib/coolify-containers'; @@ -91,7 +92,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com'; export async function GET() { return NextResponse.json({ name: 'vibn-mcp', - version: '2.4.4', + version: '2.4.5', authentication: { scheme: 'Bearer', tokenPrefix: 'vibn_sk_', @@ -879,10 +880,16 @@ async function toolAppsCreate(principal: Principal, params: Record) } let started = false; - let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed'; + let reachable = false; + let appStatus = 'unknown'; + let postDeploy: CoolifyPostDeployResult | null = null; let startDiag = ''; if (params.instantDeploy !== false) { - ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid)); + ({ started, reachable, appStatus, postDeploy, diag: startDiag } = await ensureServiceReachable({ + uuid: created.uuid, + fqdn, + publicAppName: templateSlug, + })); } return NextResponse.json({ @@ -895,11 +902,15 @@ async function toolAppsCreate(principal: Principal, params: Record) template: templateSlug, urlsApplied, started, - startMethod, + reachable, + appStatus, + ...(postDeploy ? { postDeploy } : {}), ...(startDiag ? { startDiag } : {}), - note: started - ? 'Primary containers are up. First boot may take 1-5 min while images finish pulling and migrations run; use apps.logs to monitor. If startDiag mentions a sidecar dependency timeout (workers, schedulers), call apps.containers.up again once the primary is healthy to bring those up.' - : 'Service created but no containers started. 
Call apps.containers.up to retry; check apps.containers.ps and apps.logs to diagnose.', + note: reachable + ? `Reachable on https://${fqdn}. First boot may continue migrations in the background — check apps.logs if any feature seems missing.` + : started + ? `Containers are healthy but https://${fqdn} did not return 2xx/3xx yet. Wait 30-60s for Traefik to fully discover labels, then retry. If still failing, inspect postDeploy.steps for which fix didn't apply, then call apps.logs and apps.containers.ps.` + : `Public app did not become healthy. Use apps.containers.ps and apps.logs to diagnose. Most common cause: image pull is still in progress (first deploy can take 5-10 min for large images like twentycrm/twenty).`, }, }); } @@ -956,11 +967,23 @@ async function toolAppsCreate(principal: Principal, params: Record) } } + // composeRaw is user-supplied — we can't reliably guess the public + // app name (the user may have any compose service layout). Best + // effort: use the app name as the public app name, which works for + // single-container composes. let started = false; - let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed'; + let reachable = false; + let appStatus = 'unknown'; + let postDeploy: CoolifyPostDeployResult | null = null; let startDiag = ''; if (params.instantDeploy !== false) { - ({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid)); + const publicAppName = String(params.publicAppName ?? appName); + ({ started, reachable, appStatus, postDeploy, diag: startDiag } = await ensureServiceReachable({ + uuid: created.uuid, + fqdn, + publicAppName, + port: params.port ? Number(params.port) : undefined, + })); } return NextResponse.json({ @@ -971,9 +994,13 @@ async function toolAppsCreate(principal: Principal, params: Record) url: `https://${fqdn}`, resourceType: 'service', started, - startMethod, + reachable, + appStatus, + ...(postDeploy ? { postDeploy } : {}), ...(startDiag ? 
{ startDiag } : {}), - note: 'Domain routing for compose services must be configured after initial startup — set SERVER_URL env to the desired URL, then call apps.containers.up to apply.', + note: reachable + ? `Reachable on https://${fqdn}.` + : `Domain routing for custom compose services depends on knowing which docker-compose service is the public-facing one. Pass publicAppName= and port= on apps.create to enable post-deploy patching, or set them manually.`, }, }); } @@ -1223,106 +1250,138 @@ async function toolAppsTemplatesSearch(params: Record) { } /** - * Ensure a Coolify Service is actually running (containers exist and - * are healthy/starting), with a fallback path for Coolify's flaky - * queued-start worker. + * Bring a Coolify Service to a publicly-reachable state. * - * Strategy: - * 1. Call POST /services/{uuid}/start so Coolify's records show - * "starting" and any internal hooks fire. - * 2. Wait briefly, then probe the host for any container belonging - * to this service via `docker ps --filter name={uuid}`. - * 3. If no containers materialised, run `docker compose up -d` - * directly via SSH against the rendered compose dir. This is - * the same command Coolify's worker would run; we just bypass - * the unreliable queue. + * v2.4.5 architecture + * -------------------- + * Earlier versions ran `docker compose up -d` over SSH as a fallback + * when Coolify's queue stalled. That worked for "containers running" + * but caused two cascading bugs because it bypassed Coolify's full + * deploy pipeline: + * - Internal services (Postgres, Redis) ended up on the shared + * `coolify` Docker network, where DNS aliases for `postgres`/ + * `redis` collide with Coolify's own `coolify-db`/`coolify-redis` + * containers — Twenty's `postgres://postgres:5432/twenty-db` + * resolves to the wrong DB and fails auth. + * - The proxy-network attach we did in our SSH path attached EVERY + * container, magnifying the same DNS collision. 
/**
 * Bring a Coolify Service to a publicly-reachable state.
 *
 * The deploy itself is left to Coolify's queue; this function only waits
 * for the public application to come up, applies the host-level post-deploy
 * fixes, and finally probes the public URL.
 *
 * Steps:
 *   1. `startService(uuid)` — a failure here is logged and otherwise ignored;
 *      the status poll below decides whether the service actually came up.
 *   2. Poll `service.applications[*].status` every 8s until the public app
 *      (matched by name, else the first application) reports
 *      `running:healthy`, the timeout elapses, or the app is still `exited`
 *      after a 90s grace period.
 *   3. If SSH to the Coolify host is configured, run
 *      `applyCoolifyPostDeployFixes` (env rewrite, port label, proxy network
 *      attach, recreate of the public app, proxy restart).
 *   4. Probe `https://<fqdn>` (redirects not followed, 12s timeout per try).
 *
 * @param opts.uuid            Coolify service UUID.
 * @param opts.fqdn            Public host name without scheme.
 * @param opts.publicAppName   Name of the user-facing application inside the service.
 * @param opts.port            Optional container port forwarded to the post-deploy fixes.
 * @param opts.healthTimeoutMs Max wall-clock wait for a healthy status (default 8 min).
 * @returns `started` is true when the last observed status begins with
 *          "running"; `reachable` is true when the probe returned 2xx/3xx;
 *          `diag` carries the probe outcome and any post-deploy error.
 */
async function ensureServiceReachable(opts: {
  uuid: string;
  fqdn: string;
  publicAppName: string;
  port?: number;
  /** Max wall-clock time to wait for Coolify to bring containers healthy. */
  healthTimeoutMs?: number;
}): Promise<{
  started: boolean;
  reachable: boolean;
  appStatus: string;
  postDeploy: CoolifyPostDeployResult | null;
  diag: string;
}> {
  const { uuid, fqdn, publicAppName, port, healthTimeoutMs = 8 * 60_000 } = opts;
  const pollIntervalMs = 8_000;
  const exitedGraceMs = 90_000;
  const sleep = (ms: number): Promise<void> =>
    new Promise<void>(resolve => setTimeout(resolve, ms));
  const describeError = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  try {
    await startService(uuid);
  } catch (e) {
    console.warn('[ensureServiceReachable] startService failed', e);
  }

  // Poll the per-application status rather than the service-level status.
  const startedAt = Date.now();
  let appStatus = 'unknown';
  while (Date.now() - startedAt < healthTimeoutMs) {
    try {
      const svc = (await getService(uuid)) as unknown as {
        applications?: Array<{ name?: string; status?: string }>;
      };
      const apps = svc.applications ?? [];
      const target = apps.find(a => a.name === publicAppName) ?? apps[0];
      appStatus = target?.status ?? 'unknown';
      if (/^running:healthy/i.test(appStatus)) break;
      // Only "exited" is short-circuited: give it 90s to transition out
      // before declaring failure instead of waiting the full timeout.
      if (/^exited/i.test(appStatus) && Date.now() - startedAt > exitedGraceMs) break;
    } catch (e) {
      console.warn('[ensureServiceReachable] status probe failed', e);
    }
    // Never sleep past the deadline.
    const remainingMs = healthTimeoutMs - (Date.now() - startedAt);
    if (remainingMs <= 0) break;
    await sleep(Math.min(pollIntervalMs, remainingMs));
  }

  const started = /^running/i.test(appStatus);
  if (!started) {
    // Report the time actually waited — the loop may have given up early.
    const waitedSec = Math.round((Date.now() - startedAt) / 1000);
    return {
      started: false,
      reachable: false,
      appStatus,
      postDeploy: null,
      diag: `Public app "${publicAppName}" did not become healthy within ${waitedSec}s (status=${appStatus}). Use apps.containers.ps and apps.logs to diagnose.`,
    };
  }

  // Post-deploy fixes need SSH to the Coolify host; skip them otherwise.
  let postDeploy: CoolifyPostDeployResult | null = null;
  let postDeployDiag = '';
  if (isCoolifySshConfigured()) {
    try {
      postDeploy = await applyCoolifyPostDeployFixes({ uuid, fqdn, publicAppName, port });
    } catch (e) {
      console.warn('[ensureServiceReachable] post-deploy fix failed', e);
      // Surface the failure to the caller instead of only logging it.
      postDeployDiag = `post-deploy fixes failed: ${describeError(e)}`;
    }
  }

  // Best-effort reachability probe; a non-2xx/3xx result is not treated as
  // a deploy failure. When the post-deploy step ran it has just restarted
  // the proxy, so allow a few retries instead of racing the restart.
  const url = `https://${fqdn}`;
  const maxAttempts = postDeploy ? 3 : 1;
  let reachable = false;
  let probeDiag = '';
  for (let attempt = 1; attempt <= maxAttempts && !reachable; attempt++) {
    if (attempt > 1) await sleep(5_000);
    try {
      const status = await probeStatusOnce(url, 12_000);
      reachable = status >= 200 && status < 400;
      probeDiag = `GET ${url} → ${status}`;
    } catch (e) {
      probeDiag = `GET probe failed: ${describeError(e)}`;
    }
  }

  return {
    started: true,
    reachable,
    appStatus,
    postDeploy,
    diag: [probeDiag, postDeployDiag].filter(Boolean).join('; '),
  };
}

/** Issue one GET (redirects not followed) with a hard timeout; resolves to the HTTP status. */
async function probeStatusOnce(url: string, timeoutMs: number): Promise<number> {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
  try {
    const res = await fetch(url, { signal: ctrl.signal, redirect: 'manual' });
    return res.status;
  } finally {
    // Always release the timer, including when fetch rejects.
    clearTimeout(timer);
  }
}
*/ diff --git a/lib/coolify-compose.ts b/lib/coolify-compose.ts index 99f922fa..152a3339 100644 --- a/lib/coolify-compose.ts +++ b/lib/coolify-compose.ts @@ -1,31 +1,45 @@ /** - * Bring a Coolify Service or compose Application up via raw - * `docker compose up -d`. + * Surgical post-deploy fixes for Coolify-managed Services. * * Why this exists * --------------- - * Coolify's `POST /services/{uuid}/start` and `POST /deploy` endpoints - * write the rendered docker-compose.yml + .env to - * `/data/coolify/services/{uuid}/` (or `applications/{uuid}/` for - * compose apps), then enqueue a Laravel job to run - * `docker compose up -d`. In practice that worker queue is unreliable: - * it routinely returns "Service starting request queued" and then - * never actually invokes docker compose. The user's stack just sits - * there with rendered files and no containers. + * Coolify's service-template deploy pipeline gets us 90% of the way: + * it generates a docker-compose.yml + .env, runs `docker compose up`, + * sets up volumes, and writes Traefik labels. But for many templates + * (including the popular twenty/n8n/ghost/etc.) it consistently fails + * to do three host-level things that the public REST API does NOT + * expose: * - * For a hands-off SaaS we can't ship that experience. This helper - * does the work directly via SSH, so a single MCP `apps.create` call - * really does leave a running app. + * 1. Rewrite the auto-generated `SERVICE_FQDN_*` / `SERVICE_URL_*` + * env vars from sslip.io defaults to the user's real FQDN. The + * user's domain is correctly stored on `service.applications[].fqdn` + * (so Traefik routing rules use it), but the env vars that the + * app embeds into its frontend bundle (e.g. Twenty's SERVER_URL) + * keep pointing at sslip.io. Result: SPA loads on real HTTPS + * then makes XHRs to insecure sslip.io URLs → "Mixed Content" + * errors and the app appears broken. + * + * 2. 
Generate the `traefik.http.services..loadbalancer.server.port` + * label. Without it Traefik logs `error: port is missing` and + * returns 503 on every request. + * + * 3. Connect `coolify-proxy` to the resource's project network. + * Coolify generates a label `caddy_ingress_network=` + * hinting that the proxy SHOULD live there, but never actually + * runs `docker network connect`. Result: even if Traefik + * discovers the right routing rules, it can't reach the upstream + * container. + * + * This module fixes all three after Coolify's queue finishes its work. * * Permissions model * ----------------- - * The `vibn-logs` SSH user (created by deploy/setup-coolify-ssh.sh) - * is in the `docker` group but has no shell sudo. It also can't read - * `/data/coolify/services/` directly because Coolify chmods that to - * 700 root. We work around both constraints by running the docker - * CLI inside a one-shot container that bind-mounts the path. The - * docker daemon runs as root so it can read the directory; the - * `vibn-logs` user only needs `docker` socket access. + * The `vibn-logs` SSH user has docker-group membership but no shell + * sudo and no read access to `/data/coolify/services//` (Coolify + * chmods that to 0700 root). We work around both by running a one-shot + * `python:alpine` container that bind-mounts the path. The docker + * daemon runs as root so it can read the directory; vibn-logs only + * needs the docker socket. */ import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh'; @@ -34,7 +48,6 @@ import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh'; export type ResourceKind = 'service' | 'application'; function composeDir(kind: ResourceKind, uuid: string): string { - // Coolify v4 path layout — these are stable across the v4 line. return kind === 'service' ? 
`/data/coolify/services/${uuid}` : `/data/coolify/applications/${uuid}`; @@ -45,15 +58,18 @@ function sq(s: string): string { return `'${String(s).replace(/'/g, `'\\''`)}'`; } +// ───────────────────────────────────────────────────────────────────── +// Manual recovery helpers (apps.containers.up / .ps) +// ───────────────────────────────────────────────────────────────────── + /** - * Run a `docker compose` subcommand inside the rendered compose - * directory using a one-shot `docker:cli` container. Falls back to - * pulling the image on the first call. + * Run a `docker compose` subcommand inside the rendered compose dir + * via a one-shot `docker:cli` container. Used by `apps.containers.up` + * and `apps.containers.ps` for manual user recovery. * - * The `docker:cli` image (~50MB) is the official Docker CLI without - * the daemon. By bind-mounting the host docker socket it talks to - * the host's daemon, so containers it creates are first-class - * children of the same Docker engine — exactly what we want. + * Note: the *deployment* path (apps.create) no longer uses this + * helper. apps.create lets Coolify's own queue do the deploy, then + * applies the post-deploy fixes via `applyCoolifyPostDeployFixes`. */ async function composeRun( kind: ResourceKind, @@ -62,8 +78,6 @@ async function composeRun( opts: { timeoutMs?: number } = {}, ): Promise { const dir = composeDir(kind, uuid); - // Use --workdir + bind-mount so docker compose finds compose.yml + .env - // automatically. The `--rm` cleans the helper container after each call. const cmd = [ 'docker', 'run', '--rm', '-v', sq(`${dir}:/work`), @@ -76,72 +90,13 @@ async function composeRun( return runOnCoolifyHost(cmd, { timeoutMs: opts.timeoutMs ?? 600_000, maxBytes: 2_000_000 }); } -/** - * `docker compose up -d` for a Coolify service or compose app. - * - * Idempotent — Compose already-running containers are no-op'd. 
- * Returns the raw SSH result so callers can surface diagnostics on - * failure (most common: image-pull errors, port conflicts). - * - * After compose succeeds we also attach every stack container to the - * `coolify` proxy network. Coolify's UI-driven deploy does this as a - * post-step so Traefik can route public traffic to the container, but - * the rendered compose file only declares the service-private network. - * If we skip this step the stack runs fine on its own bridge but - * `crm.mark.vibnai.com` returns "no available server" from Traefik. - */ +/** `docker compose up -d` — exposed as `apps.containers.up` for manual user recovery. */ export async function composeUp( kind: ResourceKind, uuid: string, opts: { timeoutMs?: number } = {}, ): Promise { - const r = await composeRun(kind, uuid, ['up', '-d', '--remove-orphans'], opts); - // Best-effort: attach to the proxy network even if compose returned - // non-zero (sidecar `depends_on` timeouts still leave primary - // containers running, and we want them reachable). - await attachToCoolifyProxyNetwork(uuid).catch(() => { /* swallow */ }); - return r; -} - -/** - * Attach the public-facing containers of a Coolify resource to the - * `coolify` proxy network so Traefik can reach them. - * - * IMPORTANT: only attach containers that have Traefik labels. The - * coolify network is shared across the whole platform (it hosts - * coolify-db, coolify-redis, etc.) and Docker's embedded DNS resolves - * unqualified hostnames like `postgres` and `redis` to the FIRST - * container with that name on the network. If we attach Twenty's - * `postgres-` container to coolify, Twenty's - * `postgres://postgres:5432/...` connection string starts resolving - * to `coolify-db` instead, which fails auth (different password). - * - * Coolify's own deploy pipeline does the same selective attach — only - * the proxied container goes on the proxy network. Idempotent — - * already-attached containers are no-ops. 
- */ -export async function attachToCoolifyProxyNetwork( - uuid: string, -): Promise { - // List running containers on the resource's project network with - // their `traefik.enable` label. Only those with `traefik.enable=true` - // need to be reachable by the proxy. - const ls = await runOnCoolifyHost( - `docker ps --filter network=${uuid} --format '{{.Names}}|{{.Label "traefik.enable"}}'`, - { timeoutMs: 10_000 }, - ); - const names = ls.stdout - .split('\n') - .map(s => s.trim()) - .filter(Boolean) - .filter(line => line.endsWith('|true')) - .map(line => line.split('|')[0]); - if (names.length === 0) return; - // Attach each one. `|| true` so already-connected returns 0. - const attaches = names.map(n => - `docker network connect coolify ${sq(n)} 2>/dev/null || true`, - ).join(' && '); - await runOnCoolifyHost(attaches, { timeoutMs: 30_000 }); + return composeRun(kind, uuid, ['up', '-d', '--remove-orphans'], opts); } /** `docker compose down` — stops + removes containers; volumes preserved. */ @@ -153,7 +108,7 @@ export async function composeDown( return composeRun(kind, uuid, ['down'], opts); } -/** `docker compose ps -a` — useful for diagnosing why up didn't yield healthy containers. */ +/** `docker compose ps -a` — exposed as `apps.containers.ps`. */ export async function composePs( kind: ResourceKind, uuid: string, @@ -162,18 +117,275 @@ export async function composePs( } /** - * Verify the rendered compose dir exists before trying to run docker - * compose against it. Returns a friendly null-on-missing instead of - * an opaque ENOENT. + * Verify the rendered compose dir exists. Returns a friendly + * null-on-missing instead of an opaque ENOENT. */ export async function composeDirExists( kind: ResourceKind, uuid: string, ): Promise { - // We can't `ls` the dir directly (perm denied), but a docker bind-mount - // probe will fail-closed if the path is missing. 
const dir = composeDir(kind, uuid); const cmd = `docker run --rm -v ${sq(`${dir}:/w`)} alpine sh -c 'test -f /w/docker-compose.yml && echo OK || echo MISSING'`; const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 }); return r.stdout.trim().endsWith('OK'); } + +// ───────────────────────────────────────────────────────────────────── +// Post-deploy fixes (apps.create's reliability layer) +// ───────────────────────────────────────────────────────────────────── + +export interface CoolifyPostDeployOptions { + /** Coolify service UUID. */ + uuid: string; + /** Real custom FQDN, e.g. "crm.mark.vibnai.com" — must NOT include scheme. */ + fqdn: string; + /** Compose service name of the user-facing app, e.g. "twenty". */ + publicAppName: string; + /** + * HTTP port the public app listens on inside the container. + * If omitted, we try to detect it from `.env` (looking for + * `SERVICE_FQDN__`). Falls back to 3000. + */ + port?: number; +} + +export interface CoolifyPostDeployResult { + ok: boolean; + steps: { + envRewrite: { ok: boolean; detail: string }; + portLabel: { ok: boolean; detail: string }; + proxyNetwork: { ok: boolean; detail: string }; + recreate: { ok: boolean; detail: string }; + proxyRestart: { ok: boolean; detail: string }; + }; +} + +/** + * Embed a Python script (UTF-8 bytes, base64-encoded) as a here-doc + * arg to a docker-run that mounts the resource's compose dir at /work + * and exposes the inputs as env vars. We use base64 to sidestep all + * shell-escaping issues with python triple-quoted strings. + */ +function buildPythonRunner(script: string, env: Record, dir: string, networkAttach = false): string { + const b64 = Buffer.from(script, 'utf8').toString('base64'); + const envFlags = Object.entries(env) + .map(([k, v]) => `-e ${sq(`${k}=${v}`)}`) + .join(' '); + // We need a Python image with sed-style file editing. python:3-alpine + // is ~50MB and ships with regex + os out of the box. 
+ return [ + `echo ${sq(b64)} | base64 -d |`, + 'docker run --rm -i', + `-v ${sq(`${dir}:/work`)}`, + networkAttach ? '-v /var/run/docker.sock:/var/run/docker.sock' : '', + envFlags, + 'python:3-alpine', + 'python -', + ].filter(Boolean).join(' '); +} + +/** + * Apply the three post-deploy fixes to a freshly-deployed Coolify + * service so the user-facing URL works on the very first hit. + * + * Idempotent. Safe to call multiple times — each step detects + * whether the change is already in place and no-ops if so. + * + * Sequencing: + * 1. Rewrite .env's SERVICE_FQDN_* / SERVICE_URL_* (cosmetic for + * Traefik but critical for any frontend that bakes the URL into + * its bundle from these env vars at startup). + * 2. Inject the missing `loadbalancer.server.port` label into the + * compose file. + * 3. Connect coolify-proxy to the project network so Traefik can + * reach the public container by its compose name. + * 4. `docker compose up -d --force-recreate ` — this + * applies the new env (step 1) and label (step 2) without + * touching internal services like postgres/redis (which would + * cause DNS collisions if their networks changed). + * 5. `docker restart coolify-proxy` so Traefik re-discovers the + * newly-attached network and the recreated container's labels. 
/**
 * Apply the post-deploy fixes to a freshly-deployed Coolify service.
 *
 * Steps, each reported separately in `result.steps`:
 *   1. envRewrite   — rewrite `SERVICE_FQDN_<APP>*` / `SERVICE_URL_<APP>*`
 *                     in the rendered `.env` to the real FQDN.
 *   2. portLabel    — inject the `loadbalancer.server.port` label (plus the
 *                     router→service labels) into docker-compose.yml when
 *                     it is not already present.
 *   3. proxyNetwork — `docker network connect <uuid> coolify-proxy`.
 *   4. recreate     — `docker compose up -d --force-recreate <publicAppName>`
 *                     (only that one compose service is named).
 *   5. proxyRestart — `docker restart coolify-proxy`.
 *
 * A failing step never throws; it is recorded with `ok: false` and the
 * remaining steps still run. `result.ok` is true only if every step is ok.
 *
 * @param opts.uuid          Coolify service UUID (also used as the project network name).
 * @param opts.fqdn          Public host name without scheme.
 * @param opts.publicAppName Compose service name of the user-facing app.
 * @param opts.port          Container HTTP port; anything that is not an
 *                           integer in 1..65535 falls back to 3000.
 * @returns Per-step outcome with a short human-readable detail string.
 */
export async function applyCoolifyPostDeployFixes(
  opts: CoolifyPostDeployOptions,
): Promise<CoolifyPostDeployResult> {
  const { uuid, fqdn, publicAppName } = opts;
  const requestedPort = opts.port;
  // Guard against NaN / fractional / out-of-range ports ending up in a label.
  const port =
    typeof requestedPort === 'number' &&
    Number.isInteger(requestedPort) &&
    requestedPort > 0 &&
    requestedPort <= 65_535
      ? requestedPort
      : 3000;
  const dir = composeDir('service', uuid);
  const describeError = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  const result: CoolifyPostDeployResult = {
    ok: false,
    steps: {
      envRewrite: { ok: false, detail: '' },
      portLabel: { ok: false, detail: '' },
      proxyNetwork: { ok: false, detail: '' },
      recreate: { ok: false, detail: '' },
      proxyRestart: { ok: false, detail: '' },
    },
  };

  // publicAppName can originate from caller input and is used in shell
  // commands and as a compose service name. Refuse anything that is not a
  // plain identifier rather than relying on downstream quoting.
  if (!/^[A-Za-z0-9][A-Za-z0-9_.-]*$/.test(publicAppName)) {
    const detail = `invalid publicAppName ${JSON.stringify(publicAppName)}: expected [A-Za-z0-9][A-Za-z0-9_.-]*`;
    for (const step of Object.values(result.steps)) step.detail = detail;
    return result;
  }

  // ── Step 1+2 fused: rewrite .env + inject port label in one Python pass.
  // NOTE: this is a JS template literal — "\\n" / "\\d" reach Python as
  // "\n" / "\d".
  const editorScript = `
import os, re

env_file = "/work/.env"
compose_file = "/work/docker-compose.yml"
fqdn = os.environ["NEW_FQDN"]
app = os.environ["APP"]   # e.g. "twenty"
APP = app.upper()
uuid = os.environ["UUID"]
port = os.environ["PORT"]

# Matches SERVICE_FQDN_<APP>=, SERVICE_URL_<APP>= and their _<port> variants.
pattern = re.compile(rf"^SERVICE_(FQDN|URL)_{re.escape(APP)}(?:_(\\d+))?=")

env_changes = []
if os.path.exists(env_file):
    with open(env_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    out = []
    for line in lines:
        new = line
        m = pattern.match(line)
        if m:
            kind, suffix_port = m.group(1), m.group(2)
            key = "SERVICE_" + kind + "_" + APP
            value = fqdn if kind == "FQDN" else "https://" + fqdn
            if suffix_port:
                key += "_" + suffix_port
                value += ":" + suffix_port
            new = key + "=" + value + "\\n"
        if new != line:
            env_changes.append(line.strip() + " => " + new.strip())
        out.append(new)
    if env_changes:
        with open(env_file, "w", encoding="utf-8") as f:
            f.writelines(out)

# Inject port label into compose if missing.
label_changes = []
svc_id = f"{app}-svc-{uuid}"
needed_router_svc = f"traefik.http.routers.https-0-{uuid}-{app}.service={svc_id}"
needed_loadbalance = f"traefik.http.services.{svc_id}.loadbalancer.server.port={port}"
http_router_svc = f"traefik.http.routers.http-0-{uuid}-{app}.service={svc_id}"

with open(compose_file, "r", encoding="utf-8") as f:
    s = f.read()

if needed_loadbalance not in s:
    # Anchor: the existing tls=true label for the https router.
    anchor = f"traefik.http.routers.https-0-{uuid}-{app}.tls=true"
    idx = s.find(anchor)
    if idx != -1:
        # Copy the anchor line's own list-item prefix (indent, dash, optional
        # quote) and suffix so the new labels match the rendered YAML layout.
        line_start = s.rfind("\\n", 0, idx) + 1
        line_end = s.find("\\n", idx)
        if line_end == -1:
            line_end = len(s)
        prefix = s[line_start:idx]
        suffix = s[idx + len(anchor):line_end]
        extra = "".join(
            "\\n" + prefix + label + suffix
            for label in (http_router_svc, needed_router_svc, needed_loadbalance)
        )
        s = s[:line_end] + extra + s[line_end:]
        with open(compose_file, "w", encoding="utf-8") as f:
            f.write(s)
        label_changes.append(f"injected loadbalancer.server.port={port}")
    else:
        label_changes.append(f"WARN: anchor '{anchor}' not found; label NOT injected")
else:
    label_changes.append("loadbalancer.server.port already present")

print("ENV_CHANGES:" + str(len(env_changes)))
for c in env_changes:
    print("  " + c)
print("LABEL_CHANGES:")
for c in label_changes:
    print("  " + c)
`;

  try {
    const cmd = buildPythonRunner(
      editorScript,
      { NEW_FQDN: fqdn, APP: publicAppName, UUID: uuid, PORT: String(port) },
      dir,
    );
    const r = await runOnCoolifyHost(cmd, { timeoutMs: 60_000 });
    if (r.code === 0) {
      const text = r.stdout.trim().slice(-1500);
      result.steps.envRewrite = { ok: true, detail: text };
      // The script prints "WARN:" when the label anchor was not found.
      result.steps.portLabel = { ok: !text.includes('WARN:'), detail: text };
    } else {
      const detail = (r.stderr || r.stdout).trim().slice(-500);
      result.steps.envRewrite = { ok: false, detail };
      result.steps.portLabel = { ok: false, detail };
    }
  } catch (e) {
    const detail = describeError(e);
    result.steps.envRewrite = { ok: false, detail };
    result.steps.portLabel = { ok: false, detail };
  }

  // ── Step 3: attach coolify-proxy to the project network.
  try {
    // `|| true` keeps the exit code at 0, so success must be judged from the
    // output: `docker network connect` prints nothing on success and an
    // "already exists" error when the proxy is already attached. Any other
    // output (e.g. network not found) is a real failure.
    const r = await runOnCoolifyHost(
      `docker network connect ${sq(uuid)} coolify-proxy 2>&1 || true`,
      { timeoutMs: 10_000 },
    );
    const text = (r.stdout || r.stderr).trim();
    const alreadyAttached = /already exists/i.test(text);
    const attached = text === '';
    result.steps.proxyNetwork = {
      ok: attached || alreadyAttached,
      detail: alreadyAttached ? 'already attached' : attached ? 'attached' : text.slice(-200),
    };
  } catch (e) {
    result.steps.proxyNetwork = { ok: false, detail: describeError(e) };
  }

  // ── Step 4: recreate ONLY the public app to apply env+label changes
  //    (no other compose service is named in the command).
  try {
    const r = await composeRun('service', uuid, ['up', '-d', '--force-recreate', publicAppName], {
      timeoutMs: 300_000,
    });
    // Strip ANSI escapes / control chars so the detail serialises cleanly.
    const detail = (r.stderr || r.stdout)
      .replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '')
      .replace(/[\x00-\x08\x0B-\x1F]/g, '')
      .trim()
      .slice(-400);
    // The compose exit code alone is not trusted; check whether the public
    // container is actually listed as running. The filter value is
    // shell-quoted — it embeds publicAppName.
    const probe = await runOnCoolifyHost(
      `docker ps --filter ${sq(`name=${publicAppName}-${uuid}`)} --format '{{.Names}}'`,
      { timeoutMs: 8_000 },
    );
    const running = probe.stdout.trim().length > 0;
    result.steps.recreate = {
      ok: running,
      detail: running ? `${publicAppName}-${uuid} running` : detail,
    };
  } catch (e) {
    result.steps.recreate = { ok: false, detail: describeError(e) };
  }

  // ── Step 5: nudge Traefik to re-discover via proxy restart.
  // NOTE(review): presumably coolify-proxy is the host-wide proxy, so this
  // restart briefly affects every app on the host — confirm this is
  // acceptable, or skip it when steps 1-3 changed nothing.
  try {
    const r = await runOnCoolifyHost(`docker restart coolify-proxy`, { timeoutMs: 30_000 });
    result.steps.proxyRestart = {
      ok: r.code === 0,
      detail: r.code === 0 ? 'restarted' : (r.stderr || r.stdout).trim().slice(-200),
    };
  } catch (e) {
    result.steps.proxyRestart = { ok: false, detail: describeError(e) };
  }

  result.ok = Object.values(result.steps).every(s => s.ok);
  return result;
}