/**
 * Surgical post-deploy fixes for Coolify-managed Services.
 *
 * Why this exists
 * ---------------
 * Coolify's service-template deploy pipeline gets us 90% of the way:
 * it generates a docker-compose.yml + .env, runs `docker compose up`,
 * sets up volumes, and writes Traefik labels. But for many templates
 * (including the popular twenty/n8n/ghost/etc.) it consistently fails
 * to do three host-level things that the public REST API does NOT
 * expose:
 *
 *   1. Rewrite the auto-generated `SERVICE_FQDN_*` / `SERVICE_URL_*`
 *      env vars from sslip.io defaults to the user's real FQDN. The
 *      user's domain is correctly stored on `service.applications[].fqdn`
 *      (so Traefik routing rules use it), but the env vars that the
 *      app embeds into its frontend bundle (e.g. Twenty's SERVER_URL)
 *      keep pointing at sslip.io. Result: the SPA loads on real HTTPS
 *      then makes XHRs to insecure sslip.io URLs → "Mixed Content"
 *      errors and the app appears broken.
 *
 *   2. Generate the `traefik.http.services.<svc>.loadbalancer.server.port`
 *      label. Without it Traefik logs `error: port is missing` and
 *      returns 503 on every request.
 *
 *   3. Connect `coolify-proxy` to the resource's project network.
 *      Coolify generates a label `caddy_ingress_network=<uuid>`
 *      hinting that the proxy SHOULD live there, but never actually
 *      runs `docker network connect`. Result: even if Traefik
 *      discovers the right routing rules, it can't reach the upstream
 *      container.
 *
 * This module fixes all three after Coolify's queue finishes its work.
 *
 * Permissions model
 * -----------------
 * The `vibn-logs` SSH user has docker-group membership but no shell
 * sudo and no read access to `/data/coolify/services/<uuid>/` (Coolify
 * chmods that to 0700 root). We work around both by running a one-shot
 * `python:alpine` container that bind-mounts the path. The docker
 * daemon runs as root so it can read the directory; vibn-logs only
 * needs the docker socket.
 */

import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh';

/** Slug for the Coolify-managed compose dir. */
export type ResourceKind = 'service' | 'application';

/** Absolute host path of the rendered compose dir for a resource. */
function composeDir(kind: ResourceKind, uuid: string): string {
  return kind === 'service'
    ? `/data/coolify/services/${uuid}`
    : `/data/coolify/applications/${uuid}`;
}

/** Shell-quote a single argument as a POSIX single-quoted string. */
function sq(s: string): string {
  return `'${String(s).replace(/'/g, `'\\''`)}'`;
}

/** Turn an unknown thrown value into a human-readable detail string. */
function errorDetail(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Strip a URL scheme and trailing slashes from a host name. Callers are
 * told not to pass a scheme, but doing so would otherwise produce values
 * like `https://https://host` in the rewritten env vars.
 */
function normalizeFqdn(fqdn: string): string {
  return fqdn
    .trim()
    .replace(/^[a-z][a-z0-9+.-]*:\/\//i, '')
    .replace(/\/+$/, '');
}

// ─────────────────────────────────────────────────────────────────────
// Manual recovery helpers (apps.containers.up / .ps)
// ─────────────────────────────────────────────────────────────────────

/**
 * Run a `docker compose` subcommand inside the rendered compose dir
 * via a one-shot `docker:cli` container. Used by `apps.containers.up`
 * and `apps.containers.ps` for manual user recovery.
 *
 * Note: the *deployment* path (apps.create) no longer uses this
 * helper. apps.create lets Coolify's own queue do the deploy, then
 * applies the post-deploy fixes via `applyCoolifyPostDeployFixes`.
 *
 * @param kind Which Coolify resource tree the compose dir lives under.
 * @param uuid Coolify resource UUID (the compose dir name).
 * @param args Arguments passed after `docker compose`; each one is shell-quoted.
 * @param opts `timeoutMs` overrides the default 10 minute timeout.
 * @returns The raw SSH result of the `docker run` invocation.
 */
async function composeRun(
  kind: ResourceKind,
  uuid: string,
  args: string[],
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const dir = composeDir(kind, uuid);
  const cmd = [
    'docker', 'run', '--rm',
    '-v', sq(`${dir}:/work`),
    '-w', '/work',
    '-v', '/var/run/docker.sock:/var/run/docker.sock',
    '--network', 'host',
    'docker:cli',
    'compose',
    ...args.map(sq),
  ].join(' ');
  return runOnCoolifyHost(cmd, { timeoutMs: opts.timeoutMs ?? 600_000, maxBytes: 2_000_000 });
}

/** `docker compose up -d` — exposed as `apps.containers.up` for manual user recovery. */
export async function composeUp(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  return composeRun(kind, uuid, ['up', '-d', '--remove-orphans'], opts);
}

/** `docker compose down` — stops + removes containers; volumes preserved. */
export async function composeDown(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  return composeRun(kind, uuid, ['down'], opts);
}

/** `docker compose ps -a` — exposed as `apps.containers.ps`. */
export async function composePs(
  kind: ResourceKind,
  uuid: string,
): Promise<CoolifySshResult> {
  return composeRun(kind, uuid, ['ps', '-a', '--format', 'table'], { timeoutMs: 30_000 });
}

/**
 * Verify the rendered compose dir exists (i.e. contains a
 * `docker-compose.yml`). Returns a friendly `false` when it is missing
 * instead of surfacing an opaque ENOENT.
 */
export async function composeDirExists(
  kind: ResourceKind,
  uuid: string,
): Promise<boolean> {
  const dir = composeDir(kind, uuid);
  const cmd = `docker run --rm -v ${sq(`${dir}:/w`)} alpine sh -c 'test -f /w/docker-compose.yml && echo OK || echo MISSING'`;
  const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 });
  return r.stdout.trim().endsWith('OK');
}

// ─────────────────────────────────────────────────────────────────────
// Post-deploy fixes (apps.create's reliability layer)
// ─────────────────────────────────────────────────────────────────────

export interface CoolifyPostDeployOptions {
  /** Coolify service UUID. */
  uuid: string;
  /** Real custom FQDN, e.g. "crm.mark.vibnai.com" — must NOT include scheme. */
  fqdn: string;
  /** Compose service name of the user-facing app, e.g. "twenty". */
  publicAppName: string;
  /**
   * HTTP port the public app listens on inside the container.
   * If omitted, we try to detect it from `.env` (looking for
   * `SERVICE_FQDN_<APP>_<PORT>`). Falls back to 3000.
   */
  port?: number;
}

export interface CoolifyPostDeployResult {
  ok: boolean;
  steps: {
    envRewrite: { ok: boolean; detail: string };
    portLabel: { ok: boolean; detail: string };
    proxyNetwork: { ok: boolean; detail: string };
    recreate: { ok: boolean; detail: string };
    proxyRestart: { ok: boolean; detail: string };
  };
}

/**
 * Build a shell command that pipes a Python script (UTF-8 bytes,
 * base64-encoded) into a one-shot `python:3-alpine` container that
 * mounts the resource's compose dir at /work and exposes the inputs as
 * env vars. We use base64 to sidestep all shell-escaping issues with
 * python quoting.
 *
 * @param script Python source to execute via `python -` (stdin).
 * @param env Environment variables passed to the container with `-e`.
 * @param dir Host directory bind-mounted at `/work`.
 * @param networkAttach When true, also bind-mounts the docker socket.
 */
function buildPythonRunner(
  script: string,
  env: Record<string, string>,
  dir: string,
  networkAttach = false,
): string {
  const b64 = Buffer.from(script, 'utf8').toString('base64');
  const envFlags = Object.entries(env)
    .map(([k, v]) => `-e ${sq(`${k}=${v}`)}`)
    .join(' ');
  // We need a Python image with sed-style file editing. python:3-alpine
  // is ~50MB and ships with regex + os out of the box.
  return [
    `echo ${sq(b64)} | base64 -d |`,
    'docker run --rm -i',
    `-v ${sq(`${dir}:/work`)}`,
    networkAttach ? '-v /var/run/docker.sock:/var/run/docker.sock' : '',
    envFlags,
    'python:3-alpine',
    'python -',
  ].filter(Boolean).join(' ');
}

/**
 * Python editor run inside the one-shot container (steps 1 + 2 fused).
 *
 * Inputs (env vars): NEW_FQDN, APP, UUID, PORT (empty string = auto-detect).
 * Output (stdout): an `ENV_CHANGES:<n>` section followed by a
 * `LABEL_CHANGES:` section; a line containing `WARN:` means the port
 * label could not be injected.
 *
 * NOTE: this is a JS template literal — backslashes are doubled so that
 * Python receives a single backslash.
 */
const EDITOR_SCRIPT = `
import os, re

env_file = "/work/.env"
compose_file = "/work/docker-compose.yml"
fqdn = os.environ["NEW_FQDN"]
app = os.environ["APP"]  # e.g. "twenty"
APP = app.upper()
uuid = os.environ["UUID"]
port = os.environ.get("PORT", "").strip()  # empty => detect from .env
detected_port = ""

env_changes = []
if os.path.exists(env_file):
    with open(env_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    out = []
    for line in lines:
        new = line
        # SERVICE_FQDN_<APP>=
        if re.match(rf"^SERVICE_FQDN_{re.escape(APP)}=", line):
            new = f"SERVICE_FQDN_{APP}={fqdn}\\n"
        # SERVICE_URL_<APP>=
        elif re.match(rf"^SERVICE_URL_{re.escape(APP)}=", line):
            new = f"SERVICE_URL_{APP}=https://{fqdn}\\n"
        else:
            # SERVICE_FQDN_<APP>_<PORT>=
            m = re.match(rf"^SERVICE_FQDN_{re.escape(APP)}_(\\d+)=", line)
            if m:
                new = f"SERVICE_FQDN_{APP}_{m.group(1)}={fqdn}:{m.group(1)}\\n"
                if not detected_port:
                    detected_port = m.group(1)
            else:
                # SERVICE_URL_<APP>_<PORT>=
                m = re.match(rf"^SERVICE_URL_{re.escape(APP)}_(\\d+)=", line)
                if m:
                    new = f"SERVICE_URL_{APP}_{m.group(1)}=https://{fqdn}:{m.group(1)}\\n"
        if new != line:
            env_changes.append(line.strip() + " => " + new.strip())
        out.append(new)
    with open(env_file, "w", encoding="utf-8") as f:
        f.writelines(out)

if not port:
    port = detected_port or "3000"

# Inject port label into compose if missing.
label_changes = []
svc_id = f"{app}-svc-{uuid}"
needed_router_svc = f"traefik.http.routers.https-0-{uuid}-{app}.service={svc_id}"
needed_loadbalance = f"traefik.http.services.{svc_id}.loadbalancer.server.port={port}"
http_router_svc = f"traefik.http.routers.http-0-{uuid}-{app}.service={svc_id}"

with open(compose_file, "r", encoding="utf-8") as f:
    s = f.read()

if needed_loadbalance not in s:
    # Anchor: the existing tls=true label for the https router. We match
    # the whole list-item line so the injected labels reuse its exact
    # indentation and quoting and the YAML stays valid.
    anchor = f"traefik.http.routers.https-0-{uuid}-{app}.tls=true"
    pattern = r"^([ \\t]*-[ \\t]+)(['\\x22]?)" + re.escape(anchor) + r"\\2[ \\t]*$"
    m = re.search(pattern, s, re.MULTILINE)
    if m:
        lead, quote = m.group(1), m.group(2)
        extra = "".join(
            "\\n" + lead + quote + label + quote
            for label in (http_router_svc, needed_router_svc, needed_loadbalance)
        )
        # Only the first match is touched, i.e. the public app's service block.
        s = s[:m.end()] + extra + s[m.end():]
        with open(compose_file, "w", encoding="utf-8") as f:
            f.write(s)
        label_changes.append(f"injected loadbalancer.server.port={port}")
    else:
        label_changes.append(f"WARN: anchor '{anchor}' not found; label NOT injected")
else:
    label_changes.append("loadbalancer.server.port already present")

print("ENV_CHANGES:" + str(len(env_changes)))
for c in env_changes:
    print("  " + c)
print("LABEL_CHANGES:")
for c in label_changes:
    print("  " + c)
`;

/**
 * Apply the three post-deploy fixes to a freshly-deployed Coolify
 * service so the user-facing URL works on the very first hit.
 *
 * Intended to be safe to call multiple times: the env rewrite produces
 * the same values on a re-run, the label injection is skipped when the
 * label is already present, and an already-attached proxy network is
 * treated as success.
 *
 * Never throws for step failures: every step is attempted and its
 * outcome is recorded in `steps`; `ok` is true only if all steps are.
 *
 * Sequencing:
 *   1. Rewrite .env's `SERVICE_FQDN_*` / `SERVICE_URL_*` (cosmetic for
 *      Traefik but critical for any frontend that bakes the URL into
 *      its bundle from these env vars at startup).
 *   2. Inject the missing `loadbalancer.server.port` label into the
 *      compose file.
 *   3. Connect coolify-proxy to the project network so Traefik can
 *      reach the public container by its compose name.
 *   4. `docker compose up -d --force-recreate <publicAppName>` — this
 *      applies the new env (step 1) and label (step 2) without
 *      touching internal services like postgres/redis (which would
 *      cause DNS collisions if their networks changed).
 *   5. `docker restart coolify-proxy` so Traefik re-discovers the
 *      newly-attached network and the recreated container's labels.
 *
 * @param opts Service UUID, real FQDN, public compose service name and optional port.
 * @returns Per-step outcome plus an aggregate `ok` flag.
 */
export async function applyCoolifyPostDeployFixes(
  opts: CoolifyPostDeployOptions,
): Promise<CoolifyPostDeployResult> {
  const { uuid, publicAppName, port } = opts;
  const fqdn = normalizeFqdn(opts.fqdn);
  const dir = composeDir('service', uuid);
  const result: CoolifyPostDeployResult = {
    ok: false,
    steps: {
      envRewrite: { ok: false, detail: '' },
      portLabel: { ok: false, detail: '' },
      proxyNetwork: { ok: false, detail: '' },
      recreate: { ok: false, detail: '' },
      proxyRestart: { ok: false, detail: '' },
    },
  };

  // ── Step 1+2 fused: rewrite .env + inject port label in one Python pass
  try {
    if (port !== undefined && !(Number.isInteger(port) && port > 0 && port < 65536)) {
      // Refuse to write a garbage label; the catch below records the failure.
      throw new Error(`invalid port: ${String(port)}`);
    }
    const cmd = buildPythonRunner(
      EDITOR_SCRIPT,
      {
        NEW_FQDN: fqdn,
        APP: publicAppName,
        UUID: uuid,
        // Empty string asks the script to detect the port from .env (fallback 3000).
        PORT: port === undefined ? '' : String(port),
      },
      dir,
    );
    const r = await runOnCoolifyHost(cmd, { timeoutMs: 60_000 });
    if (r.code === 0) {
      const text = r.stdout.trim().slice(-1500);
      result.steps.envRewrite = { ok: true, detail: text };
      result.steps.portLabel = { ok: !text.includes('WARN:'), detail: text };
    } else {
      const detail = (r.stderr || r.stdout).trim().slice(-500);
      result.steps.envRewrite = { ok: false, detail };
      result.steps.portLabel = { ok: false, detail };
    }
  } catch (e: unknown) {
    const detail = errorDetail(e);
    result.steps.envRewrite = { ok: false, detail };
    result.steps.portLabel = { ok: false, detail };
  }

  // ── Step 3: attach coolify-proxy to project network
  try {
    // stderr is folded into stdout so we can recognise the "endpoint with
    // name coolify-proxy already exists in network" error, which is the
    // success-already-applied case. Any other non-zero exit (missing
    // network, missing proxy container, …) is a real failure.
    const r = await runOnCoolifyHost(
      `docker network connect ${sq(uuid)} coolify-proxy 2>&1`,
      { timeoutMs: 10_000 },
    );
    const text = (r.stdout || r.stderr).trim();
    const alreadyAttached = /already exists/i.test(text);
    if (alreadyAttached) {
      result.steps.proxyNetwork = { ok: true, detail: 'already attached' };
    } else if (r.code === 0) {
      result.steps.proxyNetwork = { ok: true, detail: text || 'attached' };
    } else {
      result.steps.proxyNetwork = {
        ok: false,
        detail: text.slice(-300) || `docker network connect exited with code ${String(r.code)}`,
      };
    }
  } catch (e: unknown) {
    result.steps.proxyNetwork = { ok: false, detail: errorDetail(e) };
  }

  // ── Step 4: recreate ONLY the public app to apply env+label changes
  //    (not the whole stack — postgres/redis/worker stay where they are)
  try {
    const r = await composeRun('service', uuid, ['up', '-d', '--force-recreate', publicAppName], {
      timeoutMs: 300_000,
    });
    // Strip ANSI colour sequences and control characters from compose output.
    const detail = (r.stderr || r.stdout)
      .replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '')
      .replace(/[\x00-\x08\x0B-\x1F]/g, '')
      .trim()
      .slice(-400);
    // compose returns 0 on success, non-zero on partial failure;
    // sidecar `depends_on` timeouts can produce a non-zero exit
    // even though the public container started successfully, so we
    // decide success by probing for the running container instead.
    const containerName = `${publicAppName}-${uuid}`;
    const probe = await runOnCoolifyHost(
      `docker ps --filter ${sq(`name=${containerName}`)} --format '{{.Names}}'`,
      { timeoutMs: 8_000 },
    );
    const running = probe.stdout.trim().length > 0;
    result.steps.recreate = {
      ok: running,
      detail: running ? `${containerName} running` : detail,
    };
  } catch (e: unknown) {
    result.steps.recreate = { ok: false, detail: errorDetail(e) };
  }

  // ── Step 5: nudge Traefik to re-discover via proxy restart
  try {
    const r = await runOnCoolifyHost(`docker restart coolify-proxy`, { timeoutMs: 30_000 });
    result.steps.proxyRestart = {
      ok: r.code === 0,
      detail: r.code === 0 ? 'restarted' : (r.stderr || r.stdout).trim().slice(-200),
    };
  } catch (e: unknown) {
    result.steps.proxyRestart = { ok: false, detail: errorDetail(e) };
  }

  result.ok = Object.values(result.steps).every(s => s.ok);
  return result;
}