/**
 * Surgical post-deploy fix for Coolify-managed Services.
 *
 * Why this exists
 * ---------------
 * Coolify's service-template deploy pipeline gets us 99% of the way
 * — IF apps.create passes the upstream port in the URL it gives to
 * `setServiceDomains` (e.g. `https://crm.mark.vibnai.com:3000`).
 * With that port suffix Coolify auto-generates everything that
 * matters: the loadbalancer.server.port Traefik label, the rewritten
 * SERVICE_FQDN_* / SERVICE_URL_* env vars (no sslip.io
 * leakage), and the correct routing rules.
 *
 * The one thing Coolify still misses is connecting `coolify-proxy`
 * to the resource's project Docker network. Coolify writes a
 * `caddy_ingress_network=` hint label but never runs
 * `docker network connect`, so Traefik discovers the right routing
 * rules but cannot reach the upstream container — every request
 * returns Traefik 503.
 *
 * That's the entire purpose of this module: attach `coolify-proxy`
 * to the project network, then nudge Traefik to re-discover.
 *
 * History
 * -------
 * Versions 2.4.5 → 2.4.7 also rewrote `.env` and injected the
 * loadbalancer port label via an embedded Python script run inside a
 * `python:3-alpine` container. That code became unnecessary in 2.4.8
 * once we discovered the `:port` URL convention; it's been removed
 * along with the `python:alpine` SSH dependency.
 */

import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh';

/** Slug for the Coolify-managed compose dir. */
export type ResourceKind = 'service' | 'application';

/**
 * Marker echoed by the step-1 shell command only when
 * `docker network connect` exits 0. Lets us tell a real attach apart
 * from a swallowed failure while still keeping the overall command's
 * exit status at 0.
 */
const NETWORK_ATTACHED_MARKER = '__coolify_proxy_network_attached__';

/** Host path of the compose dir for a resource of the given kind. */
function composeDir(kind: ResourceKind, uuid: string): string {
  return kind === 'service'
    ? `/data/coolify/services/${uuid}`
    : `/data/coolify/applications/${uuid}`;
}

/** Shell-quote a single argument as a POSIX single-quoted string. */
function sq(s: string): string {
  // Close the quote, emit an escaped quote, reopen: ' -> '\''
  return `'${String(s).replace(/'/g, `'\\''`)}'`;
}

/** Normalise a caught value into a human-readable message. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

// ─────────────────────────────────────────────────────────────────────
// Manual recovery helpers (apps.containers.up / .ps)
// ─────────────────────────────────────────────────────────────────────

/**
 * Run a `docker compose` subcommand inside the rendered compose dir
 * via a one-shot `docker:cli` container. Used by `apps.containers.up`
 * and `apps.containers.ps` for manual user recovery.
 *
 * Note: the *deployment* path (apps.create) no longer uses this
 * helper. apps.create lets Coolify's own queue do the deploy, then
 * applies the post-deploy fixes via `applyCoolifyPostDeployFixes`.
 *
 * @param kind   Selects which Coolify data directory holds the compose dir.
 * @param uuid   Coolify resource UUID (the compose dir name).
 * @param args   Arguments appended after `compose`; each one is shell-quoted.
 * @param opts   `timeoutMs` overrides the default 600 000 ms timeout.
 * @returns Whatever `runOnCoolifyHost` reports for the command.
 */
async function composeRun(
  kind: ResourceKind,
  uuid: string,
  args: string[],
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const dir = composeDir(kind, uuid);
  const cmd = [
    'docker', 'run', '--rm',
    '-v', sq(`${dir}:/work`),
    '-w', '/work',
    '-v', '/var/run/docker.sock:/var/run/docker.sock',
    '--network', 'host',
    'docker:cli', 'compose',
    ...args.map(sq),
  ].join(' ');
  return runOnCoolifyHost(cmd, {
    timeoutMs: opts.timeoutMs ?? 600_000,
    maxBytes: 2_000_000,
  });
}

/** `docker compose up -d` — exposed as `apps.containers.up` for manual user recovery. */
export async function composeUp(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  return composeRun(kind, uuid, ['up', '-d', '--remove-orphans'], opts);
}

/** `docker compose down` — stops + removes containers; volumes preserved. */
export async function composeDown(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  return composeRun(kind, uuid, ['down'], opts);
}

/** `docker compose ps -a` — exposed as `apps.containers.ps`. */
export async function composePs(
  kind: ResourceKind,
  uuid: string,
): Promise<CoolifySshResult> {
  return composeRun(kind, uuid, ['ps', '-a', '--format', 'table'], { timeoutMs: 30_000 });
}

/**
 * Verify the rendered compose dir exists, i.e. that it contains a
 * `docker-compose.yml`. Resolves to a friendly `false` on missing
 * instead of surfacing an opaque ENOENT.
 *
 * @returns `true` when the probe's output ends with `OK`, otherwise `false`.
 */
export async function composeDirExists(
  kind: ResourceKind,
  uuid: string,
): Promise<boolean> {
  const dir = composeDir(kind, uuid);
  const cmd = `docker run --rm -v ${sq(`${dir}:/w`)} alpine sh -c 'test -f /w/docker-compose.yml && echo OK || echo MISSING'`;
  const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 });
  return r.stdout.trim().endsWith('OK');
}

// ─────────────────────────────────────────────────────────────────────
// Post-deploy fixes (apps.create's reliability layer)
// ─────────────────────────────────────────────────────────────────────

export interface CoolifyPostDeployOptions {
  /** Coolify service UUID. */
  uuid: string;
  /** Real custom FQDN, e.g. "crm.mark.vibnai.com" — must NOT include scheme. */
  fqdn: string;
  /** Compose service name of the user-facing app, e.g. "twenty". */
  publicAppName: string;
  /**
   * HTTP port the public app listens on inside the container. Optional
   * here — kept for back-compat and diagnostics; the actual port
   * routing is wired by Coolify itself based on the URL passed to
   * setServiceDomains, not by this helper.
   */
  port?: number;
}

export interface CoolifyPostDeployResult {
  /** True only when every step below reported `ok`. */
  ok: boolean;
  steps: {
    proxyNetwork: { ok: boolean; detail: string };
    proxyRestart: { ok: boolean; detail: string };
  };
}

/**
 * Apply the post-deploy fix to a freshly-deployed Coolify service so
 * the user-facing URL works on the very first hit.
 *
 * Idempotent. Safe to call multiple times. Coolify-version-tolerant —
 * if a future Coolify already attaches the proxy network itself,
 * step 1 reports "already attached" and step 2 is a harmless restart.
 *
 * Sequencing:
 *   1. `docker network connect <uuid> coolify-proxy` so Traefik can
 *      reach the public container by its compose name. This is the
 *      ONE thing Coolify omits despite writing the
 *      `caddy_ingress_network=` hint label.
 *   2. Background `docker restart coolify-proxy` (fired off via
 *      nohup) so Traefik re-discovers the newly-attached network. We
 *      can't restart it synchronously because coolify-proxy is the
 *      same gateway serving this very HTTP request — see step 2's
 *      comment for the gory detail.
 *
 * Never throws for step failures: each step's outcome (including any
 * error thrown by `runOnCoolifyHost`) is captured in the result.
 *
 * @param opts Only `uuid` is read; the remaining fields are accepted
 *             for back-compat.
 * @returns Per-step status plus an aggregate `ok`.
 */
export async function applyCoolifyPostDeployFixes(
  opts: CoolifyPostDeployOptions,
): Promise<CoolifyPostDeployResult> {
  const { uuid } = opts;
  const result: CoolifyPostDeployResult = {
    ok: false,
    steps: {
      proxyNetwork: { ok: false, detail: '' },
      proxyRestart: { ok: false, detail: '' },
    },
  };

  // ── Step 1: attach coolify-proxy to project network
  try {
    // `|| true` keeps the exit status at 0 so the "endpoint with name
    // coolify-proxy already exists in network" error (the
    // success-already-applied case) is not treated as a command
    // failure. The marker is echoed only when the connect itself
    // exits 0, so any OTHER error (missing network, missing proxy
    // container, …) is still detectable and reported as not-ok.
    const r = await runOnCoolifyHost(
      `(docker network connect ${sq(uuid)} coolify-proxy 2>&1 && echo ${NETWORK_ATTACHED_MARKER}) || true`,
      { timeoutMs: 10_000 },
    );
    const raw = (r.stdout || r.stderr).trim();
    const attached = raw.includes(NETWORK_ATTACHED_MARKER);
    const alreadyAttached = /already exists/i.test(raw);
    // Strip our marker so `detail` only carries docker's own output.
    const text = raw.replace(NETWORK_ATTACHED_MARKER, '').trim();

    let detail: string;
    if (alreadyAttached) {
      detail = 'already attached';
    } else if (attached) {
      detail = text || 'attached';
    } else {
      detail = text || 'docker network connect failed';
    }
    result.steps.proxyNetwork = { ok: attached || alreadyAttached, detail };
  } catch (e) {
    result.steps.proxyNetwork = { ok: false, detail: errorMessage(e) };
  }

  // ── Step 2: nudge Traefik to re-discover via proxy restart.
  //
  // CAUTION: coolify-proxy is the same gateway that's currently
  // serving this very HTTP request (the agent → vibnai.com call that
  // landed in this handler). If we run a synchronous `docker restart
  // coolify-proxy`, the connection is killed mid-flight and the agent
  // sees a curl error 16 (HTTP/2 framing) instead of our nicely
  // formatted result object. We fire-and-forget instead: the SSH
  // command returns within ~50ms, we finish the HTTP response, and
  // the proxy restarts ~3s later — by which time the response has
  // already been delivered and Traefik will re-discover labels for
  // any subsequent request.
  try {
    // stdin/stdout/stderr of the background job are all detached from
    // the SSH session (`</dev/null >/dev/null 2>&1`); otherwise the
    // session would stay open until the restart finishes.
    const r = await runOnCoolifyHost(
      `nohup sh -c '(sleep 3 && docker restart coolify-proxy) >/tmp/coolify-proxy-restart.log 2>&1' </dev/null >/dev/null 2>&1 &`,
      { timeoutMs: 8_000 },
    );
    const scheduled = r.code === 0;
    result.steps.proxyRestart = {
      ok: scheduled,
      detail: scheduled
        ? 'scheduled (background, +3s after response)'
        : (r.stderr || r.stdout).trim().slice(-200),
    };
  } catch (e) {
    result.steps.proxyRestart = { ok: false, detail: errorMessage(e) };
  }

  result.ok = Object.values(result.steps).every(s => s.ok);
  return result;
}