// File: vibn-frontend/lib/coolify-compose.ts (TypeScript, 235 lines, 8.9 KiB)

/**
* Surgical post-deploy fix for Coolify-managed Services.
*
* Why this exists
* ---------------
* Coolify's service-template deploy pipeline gets us 99% of the way
* — IF apps.create passes the upstream port in the URL it gives to
* `setServiceDomains` (e.g. `https://crm.mark.vibnai.com:3000`).
* With that port suffix Coolify auto-generates everything that
* matters: the loadbalancer.server.port Traefik label, the rewritten
* SERVICE_FQDN_<APP> / SERVICE_URL_<APP> env vars (no sslip.io
* leakage), and the correct routing rules.
*
* The one thing Coolify still misses is connecting `coolify-proxy`
* to the resource's project Docker network. Coolify writes a
* `caddy_ingress_network=<uuid>` hint label but never runs
* `docker network connect`, so Traefik discovers the right routing
* rules but cannot reach the upstream container — every request
* returns Traefik 503.
*
* That's the entire purpose of this module: attach `coolify-proxy`
* to the project network, then nudge Traefik to re-discover.
*
* History
* -------
* Versions 2.4.5 → 2.4.7 also rewrote `.env` and injected the
* loadbalancer port label via an embedded Python script run inside a
* `python:3-alpine` container. That code became unnecessary in 2.4.8
* once we discovered the `:port` URL convention; it's been removed
* along with the `python:alpine` SSH dependency.
*/
import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh';
/** Kind of Coolify resource; selects which compose root directory is used. */
export type ResourceKind = 'service' | 'application';

/**
 * Resolve the directory on the Coolify host that holds the rendered
 * compose files for a resource.
 *
 * @param kind - `'service'` maps to `/data/coolify/services`; any other
 *   value maps to `/data/coolify/applications`.
 * @param uuid - Resource UUID, appended verbatim as the final path
 *   segment (no validation or quoting happens here).
 * @returns The absolute compose directory path.
 */
function composeDir(kind: ResourceKind, uuid: string): string {
  if (kind === 'service') {
    return `/data/coolify/services/${uuid}`;
  }
  return `/data/coolify/applications/${uuid}`;
}
/**
 * Wrap a value in POSIX single quotes so a shell treats it as one
 * literal argument.
 *
 * Each embedded single quote is rewritten as `'\''` (close the quoted
 * string, emit an escaped quote, reopen the quoted string).
 *
 * @param s - Value to quote; coerced with `String()` first.
 * @returns The single-quoted shell word.
 */
function sq(s: string): string {
  const escaped = String(s).split("'").join("'\\''");
  return "'" + escaped + "'";
}
// ─────────────────────────────────────────────────────────────────────
// Manual recovery helpers (apps.containers.up / .ps)
// ─────────────────────────────────────────────────────────────────────
/**
 * Execute `docker compose <args>` against a resource's rendered compose
 * directory by launching a one-shot `docker:cli` container on the
 * Coolify host.
 *
 * The compose dir is bind-mounted at `/work` (which is also the working
 * directory), the host's `/var/run/docker.sock` is mounted in, and the
 * container runs with `--network host`. Every element of `args` is
 * shell-quoted before it is appended to the command line.
 *
 * Used by `apps.containers.up` and `apps.containers.ps` for manual user
 * recovery. The *deployment* path (apps.create) does not go through
 * this helper: it lets Coolify's own queue deploy and then calls
 * `applyCoolifyPostDeployFixes`.
 *
 * @param kind - Resource kind; selects the compose root directory.
 * @param uuid - Resource UUID.
 * @param args - Arguments placed after `compose`.
 * @param opts - `timeoutMs` overrides the default of 600_000 ms.
 * @returns The result of `runOnCoolifyHost` for the assembled command
 *   (invoked with `maxBytes: 2_000_000`).
 */
async function composeRun(
  kind: ResourceKind,
  uuid: string,
  args: string[],
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const workMount = sq(`${composeDir(kind, uuid)}:/work`);
  const launcher =
    `docker run --rm -v ${workMount} -w /work ` +
    '-v /var/run/docker.sock:/var/run/docker.sock ' +
    '--network host docker:cli compose';
  const quotedArgs = args.map((arg) => sq(arg));
  const commandLine = [launcher, ...quotedArgs].join(' ');

  const timeoutMs = opts.timeoutMs ?? 600_000;
  return runOnCoolifyHost(commandLine, { timeoutMs, maxBytes: 2_000_000 });
}
/**
 * `docker compose up -d --remove-orphans` for the given resource —
 * exposed as `apps.containers.up` for manual user recovery.
 *
 * @param kind - Resource kind; selects the compose root directory.
 * @param uuid - Resource UUID.
 * @param opts - Optional `timeoutMs`, forwarded to `composeRun`.
 * @returns The SSH result of the compose invocation.
 */
export async function composeUp(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const upArgs = ['up', '-d', '--remove-orphans'];
  return composeRun(kind, uuid, upArgs, opts);
}
/**
 * `docker compose down` for the given resource — stops and removes the
 * containers; volumes are preserved.
 *
 * @param kind - Resource kind; selects the compose root directory.
 * @param uuid - Resource UUID.
 * @param opts - Optional `timeoutMs`, forwarded to `composeRun`.
 * @returns The SSH result of the compose invocation.
 */
export async function composeDown(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const downArgs = ['down'];
  return composeRun(kind, uuid, downArgs, opts);
}
/**
 * `docker compose ps -a --format table` for the given resource —
 * exposed as `apps.containers.ps`. Uses a fixed 30-second timeout.
 *
 * @param kind - Resource kind; selects the compose root directory.
 * @param uuid - Resource UUID.
 * @returns The SSH result of the compose invocation.
 */
export async function composePs(
  kind: ResourceKind,
  uuid: string,
): Promise<CoolifySshResult> {
  const psArgs = ['ps', '-a', '--format', 'table'];
  const psOptions = { timeoutMs: 30_000 };
  return composeRun(kind, uuid, psArgs, psOptions);
}
/**
 * Check whether the rendered compose dir for a resource contains a
 * `docker-compose.yml`.
 *
 * Resolves to `false` when the file (or the whole directory) is
 * missing, rather than surfacing an opaque ENOENT.
 *
 * The probe runs inside a one-shot `alpine` container with the compose
 * dir bind-mounted read-only. It deliberately uses `--mount` instead of
 * `-v`: with `-v`, Docker creates a missing host path as an empty
 * directory, so merely *checking* for the compose dir would create it.
 * `--mount type=bind` refuses to start when the source is absent; the
 * trailing `|| echo MISSING` turns that refusal (or any other
 * `docker run` failure) into the regular MISSING answer.
 *
 * @param kind - Resource kind; selects the compose root directory.
 * @param uuid - Resource UUID.
 * @returns `true` only when the probe's stdout ends with `OK`.
 */
export async function composeDirExists(
  kind: ResourceKind,
  uuid: string,
): Promise<boolean> {
  const dir = composeDir(kind, uuid);
  const mount = sq(`type=bind,source=${dir},target=/w,readonly`);
  const probe = 'test -f /w/docker-compose.yml && echo OK || echo MISSING';
  const cmd = `docker run --rm --mount ${mount} alpine sh -c ${sq(probe)} || echo MISSING`;
  const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 });
  return r.stdout.trim().endsWith('OK');
}
// ─────────────────────────────────────────────────────────────────────
// Post-deploy fixes (apps.create's reliability layer)
// ─────────────────────────────────────────────────────────────────────
/**
* Input for `applyCoolifyPostDeployFixes`.
*
* NOTE: within this file only `uuid` is read by that function; the
* remaining fields are accepted but not used here.
*/
export interface CoolifyPostDeployOptions {
/**
* Coolify service UUID. It is also passed as the network argument of
* `docker network connect` when attaching `coolify-proxy`.
*/
uuid: string;
/** Real custom FQDN, e.g. "crm.mark.vibnai.com" — must NOT include scheme. */
fqdn: string;
/** Compose service name of the user-facing app, e.g. "twenty". */
publicAppName: string;
/**
* HTTP port the public app listens on inside the container. Optional
* here — kept for back-compat and diagnostics; the actual port
* routing is wired by Coolify itself based on the URL passed to
* setServiceDomains, not by this helper.
*/
port?: number;
}
/**
* Outcome of `applyCoolifyPostDeployFixes`, broken down per step.
*/
export interface CoolifyPostDeployResult {
/** True only when every entry in `steps` reported `ok: true`. */
ok: boolean;
/** Per-step status; `detail` is a human-readable status or error text. */
steps: {
/** Step 1: attaching `coolify-proxy` to the resource's Docker network. */
proxyNetwork: { ok: boolean; detail: string };
/** Step 2: scheduling the background `docker restart coolify-proxy`. */
proxyRestart: { ok: boolean; detail: string };
};
}
/**
 * Apply the post-deploy fix to a freshly-deployed Coolify service so
 * the user-facing URL works on the very first hit.
 *
 * Sequencing:
 *   1. `docker network connect <uuid> coolify-proxy` so Traefik can
 *      reach the public container by its compose name. This is the
 *      ONE thing Coolify omits despite writing the
 *      `caddy_ingress_network=<uuid>` hint label.
 *   2. Background `docker restart coolify-proxy` (fired off via
 *      nohup) so Traefik re-discovers the newly-attached network. It
 *      is not restarted synchronously because coolify-proxy is the
 *      same gateway serving this very HTTP request.
 *
 * Safe to call multiple times: an "already exists" answer from step 1
 * (proxy already attached, whether by a previous call or by Coolify
 * itself) counts as success. Note that step 2 is NOT a no-op on
 * repeat calls — every invocation schedules another proxy restart.
 *
 * Step failures never reject the returned promise; they are reported
 * through the per-step `ok` / `detail` fields. Step 1 is only `ok`
 * when `docker network connect` exited 0 or reported that the
 * endpoint already exists — any other error (unknown network, missing
 * proxy container, daemon failure) is surfaced as a failure instead
 * of being swallowed.
 *
 * @param opts - Post-deploy options. Only `uuid` is read; `fqdn`,
 *   `publicAppName` and `port` are accepted for back-compat.
 * @returns Per-step outcome; `ok` is true only if every step is ok.
 */
export async function applyCoolifyPostDeployFixes(
  opts: CoolifyPostDeployOptions,
): Promise<CoolifyPostDeployResult> {
  const { uuid } = opts;
  // Docker's "endpoint with name coolify-proxy already exists in
  // network" error is the success-already-applied case.
  const alreadyAttachedPattern = /already exists/i;

  const result: CoolifyPostDeployResult = {
    ok: false,
    steps: {
      proxyNetwork: { ok: false, detail: '' },
      proxyRestart: { ok: false, detail: '' },
    },
  };

  // ── Step 1: attach coolify-proxy to project network
  try {
    // stderr is folded into stdout so the daemon's error text is
    // available for both the already-attached check and `detail`.
    // The exit status is intentionally NOT masked: it is what tells a
    // fresh attach apart from a genuine failure.
    const r = await runOnCoolifyHost(
      `docker network connect ${sq(uuid)} coolify-proxy 2>&1`,
      { timeoutMs: 10_000 },
    );
    const text = (r.stdout || r.stderr).trim();
    if (alreadyAttachedPattern.test(text)) {
      result.steps.proxyNetwork = { ok: true, detail: 'already attached' };
    } else if (r.code === 0) {
      result.steps.proxyNetwork = { ok: true, detail: text || 'attached' };
    } else {
      result.steps.proxyNetwork = {
        ok: false,
        detail:
          text.slice(-200) ||
          `docker network connect exited with code ${String(r.code)}`,
      };
    }
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    // Defensive: if the SSH helper reports a non-zero exit by throwing,
    // the already-attached case must still count as success.
    result.steps.proxyNetwork = alreadyAttachedPattern.test(message)
      ? { ok: true, detail: 'already attached' }
      : { ok: false, detail: message };
  }

  // ── Step 2: nudge Traefik to re-discover via proxy restart.
  //
  // CAUTION: coolify-proxy is the same gateway that's currently
  // serving this very HTTP request (the agent → vibnai.com call that
  // landed in this handler). If we run a synchronous `docker restart
  // coolify-proxy`, the connection is killed mid-flight and the agent
  // sees a curl error 16 (HTTP/2 framing) instead of our nicely
  // formatted result object. We fire-and-forget instead: the SSH
  // command returns almost immediately, we finish the HTTP response,
  // and the proxy restarts ~3s later — by which time the response has
  // already been delivered and Traefik will re-discover labels for
  // any subsequent request.
  try {
    const r = await runOnCoolifyHost(
      `nohup sh -c '(sleep 3 && docker restart coolify-proxy) >/tmp/coolify-proxy-restart.log 2>&1' </dev/null >/dev/null 2>&1 &`,
      { timeoutMs: 8_000 },
    );
    const scheduled = r.code === 0;
    result.steps.proxyRestart = {
      ok: scheduled,
      detail: scheduled
        ? 'scheduled (background, +3s after response)'
        : (r.stderr || r.stdout).trim().slice(-200),
    };
  } catch (e) {
    result.steps.proxyRestart = {
      ok: false,
      detail: e instanceof Error ? e.message : String(e),
    };
  }

  result.ok = Object.values(result.steps).every((step) => step.ok);
  return result;
}