235 lines
8.9 KiB
TypeScript
235 lines
8.9 KiB
TypeScript
/**
 * Surgical post-deploy fix for Coolify-managed Services.
 *
 * Why this exists
 * ---------------
 * Coolify's service-template deploy pipeline gets us 99% of the way
 * — IF apps.create passes the upstream port in the URL it gives to
 * `setServiceDomains` (e.g. `https://crm.mark.vibnai.com:3000`).
 * With that port suffix Coolify auto-generates everything that
 * matters: the loadbalancer.server.port Traefik label, the rewritten
 * SERVICE_FQDN_<APP> / SERVICE_URL_<APP> env vars (no sslip.io
 * leakage), and the correct routing rules.
 *
 * The one thing Coolify still misses is connecting `coolify-proxy`
 * to the resource's project Docker network. Coolify writes a
 * `caddy_ingress_network=<uuid>` hint label but never runs
 * `docker network connect`, so Traefik discovers the right routing
 * rules but cannot reach the upstream container — every request
 * returns Traefik 503.
 *
 * That's the entire purpose of this module: attach `coolify-proxy`
 * to the project network, then nudge Traefik to re-discover.
 *
 * History
 * -------
 * Versions 2.4.5 → 2.4.7 also rewrote `.env` and injected the
 * loadbalancer port label via an embedded Python script run inside a
 * `python:3-alpine` container. That code became unnecessary in 2.4.8
 * once we discovered the `:port` URL convention; it's been removed
 * along with the `python:alpine` SSH dependency.
 */
|
|
|
|
import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh';
|
|
|
|
/** The two kinds of Coolify resource that own a rendered compose dir. */
export type ResourceKind = 'service' | 'application';

/**
 * Build the path of the rendered compose dir for a resource.
 *
 * @param kind - Resource kind; picks the `services` or `applications` subtree.
 * @param uuid - Resource UUID, used verbatim as the directory name.
 * @returns The directory path under `/data/coolify`.
 */
function composeDir(kind: ResourceKind, uuid: string): string {
  if (kind === 'service') {
    return `/data/coolify/services/${uuid}`;
  }
  return `/data/coolify/applications/${uuid}`;
}
|
|
|
|
/**
 * Quote one argument for a POSIX shell by wrapping it in single quotes.
 *
 * A `'` cannot be escaped inside a single-quoted string, so every embedded
 * quote is emitted as `'\''`: close the quoted run, add a backslash-escaped
 * quote, then reopen.
 *
 * @param s - Raw argument text.
 * @returns The single-quoted form of `s`.
 */
function sq(s: string): string {
  const escaped = String(s).split("'").join("'\\''");
  return "'" + escaped + "'";
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────
// Manual recovery helpers (apps.containers.up / .ps)
// ─────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Execute `docker compose <args>` against a resource's rendered compose
 * dir, using a one-shot `docker:cli` container. Backs the manual-recovery
 * commands `apps.containers.up` and `apps.containers.ps`.
 *
 * The *deployment* path (apps.create) does not go through this helper:
 * it leaves the deploy to Coolify's own queue and afterwards applies the
 * post-deploy fixes via `applyCoolifyPostDeployFixes`.
 *
 * NOTE(review): the compose dir is mounted at `/work`, so unless the
 * rendered compose file or its `.env` pins a project name, compose would
 * presumably derive the project name from `work` — confirm.
 *
 * @param kind - Resource kind; selects the compose dir root.
 * @param uuid - Resource UUID.
 * @param args - Arguments placed after `compose`; each one is shell-quoted.
 * @param opts - `timeoutMs` overrides the 600 000 ms default.
 * @returns Whatever `runOnCoolifyHost` resolves with for the command.
 */
async function composeRun(
  kind: ResourceKind,
  uuid: string,
  args: string[],
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const timeoutMs = opts.timeoutMs ?? 600_000;

  // Compose dir is bind-mounted at /work, which is also the working dir.
  const workMount = sq(`${composeDir(kind, uuid)}:/work`);

  const dockerRun = [
    'docker', 'run', '--rm',
    '-v', workMount,
    '-w', '/work',
    // The host's Docker socket, so compose inside the container drives the host daemon.
    '-v', '/var/run/docker.sock:/var/run/docker.sock',
    '--network', 'host',
  ];
  const composeInvocation = ['docker:cli', 'compose', ...args.map((arg) => sq(arg))];

  const command = [...dockerRun, ...composeInvocation].join(' ');
  return runOnCoolifyHost(command, { timeoutMs, maxBytes: 2_000_000 });
}
|
|
|
|
/**
 * Run `docker compose up -d --remove-orphans` for the resource. Exposed as
 * `apps.containers.up` for manual user recovery.
 *
 * @param kind - Resource kind.
 * @param uuid - Resource UUID.
 * @param opts - Optional `timeoutMs`, passed through to `composeRun`.
 * @returns The result of the compose invocation.
 */
export async function composeUp(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const upArgs = ['up', '-d', '--remove-orphans'];
  return composeRun(kind, uuid, upArgs, opts);
}
|
|
|
|
/**
 * Run `docker compose down` for the resource: stops and removes its
 * containers. No volume-removal flag is passed, so volumes are preserved.
 *
 * @param kind - Resource kind.
 * @param uuid - Resource UUID.
 * @param opts - Optional `timeoutMs`, passed through to `composeRun`.
 * @returns The result of the compose invocation.
 */
export async function composeDown(
  kind: ResourceKind,
  uuid: string,
  opts: { timeoutMs?: number } = {},
): Promise<CoolifySshResult> {
  const downArgs = ['down'];
  return composeRun(kind, uuid, downArgs, opts);
}
|
|
|
|
/**
 * Run `docker compose ps -a --format table` for the resource. Exposed as
 * `apps.containers.ps`. Uses a fixed 30 s timeout.
 *
 * @param kind - Resource kind.
 * @param uuid - Resource UUID.
 * @returns The result of the compose invocation.
 */
export async function composePs(
  kind: ResourceKind,
  uuid: string,
): Promise<CoolifySshResult> {
  const psArgs = ['ps', '-a', '--format', 'table'];
  const psOptions = { timeoutMs: 30_000 };
  return composeRun(kind, uuid, psArgs, psOptions);
}
|
|
|
|
/**
 * Report whether the resource's rendered compose dir contains a
 * `docker-compose.yml`, as a plain boolean instead of an opaque ENOENT.
 *
 * The probe runs `test -f` inside a throwaway `alpine` container with the
 * compose dir bind-mounted read-only at `/w`.
 *
 * The mount deliberately uses `--mount type=bind` rather than `-v`: with
 * `-v`, Docker creates a missing host directory, so a pure existence check
 * would leave an empty directory behind for an unknown uuid. With
 * `--mount`, a missing source makes `docker run` fail instead; the trailing
 * `|| echo MISSING` maps that failure onto the same `MISSING` answer.
 *
 * NOTE(review): `--mount` takes comma-separated fields, so this assumes the
 * uuid contains no commas — confirm against how uuids are generated.
 *
 * @param kind - Resource kind; selects the compose dir root.
 * @param uuid - Resource UUID.
 * @returns `true` only when the probe's stdout ends with `OK`.
 */
export async function composeDirExists(
  kind: ResourceKind,
  uuid: string,
): Promise<boolean> {
  const dir = composeDir(kind, uuid);
  const mount = sq(`type=bind,source=${dir},target=/w,readonly`);
  const cmd =
    `docker run --rm --mount ${mount} alpine sh -c 'test -f /w/docker-compose.yml && echo OK || echo MISSING'` +
    ' || echo MISSING';
  const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 });
  return r.stdout.trim().endsWith('OK');
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────
// Post-deploy fixes (apps.create's reliability layer)
// ─────────────────────────────────────────────────────────────────────
|
|
|
|
/** Inputs accepted by `applyCoolifyPostDeployFixes`. */
export interface CoolifyPostDeployOptions {
  /** UUID of the Coolify service. */
  uuid: string;

  /** Real custom hostname without a scheme, e.g. "crm.mark.vibnai.com". */
  fqdn: string;

  /** Name of the user-facing app's compose service, e.g. "twenty". */
  publicAppName: string;

  /**
   * HTTP port the public app listens on inside the container.
   *
   * Optional: retained for back-compat and diagnostics only. The actual
   * port routing is wired by Coolify itself from the URL passed to
   * setServiceDomains, not by this helper.
   */
  port?: number;
}
|
|
|
|
/** Outcome reported by `applyCoolifyPostDeployFixes`. */
export interface CoolifyPostDeployResult {
  /** `true` only when every entry in `steps` has `ok: true`. */
  ok: boolean;

  /** Per-step outcome; `detail` is a short status or error message. */
  steps: {
    /** Attaching `coolify-proxy` to the project network. */
    proxyNetwork: { ok: boolean; detail: string };
    /** Scheduling the background `coolify-proxy` restart. */
    proxyRestart: { ok: boolean; detail: string };
  };
}
|
|
|
|
/**
 * Apply the post-deploy fix to a freshly-deployed Coolify service so
 * the user-facing URL works on the very first hit.
 *
 * Safe to call multiple times: an existing attachment is detected and
 * reported as success. Note that step 2 schedules a proxy restart on
 * every call, including when the network was already attached.
 *
 * Sequencing:
 *   1. `docker network connect <uuid> coolify-proxy` so Traefik can
 *      reach the public container by its compose name. This is the
 *      ONE thing Coolify omits despite writing the
 *      `caddy_ingress_network=<uuid>` hint label.
 *   2. Background `docker restart coolify-proxy` (fired off via
 *      nohup) so Traefik re-discovers the newly-attached network. We
 *      can't restart it synchronously because coolify-proxy is the
 *      same gateway serving this very HTTP request — see the comment
 *      on step 2 in the body for the gory detail.
 *
 * Only `opts.uuid` is read by this function.
 *
 * @param opts - Post-deploy options; `uuid` names the project network.
 * @returns Per-step outcomes. `ok` is true only when every step is ok.
 *   Errors thrown by `runOnCoolifyHost` are captured into the step's
 *   `detail` rather than propagated.
 */
export async function applyCoolifyPostDeployFixes(
  opts: CoolifyPostDeployOptions,
): Promise<CoolifyPostDeployResult> {
  const { uuid } = opts;

  const result: CoolifyPostDeployResult = {
    ok: false,
    steps: {
      proxyNetwork: { ok: false, detail: '' },
      proxyRestart: { ok: false, detail: '' },
    },
  };

  // ── Step 1: attach coolify-proxy to project network
  try {
    // `|| true` keeps the exit status at 0 so the "endpoint with name
    // coolify-proxy already exists in network" case is not an error, and
    // `2>&1` folds docker's message into the captured output. Because the
    // exit status no longer says anything, success is marked explicitly:
    // `&& echo ATTACHED` only runs when `docker network connect` succeeded.
    const r = await runOnCoolifyHost(
      `docker network connect ${sq(uuid)} coolify-proxy 2>&1 && echo ATTACHED || true`,
      { timeoutMs: 10_000 },
    );
    const text = (r.stdout || r.stderr).trim();
    if (/already exists/i.test(text)) {
      // Success-already-applied case.
      result.steps.proxyNetwork = { ok: true, detail: 'already attached' };
    } else if (text.endsWith('ATTACHED')) {
      result.steps.proxyNetwork = { ok: true, detail: 'attached' };
    } else {
      // Anything else (e.g. unknown network or container) is a real
      // failure and must not be reported as ok.
      result.steps.proxyNetwork = {
        ok: false,
        detail: text.slice(-200) || 'docker network connect failed with no output',
      };
    }
  } catch (e) {
    result.steps.proxyNetwork = {
      ok: false,
      detail: e instanceof Error ? e.message : String(e),
    };
  }

  // ── Step 2: nudge Traefik to re-discover via proxy restart.
  //
  // CAUTION: coolify-proxy is the same gateway that's currently
  // serving this very HTTP request (the agent → vibnai.com call that
  // landed in this handler). If we run a synchronous `docker restart
  // coolify-proxy`, the connection is killed mid-flight and the agent
  // sees a curl error 16 (HTTP/2 framing) instead of our nicely
  // formatted result object. We fire-and-forget instead: the SSH
  // command returns within ~50ms, we finish the HTTP response, and
  // the proxy restarts ~3s later — by which time the response has
  // already been delivered and Traefik will re-discover labels for
  // any subsequent request.
  try {
    const r = await runOnCoolifyHost(
      `nohup sh -c '(sleep 3 && docker restart coolify-proxy) >/tmp/coolify-proxy-restart.log 2>&1' </dev/null >/dev/null 2>&1 &`,
      { timeoutMs: 8_000 },
    );
    result.steps.proxyRestart = {
      ok: r.code === 0,
      detail: r.code === 0
        ? 'scheduled (background, +3s after response)'
        : (r.stderr || r.stdout).trim().slice(-200),
    };
  } catch (e) {
    result.steps.proxyRestart = {
      ok: false,
      detail: e instanceof Error ? e.message : String(e),
    };
  }

  result.ok = Object.values(result.steps).every((s) => s.ok);
  return result;
}
|