fix(mcp v2.4.5): post-deploy fixes replace SSH compose-up fallback
apps.create for service templates now lets Coolify's queue do the
full deploy (compose generation, volumes, internal networking,
healthchecks) and applies three surgical post-deploy fixes that
Coolify's REST API does NOT expose:
1. Rewrites SERVICE_FQDN_* / SERVICE_URL_* in the rendered .env so
frontends that bake their backend URL into the SPA bundle
(Twenty's SERVER_URL, n8n, etc.) point at the real custom domain
instead of the auto-generated sslip.io URL. Without this fix
Twenty's frontend loads on the real HTTPS domain but fires XHRs
at insecure sslip.io, blocking everything as Mixed Content.
2. Injects the missing
traefik.http.services.<svc>.loadbalancer.server.port label.
Coolify generates the routing rules but forgets the port, so
Traefik logs "error: port is missing" and returns 503 forever.
3. Connects coolify-proxy to the project network (Coolify writes a
caddy_ingress_network=<uuid> hint label but never actually runs
docker network connect), then force-recreates ONLY the
public-facing container so the new env+label apply, and
restarts the proxy so Traefik re-discovers the network and labels.
Polling switches from service.status (which routinely lies as
"starting:unknown" while containers are actually healthy) to the
truthful per-application service.applications[*].status field.
Removes the SSH "docker compose up -d" fallback that v2.4.1-2.4.4
used. That fallback bypassed Coolify's full pipeline, causing
internal services like Postgres/Redis to land on the shared coolify
network where DNS aliases collided with coolify-db/coolify-redis,
producing the "password authentication failed" regression we saw
on Twenty deploys. With v2.4.5 internal services stay on their
isolated project network — only the public app crosses to the
proxy.
Response shape gains: reachable (boolean for HTTPS 2xx/3xx),
appStatus (truthful per-app status from Coolify), postDeploy
(step-by-step diagnostic for each of the three fixes). Existing
started/startDiag fields kept for back-compat.
apps.containers.up / apps.containers.ps remain unchanged for
manual user recovery.
Made-with: Cursor
This commit is contained in:
@@ -32,7 +32,8 @@ import { isCoolifySshConfigured, runOnCoolifyHost } from '@/lib/coolify-ssh';
|
||||
import {
|
||||
composeUp,
|
||||
composePs,
|
||||
attachToCoolifyProxyNetwork,
|
||||
applyCoolifyPostDeployFixes,
|
||||
type CoolifyPostDeployResult,
|
||||
type ResourceKind,
|
||||
} from '@/lib/coolify-compose';
|
||||
import { listContainersForApp } from '@/lib/coolify-containers';
|
||||
@@ -91,7 +92,7 @@ const GITEA_API_URL = process.env.GITEA_API_URL ?? 'https://git.vibnai.com';
|
||||
export async function GET() {
|
||||
return NextResponse.json({
|
||||
name: 'vibn-mcp',
|
||||
version: '2.4.4',
|
||||
version: '2.4.5',
|
||||
authentication: {
|
||||
scheme: 'Bearer',
|
||||
tokenPrefix: 'vibn_sk_',
|
||||
@@ -879,10 +880,16 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
|
||||
}
|
||||
|
||||
let started = false;
|
||||
let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed';
|
||||
let reachable = false;
|
||||
let appStatus = 'unknown';
|
||||
let postDeploy: CoolifyPostDeployResult | null = null;
|
||||
let startDiag = '';
|
||||
if (params.instantDeploy !== false) {
|
||||
({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid));
|
||||
({ started, reachable, appStatus, postDeploy, diag: startDiag } = await ensureServiceReachable({
|
||||
uuid: created.uuid,
|
||||
fqdn,
|
||||
publicAppName: templateSlug,
|
||||
}));
|
||||
}
|
||||
|
||||
return NextResponse.json({
|
||||
@@ -895,11 +902,15 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
|
||||
template: templateSlug,
|
||||
urlsApplied,
|
||||
started,
|
||||
startMethod,
|
||||
reachable,
|
||||
appStatus,
|
||||
...(postDeploy ? { postDeploy } : {}),
|
||||
...(startDiag ? { startDiag } : {}),
|
||||
note: started
|
||||
? 'Primary containers are up. First boot may take 1-5 min while images finish pulling and migrations run; use apps.logs to monitor. If startDiag mentions a sidecar dependency timeout (workers, schedulers), call apps.containers.up again once the primary is healthy to bring those up.'
|
||||
: 'Service created but no containers started. Call apps.containers.up to retry; check apps.containers.ps and apps.logs to diagnose.',
|
||||
note: reachable
|
||||
? `Reachable on https://${fqdn}. First boot may continue migrations in the background — check apps.logs if any feature seems missing.`
|
||||
: started
|
||||
? `Containers are healthy but https://${fqdn} did not return 2xx/3xx yet. Wait 30-60s for Traefik to fully discover labels, then retry. If still failing, inspect postDeploy.steps for which fix didn't apply, then call apps.logs and apps.containers.ps.`
|
||||
: `Public app did not become healthy. Use apps.containers.ps and apps.logs to diagnose. Most common cause: image pull is still in progress (first deploy can take 5-10 min for large images like twentycrm/twenty).`,
|
||||
},
|
||||
});
|
||||
}
|
||||
@@ -956,11 +967,23 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
|
||||
}
|
||||
}
|
||||
|
||||
// composeRaw is user-supplied — we can't reliably guess the public
|
||||
// app name (the user may have any compose service layout). Best
|
||||
// effort: use the app name as the public app name, which works for
|
||||
// single-container composes.
|
||||
let started = false;
|
||||
let startMethod: 'coolify-queue' | 'compose-up' | 'failed' = 'failed';
|
||||
let reachable = false;
|
||||
let appStatus = 'unknown';
|
||||
let postDeploy: CoolifyPostDeployResult | null = null;
|
||||
let startDiag = '';
|
||||
if (params.instantDeploy !== false) {
|
||||
({ started, startMethod, diag: startDiag } = await ensureServiceUp(created.uuid));
|
||||
const publicAppName = String(params.publicAppName ?? appName);
|
||||
({ started, reachable, appStatus, postDeploy, diag: startDiag } = await ensureServiceReachable({
|
||||
uuid: created.uuid,
|
||||
fqdn,
|
||||
publicAppName,
|
||||
port: params.port ? Number(params.port) : undefined,
|
||||
}));
|
||||
}
|
||||
|
||||
return NextResponse.json({
|
||||
@@ -971,9 +994,13 @@ async function toolAppsCreate(principal: Principal, params: Record<string, any>)
|
||||
url: `https://${fqdn}`,
|
||||
resourceType: 'service',
|
||||
started,
|
||||
startMethod,
|
||||
reachable,
|
||||
appStatus,
|
||||
...(postDeploy ? { postDeploy } : {}),
|
||||
...(startDiag ? { startDiag } : {}),
|
||||
note: 'Domain routing for compose services must be configured after initial startup — set SERVER_URL env to the desired URL, then call apps.containers.up to apply.',
|
||||
note: reachable
|
||||
? `Reachable on https://${fqdn}.`
|
||||
: `Domain routing for custom compose services depends on knowing which docker-compose service is the public-facing one. Pass publicAppName=<service> and port=<port> on apps.create to enable post-deploy patching, or set them manually.`,
|
||||
},
|
||||
});
|
||||
}
|
||||
@@ -1223,106 +1250,138 @@ async function toolAppsTemplatesSearch(params: Record<string, any>) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure a Coolify Service is actually running (containers exist and
|
||||
* are healthy/starting), with a fallback path for Coolify's flaky
|
||||
* queued-start worker.
|
||||
* Bring a Coolify Service to a publicly-reachable state.
|
||||
*
|
||||
* Strategy:
|
||||
* 1. Call POST /services/{uuid}/start so Coolify's records show
|
||||
* "starting" and any internal hooks fire.
|
||||
* 2. Wait briefly, then probe the host for any container belonging
|
||||
* to this service via `docker ps --filter name={uuid}`.
|
||||
* 3. If no containers materialised, run `docker compose up -d`
|
||||
* directly via SSH against the rendered compose dir. This is
|
||||
* the same command Coolify's worker would run; we just bypass
|
||||
* the unreliable queue.
|
||||
* v2.4.5 architecture
|
||||
* --------------------
|
||||
* Earlier versions ran `docker compose up -d` over SSH as a fallback
|
||||
* when Coolify's queue stalled. That worked for "containers running"
|
||||
* but caused two cascading bugs because it bypassed Coolify's full
|
||||
* deploy pipeline:
|
||||
* - Internal services (Postgres, Redis) ended up on the shared
|
||||
* `coolify` Docker network, where DNS aliases for `postgres`/
|
||||
* `redis` collide with Coolify's own `coolify-db`/`coolify-redis`
|
||||
* containers — Twenty's `postgres://postgres:5432/twenty-db`
|
||||
* resolves to the wrong DB and fails auth.
|
||||
* - The proxy-network attach we did in our SSH path attached EVERY
|
||||
* container, magnifying the same DNS collision.
|
||||
*
|
||||
* Returns:
|
||||
* started true if at least one container is running for this service
|
||||
* startMethod which path got us there
|
||||
* diag human-readable note for failures (truncated stderr)
|
||||
* The right model is: let Coolify's queue do the heavy lifting (it
|
||||
* handles compose generation, volumes, internal networking, env-var
|
||||
* substitution, healthchecks, etc.) and patch the three things its
|
||||
* REST API does NOT expose:
|
||||
* 1. SERVICE_FQDN_* / SERVICE_URL_* env vars in the rendered .env
|
||||
* 2. The missing traefik loadbalancer.server.port label
|
||||
* 3. coolify-proxy → project network attachment + Traefik nudge
|
||||
*
|
||||
* Steps:
|
||||
* 1. POST /services/{uuid}/start — Coolify's queue does its thing.
|
||||
* 2. Poll service.applications[*].status (the per-application
|
||||
* status is truthful; service.status is not). Wait until the
|
||||
* public app reports running:healthy or we time out.
|
||||
* 3. apply post-deploy fixes: rewrite .env, inject port label,
|
||||
* attach proxy to project net, recreate ONLY the public app,
|
||||
* restart proxy so Traefik re-discovers.
|
||||
* 4. (Optional) probe https://<fqdn> for any 2xx/3xx response to confirm
|
||||
* end-to-end reachability.
|
||||
*/
|
||||
async function ensureServiceUp(uuid: string): Promise<{
|
||||
async function ensureServiceReachable(opts: {
|
||||
uuid: string;
|
||||
fqdn: string;
|
||||
publicAppName: string;
|
||||
port?: number;
|
||||
/** Max wall-clock time to wait for Coolify to bring containers healthy. */
|
||||
healthTimeoutMs?: number;
|
||||
}): Promise<{
|
||||
started: boolean;
|
||||
startMethod: 'coolify-queue' | 'compose-up' | 'failed';
|
||||
reachable: boolean;
|
||||
appStatus: string;
|
||||
postDeploy: CoolifyPostDeployResult | null;
|
||||
diag: string;
|
||||
}> {
|
||||
// 1. Ask Coolify nicely
|
||||
const { uuid, fqdn, publicAppName, port, healthTimeoutMs = 8 * 60_000 } = opts;
|
||||
|
||||
try {
|
||||
await startService(uuid);
|
||||
} catch (e) {
|
||||
console.warn('[ensureServiceUp] startService failed (will fall back)', e);
|
||||
console.warn('[ensureServiceReachable] startService failed', e);
|
||||
}
|
||||
|
||||
// 2. Probe — has the queue actually started anything?
|
||||
if (!isCoolifySshConfigured()) {
|
||||
return { started: true, startMethod: 'coolify-queue', diag: '' };
|
||||
}
|
||||
// Allow up to ~12s for the worker to wake up; checking every 3s.
|
||||
for (let i = 0; i < 4; i++) {
|
||||
await new Promise(r => setTimeout(r, 3_000));
|
||||
// Poll service.applications[*].status until the public app is
|
||||
// running:healthy. This field is truthful, unlike service.status
|
||||
// which routinely lies as "starting:unknown" while containers are
|
||||
// actually healthy.
|
||||
const startedAt = Date.now();
|
||||
let appStatus = 'unknown';
|
||||
while (Date.now() - startedAt < healthTimeoutMs) {
|
||||
try {
|
||||
const probe = await runOnCoolifyHost(
|
||||
`docker ps --filter name=${uuid} --format '{{.Names}}'`,
|
||||
{ timeoutMs: 8_000 },
|
||||
);
|
||||
if (probe.stdout.trim().length > 0) {
|
||||
// Coolify started the stack. Even on this happy path we still
|
||||
// need to ensure the proxy-network attachment ran, since
|
||||
// Coolify only attaches at the end of its full deploy
|
||||
// pipeline (which can be skipped if a sidecar fails to come
|
||||
// up). Idempotent — already-attached containers are no-ops.
|
||||
await attachToCoolifyProxyNetwork(uuid).catch(() => { /* swallow */ });
|
||||
return { started: true, startMethod: 'coolify-queue', diag: '' };
|
||||
const svc = (await getService(uuid)) as unknown as {
|
||||
applications?: Array<{ name?: string; status?: string }>;
|
||||
};
|
||||
const apps = svc.applications ?? [];
|
||||
const target = apps.find(a => a.name === publicAppName) ?? apps[0];
|
||||
appStatus = target?.status ?? 'unknown';
|
||||
if (/^running:healthy/i.test(appStatus)) break;
|
||||
// Failure modes Coolify reports as terminal: exited (compose
|
||||
// never ran), restarting (boot loop). We don't want to wait
|
||||
// the full timeout in those cases.
|
||||
if (/^exited/i.test(appStatus) && Date.now() - startedAt > 90_000) {
|
||||
// Give it 90s to transition out of "exited" before declaring failure
|
||||
break;
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('[ensureServiceUp] probe failed', e);
|
||||
console.warn('[ensureServiceReachable] status probe failed', e);
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 8_000));
|
||||
}
|
||||
|
||||
const started = /^running/i.test(appStatus);
|
||||
if (!started) {
|
||||
return {
|
||||
started: false,
|
||||
reachable: false,
|
||||
appStatus,
|
||||
postDeploy: null,
|
||||
diag: `Public app "${publicAppName}" did not become healthy within ${Math.round(healthTimeoutMs/1000)}s (status=${appStatus}). Use apps.containers.ps and apps.logs to diagnose.`,
|
||||
};
|
||||
}
|
||||
|
||||
// Apply post-deploy fixes. Only meaningful when SSH is configured —
|
||||
// without it we can't rewrite the .env or attach proxy networks.
|
||||
let postDeploy: CoolifyPostDeployResult | null = null;
|
||||
if (isCoolifySshConfigured()) {
|
||||
try {
|
||||
postDeploy = await applyCoolifyPostDeployFixes({ uuid, fqdn, publicAppName, port });
|
||||
} catch (e) {
|
||||
console.warn('[ensureServiceReachable] post-deploy fix failed', e);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Fallback — run docker compose up -d ourselves
|
||||
let composeDiag = '';
|
||||
// Best-effort reachability probe. Public DNS for the workspace
|
||||
// wildcard may not have propagated yet (esp. on first deploy in a
|
||||
// brand-new workspace), so a non-200 here doesn't mean failure —
|
||||
// it just means "agents should retry the URL in a few seconds".
|
||||
let reachable = false;
|
||||
let probeDiag = '';
|
||||
try {
|
||||
const r = await composeUp('service', uuid, { timeoutMs: 600_000 });
|
||||
// Strip ANSI / control chars (compose progress output uses \r and
|
||||
// ANSI escapes) so the diag survives JSON serialization cleanly.
|
||||
composeDiag = (r.stderr || r.stdout)
|
||||
.replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '')
|
||||
.replace(/[\x00-\x08\x0B-\x1F]/g, '')
|
||||
.trim()
|
||||
.slice(-400);
|
||||
if (r.code === 0) {
|
||||
return { started: true, startMethod: 'compose-up', diag: '' };
|
||||
}
|
||||
// Non-zero exit DOES NOT mean nothing started. Compose returns
|
||||
// non-zero whenever any service hits a `depends_on:
|
||||
// condition: service_healthy` timeout — common for sidecar
|
||||
// containers (workers, schedulers) of apps with slow-booting
|
||||
// primary services (Twenty's worker waits on twenty's healthcheck,
|
||||
// which takes 2-5 min). Probe the host to see what's actually
|
||||
// running before declaring failure.
|
||||
const url = `https://${fqdn}`;
|
||||
const ctrl = new AbortController();
|
||||
const t = setTimeout(() => ctrl.abort(), 12_000);
|
||||
const res = await fetch(url, { signal: ctrl.signal, redirect: 'manual' });
|
||||
clearTimeout(t);
|
||||
reachable = res.status >= 200 && res.status < 400;
|
||||
probeDiag = `GET ${url} → ${res.status}`;
|
||||
} catch (e) {
|
||||
composeDiag = e instanceof Error ? e.message : String(e);
|
||||
probeDiag = `GET probe failed: ${e instanceof Error ? e.message : String(e)}`;
|
||||
}
|
||||
|
||||
try {
|
||||
const probe = await runOnCoolifyHost(
|
||||
`docker ps --filter name=${uuid} --format '{{.Names}}'`,
|
||||
{ timeoutMs: 8_000 },
|
||||
);
|
||||
if (probe.stdout.trim().length > 0) {
|
||||
// Something IS running — partial success. Surface the diag so
|
||||
// agents see WHY compose returned non-zero (usually a sidecar
|
||||
// depends_on timeout) but report started=true so happy-path
|
||||
// workflows don't panic. composeUp already attached the proxy
|
||||
// network, but call once more to cover any container that came
|
||||
// up after the initial attach pass.
|
||||
await attachToCoolifyProxyNetwork(uuid).catch(() => { /* swallow */ });
|
||||
return { started: true, startMethod: 'compose-up', diag: composeDiag };
|
||||
}
|
||||
} catch { /* fall through */ }
|
||||
|
||||
return { started: false, startMethod: 'failed', diag: composeDiag };
|
||||
return {
|
||||
started: true,
|
||||
reachable,
|
||||
appStatus,
|
||||
postDeploy,
|
||||
diag: probeDiag,
|
||||
};
|
||||
}
|
||||
|
||||
/** Resolve fqdn from params.domain or auto-generate. Returns NextResponse on policy error. */
|
||||
|
||||
@@ -1,31 +1,45 @@
|
||||
/**
|
||||
* Bring a Coolify Service or compose Application up via raw
|
||||
* `docker compose up -d`.
|
||||
* Surgical post-deploy fixes for Coolify-managed Services.
|
||||
*
|
||||
* Why this exists
|
||||
* ---------------
|
||||
* Coolify's `POST /services/{uuid}/start` and `POST /deploy` endpoints
|
||||
* write the rendered docker-compose.yml + .env to
|
||||
* `/data/coolify/services/{uuid}/` (or `applications/{uuid}/` for
|
||||
* compose apps), then enqueue a Laravel job to run
|
||||
* `docker compose up -d`. In practice that worker queue is unreliable:
|
||||
* it routinely returns "Service starting request queued" and then
|
||||
* never actually invokes docker compose. The user's stack just sits
|
||||
* there with rendered files and no containers.
|
||||
* Coolify's service-template deploy pipeline gets us 90% of the way:
|
||||
* it generates a docker-compose.yml + .env, runs `docker compose up`,
|
||||
* sets up volumes, and writes Traefik labels. But for many templates
|
||||
* (including the popular twenty/n8n/ghost/etc.) it consistently fails
|
||||
* to do three host-level things that the public REST API does NOT
|
||||
* expose:
|
||||
*
|
||||
* For a hands-off SaaS we can't ship that experience. This helper
|
||||
* does the work directly via SSH, so a single MCP `apps.create` call
|
||||
* really does leave a running app.
|
||||
* 1. Rewrite the auto-generated `SERVICE_FQDN_*` / `SERVICE_URL_*`
|
||||
* env vars from sslip.io defaults to the user's real FQDN. The
|
||||
* user's domain is correctly stored on `service.applications[].fqdn`
|
||||
* (so Traefik routing rules use it), but the env vars that the
|
||||
* app embeds into its frontend bundle (e.g. Twenty's SERVER_URL)
|
||||
* keep pointing at sslip.io. Result: SPA loads on real HTTPS
|
||||
* then makes XHRs to insecure sslip.io URLs → "Mixed Content"
|
||||
* errors and the app appears broken.
|
||||
*
|
||||
* 2. Generate the `traefik.http.services.<svc>.loadbalancer.server.port`
|
||||
* label. Without it Traefik logs `error: port is missing` and
|
||||
* returns 503 on every request.
|
||||
*
|
||||
* 3. Connect `coolify-proxy` to the resource's project network.
|
||||
* Coolify generates a label `caddy_ingress_network=<uuid>`
|
||||
* hinting that the proxy SHOULD live there, but never actually
|
||||
* runs `docker network connect`. Result: even if Traefik
|
||||
* discovers the right routing rules, it can't reach the upstream
|
||||
* container.
|
||||
*
|
||||
* This module fixes all three after Coolify's queue finishes its work.
|
||||
*
|
||||
* Permissions model
|
||||
* -----------------
|
||||
* The `vibn-logs` SSH user (created by deploy/setup-coolify-ssh.sh)
|
||||
* is in the `docker` group but has no shell sudo. It also can't read
|
||||
* `/data/coolify/services/` directly because Coolify chmods that to
|
||||
* 700 root. We work around both constraints by running the docker
|
||||
* CLI inside a one-shot container that bind-mounts the path. The
|
||||
* docker daemon runs as root so it can read the directory; the
|
||||
* `vibn-logs` user only needs `docker` socket access.
|
||||
* The `vibn-logs` SSH user has docker-group membership but no shell
|
||||
* sudo and no read access to `/data/coolify/services/<uuid>/` (Coolify
|
||||
* chmods that to 0700 root). We work around both by running a one-shot
|
||||
* `python:alpine` container that bind-mounts the path. The docker
|
||||
* daemon runs as root so it can read the directory; vibn-logs only
|
||||
* needs the docker socket.
|
||||
*/
|
||||
|
||||
import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh';
|
||||
@@ -34,7 +48,6 @@ import { runOnCoolifyHost, type CoolifySshResult } from './coolify-ssh';
|
||||
export type ResourceKind = 'service' | 'application';
|
||||
|
||||
function composeDir(kind: ResourceKind, uuid: string): string {
|
||||
// Coolify v4 path layout — these are stable across the v4 line.
|
||||
return kind === 'service'
|
||||
? `/data/coolify/services/${uuid}`
|
||||
: `/data/coolify/applications/${uuid}`;
|
||||
@@ -45,15 +58,18 @@ function sq(s: string): string {
|
||||
return `'${String(s).replace(/'/g, `'\\''`)}'`;
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// Manual recovery helpers (apps.containers.up / .ps)
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Run a `docker compose` subcommand inside the rendered compose
|
||||
* directory using a one-shot `docker:cli` container. Falls back to
|
||||
* pulling the image on the first call.
|
||||
* Run a `docker compose` subcommand inside the rendered compose dir
|
||||
* via a one-shot `docker:cli` container. Used by `apps.containers.up`
|
||||
* and `apps.containers.ps` for manual user recovery.
|
||||
*
|
||||
* The `docker:cli` image (~50MB) is the official Docker CLI without
|
||||
* the daemon. By bind-mounting the host docker socket it talks to
|
||||
* the host's daemon, so containers it creates are first-class
|
||||
* children of the same Docker engine — exactly what we want.
|
||||
* Note: the *deployment* path (apps.create) no longer uses this
|
||||
* helper. apps.create lets Coolify's own queue do the deploy, then
|
||||
* applies the post-deploy fixes via `applyCoolifyPostDeployFixes`.
|
||||
*/
|
||||
async function composeRun(
|
||||
kind: ResourceKind,
|
||||
@@ -62,8 +78,6 @@ async function composeRun(
|
||||
opts: { timeoutMs?: number } = {},
|
||||
): Promise<CoolifySshResult> {
|
||||
const dir = composeDir(kind, uuid);
|
||||
// Use --workdir + bind-mount so docker compose finds compose.yml + .env
|
||||
// automatically. The `--rm` cleans the helper container after each call.
|
||||
const cmd = [
|
||||
'docker', 'run', '--rm',
|
||||
'-v', sq(`${dir}:/work`),
|
||||
@@ -76,72 +90,13 @@ async function composeRun(
|
||||
return runOnCoolifyHost(cmd, { timeoutMs: opts.timeoutMs ?? 600_000, maxBytes: 2_000_000 });
|
||||
}
|
||||
|
||||
/**
|
||||
* `docker compose up -d` for a Coolify service or compose app.
|
||||
*
|
||||
* Idempotent — Compose already-running containers are no-op'd.
|
||||
* Returns the raw SSH result so callers can surface diagnostics on
|
||||
* failure (most common: image-pull errors, port conflicts).
|
||||
*
|
||||
* After compose succeeds we also attach every stack container to the
|
||||
* `coolify` proxy network. Coolify's UI-driven deploy does this as a
|
||||
* post-step so Traefik can route public traffic to the container, but
|
||||
* the rendered compose file only declares the service-private network.
|
||||
* If we skip this step the stack runs fine on its own bridge but
|
||||
* `crm.mark.vibnai.com` returns "no available server" from Traefik.
|
||||
*/
|
||||
/** `docker compose up -d` — exposed as `apps.containers.up` for manual user recovery. */
|
||||
export async function composeUp(
|
||||
kind: ResourceKind,
|
||||
uuid: string,
|
||||
opts: { timeoutMs?: number } = {},
|
||||
): Promise<CoolifySshResult> {
|
||||
const r = await composeRun(kind, uuid, ['up', '-d', '--remove-orphans'], opts);
|
||||
// Best-effort: attach to the proxy network even if compose returned
|
||||
// non-zero (sidecar `depends_on` timeouts still leave primary
|
||||
// containers running, and we want them reachable).
|
||||
await attachToCoolifyProxyNetwork(uuid).catch(() => { /* swallow */ });
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attach the public-facing containers of a Coolify resource to the
|
||||
* `coolify` proxy network so Traefik can reach them.
|
||||
*
|
||||
* IMPORTANT: only attach containers that have Traefik labels. The
|
||||
* coolify network is shared across the whole platform (it hosts
|
||||
* coolify-db, coolify-redis, etc.) and Docker's embedded DNS resolves
|
||||
* unqualified hostnames like `postgres` and `redis` to the FIRST
|
||||
* container with that name on the network. If we attach Twenty's
|
||||
* `postgres-<uuid>` container to coolify, Twenty's
|
||||
* `postgres://postgres:5432/...` connection string starts resolving
|
||||
* to `coolify-db` instead, which fails auth (different password).
|
||||
*
|
||||
* Coolify's own deploy pipeline does the same selective attach — only
|
||||
* the proxied container goes on the proxy network. Idempotent —
|
||||
* already-attached containers are no-ops.
|
||||
*/
|
||||
export async function attachToCoolifyProxyNetwork(
|
||||
uuid: string,
|
||||
): Promise<void> {
|
||||
// List running containers on the resource's project network with
|
||||
// their `traefik.enable` label. Only those with `traefik.enable=true`
|
||||
// need to be reachable by the proxy.
|
||||
const ls = await runOnCoolifyHost(
|
||||
`docker ps --filter network=${uuid} --format '{{.Names}}|{{.Label "traefik.enable"}}'`,
|
||||
{ timeoutMs: 10_000 },
|
||||
);
|
||||
const names = ls.stdout
|
||||
.split('\n')
|
||||
.map(s => s.trim())
|
||||
.filter(Boolean)
|
||||
.filter(line => line.endsWith('|true'))
|
||||
.map(line => line.split('|')[0]);
|
||||
if (names.length === 0) return;
|
||||
// Attach each one. `|| true` so already-connected returns 0.
|
||||
const attaches = names.map(n =>
|
||||
`docker network connect coolify ${sq(n)} 2>/dev/null || true`,
|
||||
).join(' && ');
|
||||
await runOnCoolifyHost(attaches, { timeoutMs: 30_000 });
|
||||
return composeRun(kind, uuid, ['up', '-d', '--remove-orphans'], opts);
|
||||
}
|
||||
|
||||
/** `docker compose down` — stops + removes containers; volumes preserved. */
|
||||
@@ -153,7 +108,7 @@ export async function composeDown(
|
||||
return composeRun(kind, uuid, ['down'], opts);
|
||||
}
|
||||
|
||||
/** `docker compose ps -a` — useful for diagnosing why up didn't yield healthy containers. */
|
||||
/** `docker compose ps -a` — exposed as `apps.containers.ps`. */
|
||||
export async function composePs(
|
||||
kind: ResourceKind,
|
||||
uuid: string,
|
||||
@@ -162,18 +117,275 @@ export async function composePs(
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify the rendered compose dir exists before trying to run docker
|
||||
* compose against it. Returns a friendly null-on-missing instead of
|
||||
* an opaque ENOENT.
|
||||
* Verify the rendered compose dir exists. Returns a friendly
|
||||
* null-on-missing instead of an opaque ENOENT.
|
||||
*/
|
||||
export async function composeDirExists(
|
||||
kind: ResourceKind,
|
||||
uuid: string,
|
||||
): Promise<boolean> {
|
||||
// We can't `ls` the dir directly (perm denied), but a docker bind-mount
|
||||
// probe will fail-closed if the path is missing.
|
||||
const dir = composeDir(kind, uuid);
|
||||
const cmd = `docker run --rm -v ${sq(`${dir}:/w`)} alpine sh -c 'test -f /w/docker-compose.yml && echo OK || echo MISSING'`;
|
||||
const r = await runOnCoolifyHost(cmd, { timeoutMs: 30_000 });
|
||||
return r.stdout.trim().endsWith('OK');
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// Post-deploy fixes (apps.create's reliability layer)
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
export interface CoolifyPostDeployOptions {
|
||||
/** Coolify service UUID. */
|
||||
uuid: string;
|
||||
/** Real custom FQDN, e.g. "crm.mark.vibnai.com" — must NOT include scheme. */
|
||||
fqdn: string;
|
||||
/** Compose service name of the user-facing app, e.g. "twenty". */
|
||||
publicAppName: string;
|
||||
/**
|
||||
* HTTP port the public app listens on inside the container.
|
||||
* If omitted, we try to detect it from `.env` (looking for
|
||||
* `SERVICE_FQDN_<APP>_<PORT>`). Falls back to 3000.
|
||||
*/
|
||||
port?: number;
|
||||
}
|
||||
|
||||
export interface CoolifyPostDeployResult {
|
||||
ok: boolean;
|
||||
steps: {
|
||||
envRewrite: { ok: boolean; detail: string };
|
||||
portLabel: { ok: boolean; detail: string };
|
||||
proxyNetwork: { ok: boolean; detail: string };
|
||||
recreate: { ok: boolean; detail: string };
|
||||
proxyRestart: { ok: boolean; detail: string };
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Pipe a Python script (UTF-8 bytes, base64-encoded) over stdin to a
|
||||
* docker-run that mounts the resource's compose dir at /work
|
||||
* and exposes the inputs as env vars. We use base64 to sidestep all
|
||||
* shell-escaping issues with python triple-quoted strings.
|
||||
*/
|
||||
function buildPythonRunner(script: string, env: Record<string, string>, dir: string, networkAttach = false): string {
|
||||
const b64 = Buffer.from(script, 'utf8').toString('base64');
|
||||
const envFlags = Object.entries(env)
|
||||
.map(([k, v]) => `-e ${sq(`${k}=${v}`)}`)
|
||||
.join(' ');
|
||||
// We need a Python image with sed-style file editing. python:3-alpine
|
||||
// is ~50MB and ships with regex + os out of the box.
|
||||
return [
|
||||
`echo ${sq(b64)} | base64 -d |`,
|
||||
'docker run --rm -i',
|
||||
`-v ${sq(`${dir}:/work`)}`,
|
||||
networkAttach ? '-v /var/run/docker.sock:/var/run/docker.sock' : '',
|
||||
envFlags,
|
||||
'python:3-alpine',
|
||||
'python -',
|
||||
].filter(Boolean).join(' ');
|
||||
}
|
||||
|
||||
/**
 * Apply the post-deploy fixes to a freshly-deployed Coolify service so the
 * user-facing URL works on the very first hit.
 *
 * The file edits (steps 1-2) and the network attach (step 3) detect whether
 * the change is already in place and no-op if so. Steps 4 and 5 always
 * recreate the public container and restart the proxy, so repeated calls are
 * safe but not free.
 *
 * Sequencing:
 *   1. Rewrite .env's SERVICE_FQDN_* / SERVICE_URL_* for the public app
 *      (cosmetic for Traefik but critical for any frontend that bakes the
 *      URL into its bundle from these env vars at startup).
 *   2. Inject the missing `loadbalancer.server.port` label (and the router
 *      `.service` labels that point at it) into the compose file.
 *   3. Connect coolify-proxy to the project network so Traefik can reach
 *      the public container by its compose name.
 *   4. `docker compose up -d --force-recreate <publicAppName>` — applies the
 *      new env (step 1) and label (step 2) without touching internal
 *      services like postgres/redis.
 *   5. `docker restart coolify-proxy` so Traefik re-discovers the
 *      newly-attached network and the recreated container's labels.
 *
 * Every step runs even if an earlier one failed; failures raised inside a
 * step are caught and recorded in that step's `detail`.
 *
 * @param opts - `uuid` (service uuid, also used as the project network
 *   name), `fqdn` (the real custom domain), `publicAppName` (compose service
 *   name of the public-facing container) and optional `port` (container
 *   port Traefik should target; defaults to 3000).
 * @returns Per-step `{ ok, detail }` diagnostics; `ok` on the result is true
 *   only when every step succeeded.
 */
export async function applyCoolifyPostDeployFixes(
  opts: CoolifyPostDeployOptions,
): Promise<CoolifyPostDeployResult> {
  const { uuid, fqdn, publicAppName, port = 3000 } = opts;
  const dir = composeDir('service', uuid);

  // Normalise anything thrown into a printable diagnostic string.
  const errText = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  const result: CoolifyPostDeployResult = {
    ok: false,
    steps: {
      envRewrite: { ok: false, detail: '' },
      portLabel: { ok: false, detail: '' },
      proxyNetwork: { ok: false, detail: '' },
      recreate: { ok: false, detail: '' },
      proxyRestart: { ok: false, detail: '' },
    },
  };

  // ── Step 1+2 fused: rewrite .env + inject port label in one Python pass.
  // NOTE: this is a JS template literal, so Python escapes are written with
  // a doubled backslash (\\n, \\d) and the script must stay at column 0.
  const editorScript = `
import os, re, sys

env_file = "/work/.env"
compose_file = "/work/docker-compose.yml"
fqdn = os.environ["NEW_FQDN"]
app = os.environ["APP"]  # e.g. "twenty"
APP = app.upper()
uuid = os.environ["UUID"]
port = os.environ["PORT"]

env_changes = []
if os.path.exists(env_file):
    with open(env_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    out = []
    for line in lines:
        new = line
        # SERVICE_FQDN_<APP>=<host>
        if re.match(rf"^SERVICE_FQDN_{re.escape(APP)}=", line):
            new = f"SERVICE_FQDN_{APP}={fqdn}\\n"
        # SERVICE_URL_<APP>=<scheme://host>
        elif re.match(rf"^SERVICE_URL_{re.escape(APP)}=", line):
            new = f"SERVICE_URL_{APP}=https://{fqdn}\\n"
        else:
            m = re.match(rf"^SERVICE_FQDN_{re.escape(APP)}_(\\d+)=", line)
            if m:
                new = f"SERVICE_FQDN_{APP}_{m.group(1)}={fqdn}:{m.group(1)}\\n"
            else:
                m = re.match(rf"^SERVICE_URL_{re.escape(APP)}_(\\d+)=", line)
                if m:
                    new = f"SERVICE_URL_{APP}_{m.group(1)}=https://{fqdn}:{m.group(1)}\\n"
        if new != line:
            env_changes.append(line.strip() + " => " + new.strip())
        out.append(new)
    # Only touch the file when something actually changed.
    if env_changes:
        with open(env_file, "w", encoding="utf-8") as f:
            f.writelines(out)

# Inject port label into compose if missing.
label_changes = []
svc_id = f"{app}-svc-{uuid}"
needed_router_svc = f"traefik.http.routers.https-0-{uuid}-{app}.service={svc_id}"
needed_loadbalance = f"traefik.http.services.{svc_id}.loadbalancer.server.port={port}"
http_router_svc = f"traefik.http.routers.http-0-{uuid}-{app}.service={svc_id}"

with open(compose_file, "r", encoding="utf-8") as f:
    s = f.read()

if needed_loadbalance not in s:
    # Anchor: the existing tls=true label for the https router.
    anchor = f"traefik.http.routers.https-0-{uuid}-{app}.tls=true"
    idx = s.find(anchor)  # first occurrence only: the public service block
    if idx < 0:
        label_changes.append(f"WARN: anchor '{anchor}' not found; label NOT injected")
    else:
        line_start = s.rfind("\\n", 0, idx) + 1
        line_end = s.find("\\n", idx)
        if line_end < 0:
            line_end = len(s)
        prefix = s[line_start:idx]
        suffix = s[idx + len(anchor):line_end]
        # Reuse the anchor line's own indentation and quoting so the new
        # list items are valid YAML whatever style the compose file uses.
        quote = prefix[-1:] if prefix[-1:] in ("'", '"') else ""
        lead = prefix[:len(prefix) - len(quote)]
        is_list_item = lead.strip() == "-" and (not quote or suffix.startswith(quote))
        if is_list_item:
            wanted = [http_router_svc, needed_router_svc, needed_loadbalance]
            missing = [lbl for lbl in wanted if lbl not in s]
            extra = "".join("\\n" + lead + quote + lbl + quote for lbl in missing)
            s = s[:line_end] + extra + s[line_end:]
            with open(compose_file, "w", encoding="utf-8") as f:
                f.write(s)
            label_changes.append(f"injected loadbalancer.server.port={port}")
        else:
            label_changes.append(f"WARN: anchor '{anchor}' is not a YAML list item; label NOT injected")
else:
    label_changes.append("loadbalancer.server.port already present")

print("ENV_CHANGES:" + str(len(env_changes)))
for c in env_changes:
    print(" " + c)
print("LABEL_CHANGES:")
for c in label_changes:
    print(" " + c)
`;

  try {
    const cmd = buildPythonRunner(
      editorScript,
      { NEW_FQDN: fqdn, APP: publicAppName, UUID: uuid, PORT: String(port) },
      dir,
    );
    const r = await runOnCoolifyHost(cmd, { timeoutMs: 60_000 });
    if (r.code === 0) {
      const stdout = r.stdout.trim();
      const text = stdout.slice(-1500);
      // Look for the warning only in the label section, so an env value that
      // happens to contain "WARN:" cannot flip the port-label verdict.
      const labelAt = stdout.lastIndexOf('LABEL_CHANGES:');
      const labelReport = labelAt >= 0 ? stdout.slice(labelAt) : stdout;
      result.steps.envRewrite = { ok: true, detail: text };
      result.steps.portLabel = { ok: !labelReport.includes('WARN:'), detail: text };
    } else {
      const detail = (r.stderr || r.stdout).trim().slice(-500);
      result.steps.envRewrite = { ok: false, detail };
      result.steps.portLabel = { ok: false, detail };
    }
  } catch (e) {
    const detail = errText(e);
    result.steps.envRewrite = { ok: false, detail };
    result.steps.portLabel = { ok: false, detail };
  }

  // ── Step 3: attach coolify-proxy to project network
  try {
    // No `|| true` here: the exit code is what distinguishes a real failure
    // (network or proxy container missing) from success. The "endpoint with
    // name coolify-proxy already exists in network" error is the
    // success-already-applied case and is recognised from the output.
    const r = await runOnCoolifyHost(
      `docker network connect ${sq(uuid)} coolify-proxy 2>&1`,
      { timeoutMs: 10_000 },
    );
    const text = (r.stdout || r.stderr).trim();
    if (/already exists/i.test(text)) {
      result.steps.proxyNetwork = { ok: true, detail: 'already attached' };
    } else if (r.code === 0) {
      result.steps.proxyNetwork = { ok: true, detail: text || 'attached' };
    } else {
      result.steps.proxyNetwork = {
        ok: false,
        detail: text.slice(-300) || `docker network connect exited with code ${r.code}`,
      };
    }
  } catch (e) {
    const detail = errText(e);
    // If the runner surfaces the daemon error by throwing, the
    // already-attached case is still a success.
    const alreadyAttached = /already exists/i.test(detail);
    result.steps.proxyNetwork = {
      ok: alreadyAttached,
      detail: alreadyAttached ? 'already attached' : detail,
    };
  }

  // ── Step 4: recreate ONLY the public app to apply env+label changes
  // (not the whole stack — postgres/redis/worker stay where they are)
  try {
    const r = await composeRun('service', uuid, ['up', '-d', '--force-recreate', publicAppName], {
      timeoutMs: 300_000,
    });
    const detail = (r.stderr || r.stdout)
      .replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '') // strip ANSI escape sequences
      .replace(/[\x00-\x08\x0B-\x1F]/g, '') // strip other control chars, keep \t and \n
      .trim()
      .slice(-400);
    // compose returns 0 on success, non-zero on partial failure;
    // sidecar `depends_on` timeouts can produce a non-zero exit
    // even though the public container started successfully, so the
    // verdict comes from `docker ps`, not from compose's exit code.
    const containerName = `${publicAppName}-${uuid}`;
    const probe = await runOnCoolifyHost(
      `docker ps --filter ${sq(`name=${containerName}`)} --format '{{.Names}}'`,
      { timeoutMs: 8_000 },
    );
    const running = probe.stdout.trim().length > 0;
    result.steps.recreate = {
      ok: running,
      detail: running ? `${containerName} running` : detail,
    };
  } catch (e) {
    result.steps.recreate = { ok: false, detail: errText(e) };
  }

  // ── Step 5: nudge Traefik to re-discover via proxy restart
  try {
    const r = await runOnCoolifyHost(`docker restart coolify-proxy`, { timeoutMs: 30_000 });
    result.steps.proxyRestart = {
      ok: r.code === 0,
      detail: r.code === 0 ? 'restarted' : (r.stderr || r.stdout).trim().slice(-200),
    };
  } catch (e) {
    result.steps.proxyRestart = { ok: false, detail: errText(e) };
  }

  result.ok = Object.values(result.steps).every(s => s.ok);
  return result;
}
|
||||
|
||||
Reference in New Issue
Block a user