diff --git a/lib/ai/error-recovery.ts b/lib/ai/error-recovery.ts index a1d20532..d969e533 100644 --- a/lib/ai/error-recovery.ts +++ b/lib/ai/error-recovery.ts @@ -79,6 +79,20 @@ const RULES: RecoveryRule[] = [ antipattern: 'Do NOT keep retrying `devcontainer_ensure` or `projects.create` blindly. The cap is real until something is freed up. Do not try to bypass it by switching workspaces or projects.', }, + { + id: 'devcontainer-still-provisioning', + // Matches the JSON returned by devcontainer.status when the row is + // still in 'provisioning' state. The status tool now self-heals + // via a `true` exec probe, so seeing this means the probe failed + // (container not yet up) — keep waiting OR escalate. + pattern: /"state"\s*:\s*"provisioning"/, + diagnosis: + 'The dev container is still booting. devcontainer.status already tried a liveness probe and the container did not respond yet. First-boot for a brand-new project takes 15-45s; image-pull failures take longer to surface as `likelyFailed: true`.', + requiredAction: + 'If `ageSeconds < 60` and `likelyFailed` is not set: send the user ONE status message ("Spinning up your environment, this takes ~30s on first boot...") and wait. Do NOT poll devcontainer.status more than once every 15 seconds, and never more than 3 times in a row. After the wait, call `shell.exec { command: "echo ready" }` instead of `devcontainer.status` — shell.exec lazy-provisions and will return the moment the container is reachable, which is the actual signal you need. If `likelyFailed: true` (ageSeconds > 120): surface the failure to the user with the project id and stop polling.', + antipattern: + 'Do NOT call `devcontainer.status` repeatedly in a tight loop. Status is a read; it does not boot anything. Polling it back-to-back wastes turns and shows the user a wall of identical "still provisioning" messages.', + }, { id: 'port-already-allocated', // Matches: `port is already allocated` / `bind: address already in use`. 
/**
 * Report the lifecycle state of a project's dev container.
 *
 * Rows that are not in 'provisioning' are returned as stored. For a row
 * that is still 'provisioning', a plain read is not enough: a caller that
 * only polls status would wait forever for a flip that otherwise happens
 * via execInDevContainer. So this call runs a cheap `true` exec inside the
 * container as a liveness probe. Coolify's own service status is not used
 * because it reports 'running:unknown' for any service without a
 * healthcheck/fqdn, which is every dev container.
 *
 * @param projectId Project whose dev container row is looked up.
 * @returns
 *   - `{ exists: false, state: 'absent' }` when there is no row.
 *   - The stored state plus `ageSeconds` when the row is not provisioning.
 *   - `state: 'running'` when the liveness probe succeeds; `selfHealed: true`
 *     is included only if the activity bookkeeping also succeeded.
 *   - `state: 'provisioning'` otherwise; `likelyFailed: true` is included
 *     only once the row is older than the grace window. Both optional flags
 *     are omitted (never `false`) when they do not apply.
 */
export async function getDevContainerStatus(projectId: string): Promise<{
  exists: boolean;
  state: DevContainerRow['state'] | 'absent';
  serviceUuid: string | null;
  /** Seconds since the row was created; useful for AI to decide whether to keep polling. */
  ageSeconds?: number;
  /** Set when state was just self-healed by this call. */
  selfHealed?: boolean;
  /** Set when state is stuck in provisioning past the grace window (likely failed). */
  likelyFailed?: boolean;
}> {
  // A row that has been 'provisioning' longer than this is almost certainly
  // stuck (image pull failure, scheduling failure, etc.).
  const provisioningGraceSeconds = 120;
  // Upper bound for the liveness probe so a status read stays cheap.
  const probeTimeoutMs = 5_000;

  const row = await getDevContainerRow(projectId);
  if (!row) return { exists: false, state: 'absent', serviceUuid: null };

  // Clamp at zero: clock skew between the database and this host could
  // otherwise surface a negative age to callers.
  const ageSeconds = Math.max(
    0,
    Math.floor((Date.now() - row.created_at.getTime()) / 1000),
  );

  // Already running or suspended: return the stored state as-is.
  if (row.state !== 'provisioning') {
    return { exists: true, state: row.state, serviceUuid: row.service_uuid, ageSeconds };
  }

  // Liveness probe. Only the exec itself is guarded here, so that a failure
  // in later bookkeeping cannot be mistaken for "container not up yet".
  let containerResponded = false;
  if (isCoolifySshConfigured()) {
    try {
      const probe = await execInCoolifyApp({
        appUuid: row.service_uuid,
        service: 'vibn-dev',
        command: 'true',
        user: 'vibn',
        timeoutMs: probeTimeoutMs,
      });
      containerResponded = probe.exitCode === 0;
    } catch {
      // Exec failed — container probably not yet up. Fall through to the
      // age-based likelyFailed heuristic.
    }
  }

  if (containerResponded) {
    // NOTE(review): touchActivity is presumed to be what flips the stored
    // row out of 'provisioning' — confirm against its implementation.
    let activityRecorded = false;
    try {
      await touchActivity(projectId);
      activityRecorded = true;
    } catch {
      // Best-effort bookkeeping: the container answered the probe, so it is
      // reported as running even if the activity update failed. selfHealed
      // is withheld because the update did not go through.
    }
    return {
      exists: true,
      state: 'running',
      serviceUuid: row.service_uuid,
      ageSeconds,
      ...(activityRecorded ? { selfHealed: true } : {}),
    };
  }

  // Still provisioning. Flag it as likely failed only past the grace window
  // so the caller can stop polling and tell the user instead of looping.
  return {
    exists: true,
    state: row.state,
    serviceUuid: row.service_uuid,
    ageSeconds,
    ...(ageSeconds > provisioningGraceSeconds ? { likelyFailed: true } : {}),
  };
}

// Re-export getService so route handlers can pull live Coolify status