master-ai/vibn-frontend/lib/ai/error-recovery.ts

/**
 * Tool-error recovery middleware.
 *
 * Pattern-matches known-recoverable error strings in MCP tool results
 * and produces a synthetic system message instructing the model on the
 * exact recovery action. Injected into the conversation before the
 * next model round.
 *
 * Why this exists (vs just a system-prompt rule):
 * Static prompt rules lose against the accumulating reality of tool
 * results. We've
 * shipped 4 orphan twenty-* services because the model kept doing
 * delete-and-recreate even though the prompt told it not to. The
 * model treats prompt rules as soft guidance; it cannot ignore a
 * fresh `role: "system"` message that arrives between tool result
 * and next call. See AI_HARNESS_GAPS.md §1 for the full case.
 *
 * Adding a rule:
 *   1. Pick a regex that matches the error string with NO false
 *      positives. If it could fire on a legitimate success or
 *      unrelated failure, leave it out — silent miss > wrong fix.
 *   2. Write the `diagnosis` as a sentence the model can use as-is
 *      in a status update to the user.
 *   3. Write `requiredAction` as the literal next tool call(s) the
 *      model should make, with arg shapes if non-obvious.
 *   4. Write `antipattern` as the wrong-but-tempting move the model
 *      keeps doing. The injected message tells it explicitly NOT
 *      to do this.
 *
 * Rules are checked in registration order. First match wins.
 */

/**
 * One entry in the recovery table: how to recognise a known tool error
 * and what the model must be told to do about it.
 *
 * The `pattern` is tested against the tool result as text; the three
 * string fields are interpolated into the injected recovery message.
 */
export interface RecoveryRule {
  /**
   * Stable identifier for logs / future telemetry. Also rendered in the
   * `[RECOVERY: <id>]` header of the injected message.
   */
  id: string;
  /**
   * Pattern that uniquely identifies this error in tool output. It is
   * run against the raw string result, or against the JSON-serialized
   * form when the result is not a string.
   */
  pattern: RegExp;
  /** Human-readable explanation of what went wrong (the "Diagnosis" line). */
  diagnosis: string;
  /** Exact next tool call(s) the model should make (the "Required next action" line). */
  requiredAction: string;
  /**
   * The wrong move the model keeps making for this error. Rendered on
   * the prohibition line of the injected message.
   */
  antipattern: string;
}

/**
 * Registered recovery rules, checked in order; the first matching rule wins.
 *
 * Patterns are run against either the raw tool-result string or its
 * JSON-serialized form. JSON serialization escapes embedded double quotes
 * (`"` becomes `\"`), so any pattern that contains a literal quote must
 * tolerate optional backslashes in front of it (`\\*"`), otherwise the rule
 * silently misses whenever the error text sits inside a string field of an
 * object result.
 */
const RULES: RecoveryRule[] = [
  {
    id: 'orphan-container-conflict',
    // Matches: `Conflict. The container name "/postgres-..." is already in use`
    // Real prod example, twenty-crm thread, 2026-04-30.
    // `\\*` before each optional quote lets the same pattern match the
    // JSON-escaped form (`\"/postgres-...\"`) as well as the raw one.
    pattern: /Conflict\.\s+The container name\s+\\*["/]?[\w./-]+\\*["/]?\s+is already in use/i,
    diagnosis:
      'A previous deploy left an orphan Docker container holding this service\'s container name. The new boot is colliding with the orphan. This is a recoverable state.',
    requiredAction:
      'Call `apps_unstick { uuid }` against the SAME app uuid you were just trying to deploy, then `apps_deploy { uuid }`. Both calls use the existing uuid; do not create a new app.',
    antipattern:
      'Do NOT delete the failing app and create a new one with a different name. That keeps the orphan running, doubles the stack, and ships another shadow service. We have shipped 4 orphan twenty-* services this way before. Do not repeat it.',
  },
  {
    id: 'image-pull-denied',
    // Matches: `pull access denied for ...` and `manifest unknown` from the registry.
    pattern: /(pull access denied for|manifest unknown|repository does not exist)/i,
    diagnosis:
      'The Docker image referenced by this app is not on the host, and the registry pull failed (private repo, missing credentials, or wrong tag).',
    requiredAction:
      'Call `apps_repair { uuid }` to re-attempt the post-deploy fixes. If that fails too, surface the exact image reference to the user and ask whether the image should be pulled from a different registry or rebuilt.',
    antipattern:
      'Do NOT retry the same `apps_deploy` blindly hoping the registry will respond differently. The pull failure is persistent until the underlying image-availability issue is fixed.',
  },
  {
    id: 'workspace-quota-exceeded',
    // Matches the structured 402 returned by quotas.ts. The substring
    // "QUOTA_EXCEEDED" (the .code field) plus "active dev containers"
    // or "active projects" disambiguates from arbitrary text.
    pattern: /(QUOTA_EXCEEDED.*active (dev containers|projects)|already has \d+\/\d+ active (dev containers|projects))/i,
    diagnosis:
      'The workspace has hit its soft cap on active resources. This is a beta-limit guardrail, not a real error.',
    requiredAction:
      'Tell the user clearly which cap was hit and offer the two options: (1) suspend an existing dev container with `devcontainer_suspend { projectId }` if they have an idle one, or delete an unused project, OR (2) email support@vibnai.com to raise their cap. Do NOT retry the same call expecting a different result.',
    antipattern:
      'Do NOT keep retrying `devcontainer_ensure` or `projects.create` blindly. The cap is real until something is freed up. Do not try to bypass it by switching workspaces or projects.',
  },
  {
    id: 'devcontainer-still-provisioning',
    // Matches the JSON returned by devcontainer.status when the row is
    // still in 'provisioning' state. The status tool now self-heals
    // via a `true` exec probe, so seeing this means the probe failed
    // (container not yet up) — keep waiting OR escalate.
    // `\\*` before the quotes also matches the status JSON when it is
    // nested as an escaped string inside an outer JSON envelope.
    // NOTE(review): the action text covers ageSeconds < 60 and > 120 but
    // says nothing about the window in between — confirm intended behavior.
    pattern: /"state\\*"\s*:\s*\\*"provisioning\\*"/,
    diagnosis:
      'The dev container is still booting. devcontainer.status already tried a liveness probe and the container did not respond yet. First-boot for a brand-new project takes 15-45s; image-pull failures take longer to surface as `likelyFailed: true`.',
    requiredAction:
      'If `ageSeconds < 60` and `likelyFailed` is not set: send the user ONE status message ("Spinning up your environment, this takes ~30s on first boot...") and wait. Do NOT poll devcontainer.status more than once every 15 seconds, and never more than 3 times in a row. After the wait, call `shell.exec { command: "echo ready" }` instead of `devcontainer.status` — shell.exec lazy-provisions and will return the moment the container is reachable, which is the actual signal you need. If `likelyFailed: true` (ageSeconds > 120): surface the failure to the user with the project id and stop polling.',
    antipattern:
      'Do NOT call `devcontainer.status` repeatedly in a tight loop. Status is a read; it does not boot anything. Polling it back-to-back wastes turns and shows the user a wall of identical "still provisioning" messages.',
  },
  {
    id: 'port-already-allocated',
    // Matches: `port is already allocated` / `bind: address already in use`.
    // NOTE(review): this rule names the tool `shell_exec` while the
    // provisioning rule names it `shell.exec` — confirm which spelling is
    // the registered tool name and align the two.
    pattern: /(port\s+\S+\s+is already allocated|bind:\s+address already in use|Ports are not available)/i,
    diagnosis:
      'A different container or process on the host is already bound to the port this app is trying to claim.',
    requiredAction:
      'Use `apps_containers_list { uuid }` plus `shell_exec` (e.g. `docker ps --filter publish=<port>`) to identify the holder. If the holder is a stale Coolify-managed container, call `apps_unstick { uuid }` on its app. If it is a legitimate other app, surface the conflict to the user and ask which one should get the port.',
    antipattern:
      'Do NOT pick a random different port and retry. Port choice is part of the user\'s product configuration; a silent change will break their docs / DNS / clients.',
  },
];

/**
 * Inspect a tool result and return the matching recovery rule, or
 * null if nothing matches.
 *
 * The result is matched as plain text. Strings are used as-is; anything
 * else is JSON-serialized first. Rules are tried in registration order
 * and the first rule that matches any text form of the result wins.
 *
 * This function never throws: a result that cannot be serialized
 * (circular reference, BigInt) is treated as "no known error" — a silent
 * miss is preferable to crashing the tool loop.
 *
 * @param toolResult Raw tool output: a string, a structured object, or a caught Error.
 * @returns The first matching rule, or null when no rule applies.
 */
export function detectKnownError(toolResult: unknown): RecoveryRule | null {
  if (toolResult == null) return null;

  const texts = collectMatchTexts(toolResult);
  for (const rule of RULES) {
    for (const text of texts) {
      // Guard against a rule registered with the `g`/`y` flag: `test()` on
      // such a regex is stateful and would miss intermittently otherwise.
      rule.pattern.lastIndex = 0;
      if (rule.pattern.test(text)) return rule;
    }
  }
  return null;
}

/**
 * Build the text forms of a tool result that rule patterns are run against:
 * the raw/serialized text, an Error's message (not part of its JSON form),
 * and a variant with JSON quote-escaping removed.
 */
function collectMatchTexts(toolResult: unknown): string[] {
  const texts: string[] = [];

  if (typeof toolResult === 'string') {
    texts.push(toolResult);
  } else {
    if (toolResult instanceof Error) {
      // `message` is non-enumerable, so JSON.stringify(error) drops it.
      texts.push(`${toolResult.name}: ${toolResult.message}`);
    }
    try {
      // JSON.stringify yields undefined for functions/symbols at runtime.
      const serialized: string | undefined = JSON.stringify(toolResult);
      if (serialized !== undefined) texts.push(serialized);
    } catch {
      // Circular structure or BigInt: nothing more to inspect.
    }
  }

  // JSON escapes embedded quotes (`"` -> `\"`), which defeats patterns
  // containing a literal quote. Also offer an unescaped variant so error
  // strings nested inside string fields still match.
  const unescaped = texts
    .map((text) => text.replace(/\\+"/g, '"'))
    .filter((text, i) => text !== texts[i]);

  return texts.concat(unescaped);
}

/**
 * Format a recovery rule as the synthetic system message we inject
 * into the conversation before the next model round. The shape is
 * deliberately imperative ("CALL X. DO NOT do Y.") because that is
 * the prompting style the model responds to most reliably.
 *
 * The message is five newline-separated lines: a `[RECOVERY: <id>]`
 * header, the diagnosis, the required next action, the prohibition,
 * and a fixed reminder to send the user a status line first.
 *
 * @param rule The matched recovery rule.
 * @returns The message text to inject as a system message.
 */
export function formatRecoveryMessage(rule: RecoveryRule): string {
  // Rule authors usually write the antipattern as a full imperative
  // ("Do NOT delete ..."). Only add the "Do NOT:" label when the text
  // does not already open with it, so the model never reads the
  // stuttering "Do NOT: Do NOT ...".
  const alreadyImperative = /^\s*do\s+not\b/i.test(rule.antipattern);
  const prohibition = alreadyImperative
    ? rule.antipattern.trimStart()
    : `Do NOT: ${rule.antipattern}`;

  return [
    `[RECOVERY: ${rule.id}]`,
    `Diagnosis: ${rule.diagnosis}`,
    `Required next action: ${rule.requiredAction}`,
    prohibition,
    `Send the user a one-line status before the recovery call so they know what you are doing.`,
  ].join('\n');
}