From c105b42d0cf429b22248160ac67a0586da5cf401 Mon Sep 17 00:00:00 2001
From: Mark Henderson
Date: Fri, 1 May 2026 11:08:48 -0700
Subject: [PATCH] feat(ai): tool-error recovery middleware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pattern-matches known-recoverable MCP tool failures and injects a
synthetic imperative message into the conversation right after the
failing tool result. Static prompt rules lose to accumulated tool
reality (we've shipped 4 orphan twenty-* services because the model
ignored the "no delete-and-recreate" rule); a fresh role:'user'
message at decision time does not.

Initial rules cover the three highest-confidence Docker failure
patterns: orphan container conflict (use apps_unstick), image pull
denied (use apps_repair), port already allocated (identify holder).
Each rule names the wrong-but-tempting move explicitly.

See AI_HARNESS_GAPS.md §1 for the failure case this addresses.
---
 app/api/chat/route.ts    |  19 ++++++
 lib/ai/error-recovery.ts | 129 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 lib/ai/error-recovery.ts

diff --git a/app/api/chat/route.ts b/app/api/chat/route.ts
index f60f6e34..332096cc 100644
--- a/app/api/chat/route.ts
+++ b/app/api/chat/route.ts
@@ -19,6 +19,7 @@ import { authSession } from '@/lib/auth/session-server';
 import { query } from '@/lib/db-postgres';
 import { callGeminiChat } from '@/lib/ai/gemini-chat';
 import { VIBN_TOOL_DEFINITIONS, executeMcpTool } from '@/lib/ai/vibn-tools';
+import { detectKnownError, formatRecoveryMessage } from '@/lib/ai/error-recovery';
 import type { ChatMessage, ToolCall } from '@/lib/ai/gemini-chat';
 
 // Bumped from 6 to 12 because Path B chains (devcontainer.ensure →
@@ -457,6 +458,24 @@ export async function POST(request: Request) {
           toolName: tc.name,
           thoughtSignature: tc.thoughtSignature,
         });
+
+        // Harness-layer error recovery: if the tool result matches
+        // a known-recoverable failure (e.g. orphan container
+        // conflict), inject a synthetic user-role message
+        // immediately after the tool result. This puts a fresh
+        // imperative ("CALL apps_unstick. DO NOT delete-and-
+        // recreate.") in the conversation right where the model
+        // is about to decide what to do next. Static prompt
+        // rules lose to accumulated tool reality; an injected
+        // message at decision time does not. See
+        // lib/ai/error-recovery.ts.
+        const recovery = detectKnownError(result);
+        if (recovery) {
+          messages.push({
+            role: 'user',
+            content: formatRecoveryMessage(recovery),
+          });
+        }
       }
 
       if (loopBreakReason) break;
diff --git a/lib/ai/error-recovery.ts b/lib/ai/error-recovery.ts
new file mode 100644
index 00000000..7f0c28f0
--- /dev/null
+++ b/lib/ai/error-recovery.ts
@@ -0,0 +1,129 @@
+/**
+ * Tool-error recovery middleware.
+ *
+ * Pattern-matches known-recoverable error strings in MCP tool results
+ * and produces a synthetic user-role message instructing the model on
+ * the exact recovery action. Injected into the conversation before the
+ * next model round.
+ *
+ * Why this exists (vs just a system-prompt rule):
+ *   Static prompt rules lose against accumulated tool reality. We've
+ *   shipped 4 orphan twenty-* services because the model kept doing
+ *   delete-and-recreate even though the prompt told it not to. The
+ *   model treats prompt rules as soft guidance; it cannot ignore a
+ *   fresh `role: "user"` message that arrives between tool result
+ *   and next call. See AI_HARNESS_GAPS.md §1 for the full case.
+ *
+ * Adding a rule:
+ *   1. Pick a regex that matches the error string with NO false
+ *      positives. If it could fire on a legitimate success or
+ *      unrelated failure, leave it out — silent miss > wrong fix.
+ *      Object results are matched against their JSON.stringify
+ *      form, where a quote in the error text shows up as `\"`;
+ *      make the pattern accept both spellings.
+ *   2. Write the `diagnosis` as a sentence the model can use as-is
+ *      in a status update to the user.
+ *   3. Write `requiredAction` as the literal next tool call(s) the
+ *      model should make, with arg shapes if non-obvious.
+ *   4. Write `antipattern` as the wrong-but-tempting move the model
+ *      keeps doing. The injected message tells it explicitly NOT
+ *      to do this.
+ *
+ * Rules are checked in registration order. First match wins.
+ */
+
+export interface RecoveryRule {
+  /** Stable identifier for logs / future telemetry. */
+  id: string;
+  /** Pattern that uniquely identifies this error in tool output. */
+  pattern: RegExp;
+  /** Human-readable explanation of what went wrong. */
+  diagnosis: string;
+  /** Exact next tool call(s) the model should make. */
+  requiredAction: string;
+  /** The wrong move the model keeps making for this error. */
+  antipattern: string;
+}
+
+const RULES: RecoveryRule[] = [
+  {
+    id: 'orphan-container-conflict',
+    // Matches: `Conflict. The container name "/postgres-..." is already in use`
+    // Real prod example, twenty-crm thread, 2026-04-30.
+    // The `\\?` before each quote also accepts the `\"` spelling the
+    // message takes once an object result has gone through JSON.stringify.
+    pattern: /Conflict\.\s+The container name\s+\\?["/]?[\w./-]+\\?["/]?\s+is already in use/i,
+    diagnosis:
+      'A previous deploy left an orphan Docker container holding this service\'s container name. The new boot is colliding with the orphan. This is a recoverable state.',
+    requiredAction:
+      'Call `apps_unstick { uuid }` against the SAME app uuid you were just trying to deploy, then `apps_deploy { uuid }`. Both calls use the existing uuid; do not create a new app.',
+    antipattern:
+      'Do NOT delete the failing app and create a new one with a different name. That keeps the orphan running, doubles the stack, and ships another shadow service. We have shipped 4 orphan twenty-* services this way before. Do not repeat it.',
+  },
+  {
+    id: 'image-pull-denied',
+    // Matches: `pull access denied for ...` and `manifest unknown` from the registry.
+    pattern: /(pull access denied for|manifest unknown|repository does not exist)/i,
+    diagnosis:
+      'The Docker image referenced by this app is not on the host, and the registry pull failed (private repo, missing credentials, or wrong tag).',
+    requiredAction:
+      'Call `apps_repair { uuid }` to re-attempt the post-deploy fixes. If that fails too, surface the exact image reference to the user and ask whether the image should be pulled from a different registry or rebuilt.',
+    antipattern:
+      'Do NOT retry the same `apps_deploy` blindly hoping the registry will respond differently. The pull failure is persistent until the underlying image-availability issue is fixed.',
+  },
+  {
+    id: 'port-already-allocated',
+    // Matches: `port is already allocated` / `bind: address already in use`.
+    pattern: /(port\s+\S+\s+is already allocated|bind:\s+address already in use|Ports are not available)/i,
+    diagnosis:
+      'A different container or process on the host is already bound to the port this app is trying to claim.',
+    requiredAction:
+      'Use `apps_containers_list { uuid }` plus `shell_exec` (e.g. `docker ps --filter publish=`) to identify the holder. If the holder is a stale Coolify-managed container, call `apps_unstick { uuid }` on its app. If it is a legitimate other app, surface the conflict to the user and ask which one should get the port.',
+    antipattern:
+      'Do NOT pick a random different port and retry. Port choice is part of the user\'s product configuration; a silent change will break their docs / DNS / clients.',
+  },
+];
+
+/**
+ * Inspect a tool result and return the matching recovery rule, or
+ * null if nothing matches. Strings are matched as-is; any other value
+ * is matched against its JSON.stringify form, in which quotes from the
+ * original error text appear as `\"`. A value that cannot be
+ * serialized (circular reference, BigInt) or that serializes to
+ * nothing (function, symbol) yields null instead of throwing into
+ * the chat loop.
+ */
+export function detectKnownError(toolResult: unknown): RecoveryRule | null {
+  if (toolResult == null) return null;
+  let text: string | undefined;
+  if (typeof toolResult === 'string') {
+    text = toolResult;
+  } else {
+    try {
+      text = JSON.stringify(toolResult);
+    } catch {
+      return null;
+    }
+  }
+  if (!text) return null;
+  for (const rule of RULES) {
+    if (rule.pattern.test(text)) return rule;
+  }
+  return null;
+}
+
+/**
+ * Format a recovery rule as the synthetic user-role message we inject
+ * into the conversation before the next model round. The shape is
+ * deliberately imperative ("CALL X. DO NOT do Y.") because that is
+ * the prompting style the model responds to most reliably.
+ */
+export function formatRecoveryMessage(rule: RecoveryRule): string {
+  return [
+    `[RECOVERY: ${rule.id}]`,
+    `Diagnosis: ${rule.diagnosis}`,
+    `Required next action: ${rule.requiredAction}`,
+    `Do NOT: ${rule.antipattern}`,
+    `Send the user a one-line status before the recovery call so they know what you are doing.`,
+  ].join('\n');
+}