From 487c13317c1b4ff4469aa2d0d9b71db8df684ed9 Mon Sep 17 00:00:00 2001
From: mawkone <mark@getacquired.com>
Date: Sat, 7 Mar 2026 12:25:51 -0800
Subject: [PATCH] fix: retry 429/503 with backoff; switch Tier B/C to
 claude-sonnet-4-6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- VertexOpenAIClient: retry on 429/503 up to 4 times with exponential
  backoff (2s/4s/8s/16s + jitter); honors a numeric Retry-After header
  (seconds, capped at 60s) when present
- Tier B/C default: zai-org/glm-5-maas → claude-sonnet-4-6 (much higher
  rate limits, still Vertex MaaS)
- /agent/execute: accept optional continueTask param to run a follow-up
  within the original task context without starting a fresh session; the
  original task and the follow-up are combined into a single prompt

Made-with: Cursor
---
 src/llm.ts    | 67 +++++++++++++++++++++++++++++++++------------------
 src/server.ts | 11 +++++++--
 2 files changed, 53 insertions(+), 25 deletions(-)
diff --git a/src/llm.ts b/src/llm.ts
index 9620959..84408f0 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -114,7 +114,6 @@ export class VertexOpenAIClient implements LLMClient {
     }
 
     async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 4096): Promise<LLMResponse> {
-        const token = await getVertexToken();
         const base = this.region === 'global'
             ? 'https://aiplatform.googleapis.com'
             : `https://${this.region}-aiplatform.googleapis.com`;
@@ -133,33 +132,55 @@ export class VertexOpenAIClient implements LLMClient {
             body.tool_choice = 'auto';
         }
 
-        const res = await fetch(url, {
-            method: 'POST',
-            headers: {
-                'Authorization': `Bearer ${token}`,
-                'Content-Type': 'application/json'
-            },
-            body: JSON.stringify(body)
-        });
+        // Retry with exponential backoff on 429 / 503 (rate limit / overload)
+        const MAX_RETRIES = 4;
+        const RETRY_STATUSES = new Set([429, 503]);
+
+        for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
+            const token = await getVertexToken();
+            const res = await fetch(url, {
+                method: 'POST',
+                headers: {
+                    'Authorization': `Bearer ${token}`,
+                    'Content-Type': 'application/json'
+                },
+                body: JSON.stringify(body)
+            });
+
+            if (res.ok) {
+                const data = await res.json() as any;
+                const choice = data.choices?.[0];
+                const message = choice?.message ?? {};
+                return {
+                    content: message.content ?? null,
+                    reasoning: message.reasoning_content ?? null,
+                    tool_calls: message.tool_calls ?? [],
+                    finish_reason: choice?.finish_reason ?? 'stop',
+                    usage: data.usage
+                };
+            }
 
-        if (!res.ok) {
             const errText = await res.text();
+
             // Force token refresh on 401
             if (res.status === 401) _tokenExpiry = 0;
+
+            if (RETRY_STATUSES.has(res.status) && attempt < MAX_RETRIES) {
+                // Check for Retry-After header, otherwise use exponential backoff
+                const retryAfter = res.headers.get('retry-after');
+                const waitMs = retryAfter
+                    ? Math.min(parseInt(retryAfter, 10) * 1000, 60_000)
+                    : Math.min(2 ** attempt * 2000 + Math.random() * 500, 30_000);
+                console.warn(`[llm] Vertex ${res.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1} — retrying in ${Math.round(waitMs / 1000)}s`);
+                await new Promise(r => setTimeout(r, waitMs));
+                continue;
+            }
+
             throw new Error(`Vertex API ${res.status}: ${errText.slice(0, 400)}`);
         }
 
-        const data = await res.json() as any;
-        const choice = data.choices?.[0];
-        const message = choice?.message ?? {};
-
-        return {
-            content: message.content ?? null,
-            reasoning: message.reasoning_content ?? null,
-            tool_calls: message.tool_calls ?? [],
-            finish_reason: choice?.finish_reason ?? 'stop',
-            usage: data.usage
-        };
+        // TypeScript requires an explicit throw after the loop (unreachable in practice)
+        throw new Error('Vertex API: exceeded max retries');
     }
 }
 
@@ -273,8 +294,8 @@ export type ModelTier = 'A' | 'B' | 'C';
 
 const TIER_MODELS: Record<ModelTier, string> = {
     A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash',
-    B: process.env.TIER_B_MODEL ?? 'zai-org/glm-5-maas',
-    C: process.env.TIER_C_MODEL ?? 'zai-org/glm-5-maas'
+    B: process.env.TIER_B_MODEL ?? 'claude-sonnet-4-6',
+    C: process.env.TIER_C_MODEL ?? 'claude-sonnet-4-6'
 };
 
 export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient {
diff --git a/src/server.ts b/src/server.ts
index ffb1f41..2e71c56 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -359,13 +359,14 @@ app.post('/webhook/gitea', (req: Request, res: Response) => {
 const activeSessions = new Map<string, { stopped: boolean }>();
 
 app.post('/agent/execute', async (req: Request, res: Response) => {
-    const { sessionId, projectId, appName, appPath, giteaRepo, task } = req.body as {
+    const { sessionId, projectId, appName, appPath, giteaRepo, task, continueTask } = req.body as {
         sessionId?: string;
         projectId?: string;
         appName?: string;
         appPath?: string;
         giteaRepo?: string;
         task?: string;
+        continueTask?: string; // if set, appended as follow-up to the original task
     };
 
     if (!sessionId || !projectId || !appPath || !task) {
@@ -418,8 +419,14 @@ app.post('/agent/execute', async (req: Request, res: Response) => {
         return;
     }
 
+    // If continuing a previous task, combine into a single prompt so the agent
+    // understands what was already attempted.
+    const effectiveTask = continueTask
+        ? `Original task: ${task}\n\nFollow-up instruction: ${continueTask}`
+        : task!;
+
     // Run the streaming agent loop (fire and forget)
-    runSessionAgent(agentConfig, task, ctx, {
+    runSessionAgent(agentConfig, effectiveTask, ctx, {
         sessionId,
         projectId,
         vibnApiUrl,