From 487c13317c1b4ff4469aa2d0d9b71db8df684ed9 Mon Sep 17 00:00:00 2001 From: mawkone Date: Sat, 7 Mar 2026 12:25:51 -0800 Subject: [PATCH] fix: retry 429 with backoff; switch Tier B/C to claude-sonnet-4-6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - VertexOpenAIClient: retry on 429/503 up to 4 times with exponential backoff (2s/4s/8s/16s + jitter), respects Retry-After header - Tier B/C default: zai-org/glm-5-maas → claude-sonnet-4-6 (much higher rate limits, still Vertex MaaS) - /agent/execute: accept continueTask param to run a follow-up within the original task context without starting a fresh session Made-with: Cursor --- src/llm.ts | 67 +++++++++++++++++++++++++++++++++------------------ src/server.ts | 11 +++++++-- 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/src/llm.ts b/src/llm.ts index 9620959..84408f0 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -114,7 +114,6 @@ export class VertexOpenAIClient implements LLMClient { } async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 4096): Promise { - const token = await getVertexToken(); const base = this.region === 'global' ? 'https://aiplatform.googleapis.com' : `https://${this.region}-aiplatform.googleapis.com`; @@ -133,33 +132,55 @@ export class VertexOpenAIClient implements LLMClient { body.tool_choice = 'auto'; } - const res = await fetch(url, { - method: 'POST', - headers: { - 'Authorization': `Bearer ${token}`, - 'Content-Type': 'application/json' - }, - body: JSON.stringify(body) - }); + // Retry with exponential backoff on 429 / 503 (rate limit / overload) + const MAX_RETRIES = 4; + const RETRY_STATUSES = new Set([429, 503]); + + for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { + const token = await getVertexToken(); + const res = await fetch(url, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json' + }, + body: JSON.stringify(body) + }); + + if (res.ok) { + const data = await res.json() as any; + const choice = data.choices?.[0]; + const message = choice?.message ?? {}; + return { + content: message.content ?? null, + reasoning: message.reasoning_content ?? null, + tool_calls: message.tool_calls ?? [], + finish_reason: choice?.finish_reason ?? 'stop', + usage: data.usage + }; + } - if (!res.ok) { const errText = await res.text(); + // Force token refresh on 401 if (res.status === 401) _tokenExpiry = 0; + + if (RETRY_STATUSES.has(res.status) && attempt < MAX_RETRIES) { + // Check for Retry-After header, otherwise use exponential backoff + const retryAfter = res.headers.get('retry-after'); + const waitMs = retryAfter + ? Math.min(parseInt(retryAfter, 10) * 1000, 60_000) + : Math.min(2 ** attempt * 2000 + Math.random() * 500, 30_000); + console.warn(`[llm] Vertex ${res.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1} — retrying in ${Math.round(waitMs / 1000)}s`); + await new Promise(r => setTimeout(r, waitMs)); + continue; + } + throw new Error(`Vertex API ${res.status}: ${errText.slice(0, 400)}`); } - const data = await res.json() as any; - const choice = data.choices?.[0]; - const message = choice?.message ?? {}; - - return { - content: message.content ?? null, - reasoning: message.reasoning_content ?? null, - tool_calls: message.tool_calls ?? [], - finish_reason: choice?.finish_reason ?? 'stop', - usage: data.usage - }; + // TypeScript requires an explicit throw after the loop (unreachable in practice) + throw new Error('Vertex API: exceeded max retries'); } } @@ -273,8 +294,8 @@ export type ModelTier = 'A' | 'B' | 'C'; const TIER_MODELS: Record = { A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash', - B: process.env.TIER_B_MODEL ?? 'zai-org/glm-5-maas', - C: process.env.TIER_C_MODEL ?? 'zai-org/glm-5-maas' + B: process.env.TIER_B_MODEL ?? 'claude-sonnet-4-6', + C: process.env.TIER_C_MODEL ?? 'claude-sonnet-4-6' }; export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient { diff --git a/src/server.ts b/src/server.ts index ffb1f41..2e71c56 100644 --- a/src/server.ts +++ b/src/server.ts @@ -359,13 +359,14 @@ app.post('/webhook/gitea', (req: Request, res: Response) => { const activeSessions = new Map(); app.post('/agent/execute', async (req: Request, res: Response) => { - const { sessionId, projectId, appName, appPath, giteaRepo, task } = req.body as { + const { sessionId, projectId, appName, appPath, giteaRepo, task, continueTask } = req.body as { sessionId?: string; projectId?: string; appName?: string; appPath?: string; giteaRepo?: string; task?: string; + continueTask?: string; // if set, appended as follow-up to the original task }; if (!sessionId || !projectId || !appPath || !task) { @@ -418,8 +419,14 @@ app.post('/agent/execute', async (req: Request, res: Response) => { return; } + // If continuing a previous task, combine into a single prompt so the agent + // understands what was already attempted. + const effectiveTask = continueTask + ? `Original task: ${task}\n\nFollow-up instruction: ${continueTask}` + : task!; + // Run the streaming agent loop (fire and forget) - runSessionAgent(agentConfig, task, ctx, { + runSessionAgent(agentConfig, effectiveTask, ctx, { sessionId, projectId, vibnApiUrl,