fix: retry 429/503 with backoff; switch Tier B/C to claude-sonnet-4-6

- VertexOpenAIClient: retry on 429/503 up to 4 times with exponential
  backoff (2s/4s/8s/16s + jitter, capped at 30s); honor the Retry-After
  header when present (capped at 60s)
- Tier B/C default: zai-org/glm-5-maas → claude-sonnet-4-6 (much higher
  rate limits, still Vertex MaaS)
- /agent/execute: accept an optional continueTask param to run a follow-up
  within the original task context without starting a fresh session; the
  original task and the follow-up are combined into a single prompt

Made-with: Cursor
This commit is contained in:
2026-03-07 12:25:51 -08:00
parent b16a216e0e
commit 487c13317c
2 changed files with 53 additions and 25 deletions

View File

@@ -114,7 +114,6 @@ export class VertexOpenAIClient implements LLMClient {
} }
async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 4096): Promise<LLMResponse> { async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 4096): Promise<LLMResponse> {
const token = await getVertexToken();
const base = this.region === 'global' const base = this.region === 'global'
? 'https://aiplatform.googleapis.com' ? 'https://aiplatform.googleapis.com'
: `https://${this.region}-aiplatform.googleapis.com`; : `https://${this.region}-aiplatform.googleapis.com`;
@@ -133,33 +132,55 @@ export class VertexOpenAIClient implements LLMClient {
body.tool_choice = 'auto'; body.tool_choice = 'auto';
} }
const res = await fetch(url, { // Retry with exponential backoff on 429 / 503 (rate limit / overload)
method: 'POST', const MAX_RETRIES = 4;
headers: { const RETRY_STATUSES = new Set([429, 503]);
'Authorization': `Bearer ${token}`,
'Content-Type': 'application/json' for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
}, const token = await getVertexToken();
body: JSON.stringify(body) const res = await fetch(url, {
}); method: 'POST',
headers: {
'Authorization': `Bearer ${token}`,
'Content-Type': 'application/json'
},
body: JSON.stringify(body)
});
if (res.ok) {
const data = await res.json() as any;
const choice = data.choices?.[0];
const message = choice?.message ?? {};
return {
content: message.content ?? null,
reasoning: message.reasoning_content ?? null,
tool_calls: message.tool_calls ?? [],
finish_reason: choice?.finish_reason ?? 'stop',
usage: data.usage
};
}
if (!res.ok) {
const errText = await res.text(); const errText = await res.text();
// Force token refresh on 401 // Force token refresh on 401
if (res.status === 401) _tokenExpiry = 0; if (res.status === 401) _tokenExpiry = 0;
if (RETRY_STATUSES.has(res.status) && attempt < MAX_RETRIES) {
// Check for Retry-After header, otherwise use exponential backoff
const retryAfter = res.headers.get('retry-after');
const waitMs = retryAfter
? Math.min(parseInt(retryAfter, 10) * 1000, 60_000)
: Math.min(2 ** attempt * 2000 + Math.random() * 500, 30_000);
console.warn(`[llm] Vertex ${res.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1} — retrying in ${Math.round(waitMs / 1000)}s`);
await new Promise(r => setTimeout(r, waitMs));
continue;
}
throw new Error(`Vertex API ${res.status}: ${errText.slice(0, 400)}`); throw new Error(`Vertex API ${res.status}: ${errText.slice(0, 400)}`);
} }
const data = await res.json() as any; // TypeScript requires an explicit throw after the loop (unreachable in practice)
const choice = data.choices?.[0]; throw new Error('Vertex API: exceeded max retries');
const message = choice?.message ?? {};
return {
content: message.content ?? null,
reasoning: message.reasoning_content ?? null,
tool_calls: message.tool_calls ?? [],
finish_reason: choice?.finish_reason ?? 'stop',
usage: data.usage
};
} }
} }
@@ -273,8 +294,8 @@ export type ModelTier = 'A' | 'B' | 'C';
const TIER_MODELS: Record<ModelTier, string> = { const TIER_MODELS: Record<ModelTier, string> = {
A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash', A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash',
B: process.env.TIER_B_MODEL ?? 'zai-org/glm-5-maas', B: process.env.TIER_B_MODEL ?? 'claude-sonnet-4-6',
C: process.env.TIER_C_MODEL ?? 'zai-org/glm-5-maas' C: process.env.TIER_C_MODEL ?? 'claude-sonnet-4-6'
}; };
export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient { export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient {

View File

@@ -359,13 +359,14 @@ app.post('/webhook/gitea', (req: Request, res: Response) => {
const activeSessions = new Map<string, { stopped: boolean }>(); const activeSessions = new Map<string, { stopped: boolean }>();
app.post('/agent/execute', async (req: Request, res: Response) => { app.post('/agent/execute', async (req: Request, res: Response) => {
const { sessionId, projectId, appName, appPath, giteaRepo, task } = req.body as { const { sessionId, projectId, appName, appPath, giteaRepo, task, continueTask } = req.body as {
sessionId?: string; sessionId?: string;
projectId?: string; projectId?: string;
appName?: string; appName?: string;
appPath?: string; appPath?: string;
giteaRepo?: string; giteaRepo?: string;
task?: string; task?: string;
continueTask?: string; // if set, appended as follow-up to the original task
}; };
if (!sessionId || !projectId || !appPath || !task) { if (!sessionId || !projectId || !appPath || !task) {
@@ -418,8 +419,14 @@ app.post('/agent/execute', async (req: Request, res: Response) => {
return; return;
} }
// If continuing a previous task, combine into a single prompt so the agent
// understands what was already attempted.
const effectiveTask = continueTask
? `Original task: ${task}\n\nFollow-up instruction: ${continueTask}`
: task!;
// Run the streaming agent loop (fire and forget) // Run the streaming agent loop (fire and forget)
runSessionAgent(agentConfig, task, ctx, { runSessionAgent(agentConfig, effectiveTask, ctx, {
sessionId, sessionId,
projectId, projectId,
vibnApiUrl, vibnApiUrl,