fix: retry 429 with backoff; switch Tier B/C to claude-sonnet-4-6
- VertexOpenAIClient: retry on 429/503 up to 4 times with exponential backoff (2s/4s/8s/16s + jitter), respects Retry-After header
- Tier B/C default: zai-org/glm-5-maas → claude-sonnet-4-6 (much higher rate limits, still Vertex MaaS)
- /agent/execute: accept continueTask param to run a follow-up within the original task context without starting a fresh session

Made-with: Cursor
This commit is contained in:
43
src/llm.ts
43
src/llm.ts
@@ -114,7 +114,6 @@ export class VertexOpenAIClient implements LLMClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 4096): Promise<LLMResponse> {
|
async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 4096): Promise<LLMResponse> {
|
||||||
const token = await getVertexToken();
|
|
||||||
const base = this.region === 'global'
|
const base = this.region === 'global'
|
||||||
? 'https://aiplatform.googleapis.com'
|
? 'https://aiplatform.googleapis.com'
|
||||||
: `https://${this.region}-aiplatform.googleapis.com`;
|
: `https://${this.region}-aiplatform.googleapis.com`;
|
||||||
@@ -133,6 +132,12 @@ export class VertexOpenAIClient implements LLMClient {
|
|||||||
body.tool_choice = 'auto';
|
body.tool_choice = 'auto';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Retry with exponential backoff on 429 / 503 (rate limit / overload)
|
||||||
|
const MAX_RETRIES = 4;
|
||||||
|
const RETRY_STATUSES = new Set([429, 503]);
|
||||||
|
|
||||||
|
for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
|
||||||
|
const token = await getVertexToken();
|
||||||
const res = await fetch(url, {
|
const res = await fetch(url, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {
|
headers: {
|
||||||
@@ -142,17 +147,10 @@ export class VertexOpenAIClient implements LLMClient {
|
|||||||
body: JSON.stringify(body)
|
body: JSON.stringify(body)
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!res.ok) {
|
if (res.ok) {
|
||||||
const errText = await res.text();
|
|
||||||
// Force token refresh on 401
|
|
||||||
if (res.status === 401) _tokenExpiry = 0;
|
|
||||||
throw new Error(`Vertex API ${res.status}: ${errText.slice(0, 400)}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = await res.json() as any;
|
const data = await res.json() as any;
|
||||||
const choice = data.choices?.[0];
|
const choice = data.choices?.[0];
|
||||||
const message = choice?.message ?? {};
|
const message = choice?.message ?? {};
|
||||||
|
|
||||||
return {
|
return {
|
||||||
content: message.content ?? null,
|
content: message.content ?? null,
|
||||||
reasoning: message.reasoning_content ?? null,
|
reasoning: message.reasoning_content ?? null,
|
||||||
@@ -161,6 +159,29 @@ export class VertexOpenAIClient implements LLMClient {
|
|||||||
usage: data.usage
|
usage: data.usage
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const errText = await res.text();
|
||||||
|
|
||||||
|
// Force token refresh on 401
|
||||||
|
if (res.status === 401) _tokenExpiry = 0;
|
||||||
|
|
||||||
|
if (RETRY_STATUSES.has(res.status) && attempt < MAX_RETRIES) {
|
||||||
|
// Check for Retry-After header, otherwise use exponential backoff
|
||||||
|
const retryAfter = res.headers.get('retry-after');
|
||||||
|
const waitMs = retryAfter
|
||||||
|
? Math.min(parseInt(retryAfter, 10) * 1000, 60_000)
|
||||||
|
: Math.min(2 ** attempt * 2000 + Math.random() * 500, 30_000);
|
||||||
|
console.warn(`[llm] Vertex ${res.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1} — retrying in ${Math.round(waitMs / 1000)}s`);
|
||||||
|
await new Promise(r => setTimeout(r, waitMs));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error(`Vertex API ${res.status}: ${errText.slice(0, 400)}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TypeScript requires an explicit throw after the loop (unreachable in practice)
|
||||||
|
throw new Error('Vertex API: exceeded max retries');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -273,8 +294,8 @@ export type ModelTier = 'A' | 'B' | 'C';
|
|||||||
|
|
||||||
const TIER_MODELS: Record<ModelTier, string> = {
|
const TIER_MODELS: Record<ModelTier, string> = {
|
||||||
A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash',
|
A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash',
|
||||||
B: process.env.TIER_B_MODEL ?? 'zai-org/glm-5-maas',
|
B: process.env.TIER_B_MODEL ?? 'claude-sonnet-4-6',
|
||||||
C: process.env.TIER_C_MODEL ?? 'zai-org/glm-5-maas'
|
C: process.env.TIER_C_MODEL ?? 'claude-sonnet-4-6'
|
||||||
};
|
};
|
||||||
|
|
||||||
export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient {
|
export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient {
|
||||||
|
|||||||
@@ -359,13 +359,14 @@ app.post('/webhook/gitea', (req: Request, res: Response) => {
|
|||||||
const activeSessions = new Map<string, { stopped: boolean }>();
|
const activeSessions = new Map<string, { stopped: boolean }>();
|
||||||
|
|
||||||
app.post('/agent/execute', async (req: Request, res: Response) => {
|
app.post('/agent/execute', async (req: Request, res: Response) => {
|
||||||
const { sessionId, projectId, appName, appPath, giteaRepo, task } = req.body as {
|
const { sessionId, projectId, appName, appPath, giteaRepo, task, continueTask } = req.body as {
|
||||||
sessionId?: string;
|
sessionId?: string;
|
||||||
projectId?: string;
|
projectId?: string;
|
||||||
appName?: string;
|
appName?: string;
|
||||||
appPath?: string;
|
appPath?: string;
|
||||||
giteaRepo?: string;
|
giteaRepo?: string;
|
||||||
task?: string;
|
task?: string;
|
||||||
|
continueTask?: string; // if set, appended as follow-up to the original task
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!sessionId || !projectId || !appPath || !task) {
|
if (!sessionId || !projectId || !appPath || !task) {
|
||||||
@@ -418,8 +419,14 @@ app.post('/agent/execute', async (req: Request, res: Response) => {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If continuing a previous task, combine into a single prompt so the agent
|
||||||
|
// understands what was already attempted.
|
||||||
|
const effectiveTask = continueTask
|
||||||
|
? `Original task: ${task}\n\nFollow-up instruction: ${continueTask}`
|
||||||
|
: task!;
|
||||||
|
|
||||||
// Run the streaming agent loop (fire and forget)
|
// Run the streaming agent loop (fire and forget)
|
||||||
runSessionAgent(agentConfig, task, ctx, {
|
runSessionAgent(agentConfig, effectiveTask, ctx, {
|
||||||
sessionId,
|
sessionId,
|
||||||
projectId,
|
projectId,
|
||||||
vibnApiUrl,
|
vibnApiUrl,
|
||||||
|
|||||||
Reference in New Issue
Block a user