feat: add AnthropicVertexClient for claude-* models

- @anthropic-ai/vertex-sdk: proper Anthropic Messages API on Vertex
- AnthropicVertexClient: converts OAI message format ↔ Anthropic format,
  handles tool_use blocks, retries 429/503 with backoff
- createLLM: routes anthropic/* and claude-* models through new client
- Tier B/C default: claude-sonnet-4-6 via us-east5 Vertex endpoint
- /generate endpoint: accepts region param for regional endpoint testing

Made-with: Cursor
This commit is contained in:
2026-03-07 12:54:39 -08:00
parent 9c8a08a686
commit 551fdb9e54
4 changed files with 366 additions and 13 deletions

View File

@@ -1,20 +1,20 @@
import { GoogleAuth } from 'google-auth-library';
import { GoogleGenAI } from '@google/genai';
import AnthropicVertex from '@anthropic-ai/vertex-sdk';
import { v4 as uuidv4 } from 'uuid';
// =============================================================================
// Unified LLM client — OpenAI-compatible message format throughout
//
// Two backends:
// VertexOpenAIClient — for GLM-5 and other Vertex MaaS models (openai-compat endpoint)
// GeminiClient — for Gemini Flash/Pro via @google/genai SDK (API key)
// Three backends:
// VertexOpenAIClient — GLM-5 and other Vertex MaaS models (openai-compat endpoint)
// GeminiClient — Gemini Flash/Pro via @google/genai SDK (API key)
// AnthropicVertexClient — Claude models via Anthropic Messages API on Vertex (us-east5)
//
// Model tier defaults (overridable via TIER_A/B/C_MODEL env vars):
// Tier A: gemini-2.5-flash — routing, summaries, log parsing (API key, high quota)
// Tier B: zai-org/glm-5-maas — coding, feature work (Vertex MaaS, retry on 429)
// Tier C: zai-org/glm-5-maas — complex decisions, escalation
//
// Claude models are NOT available in this GCP project — do not use anthropic/* IDs.
// Tier A: gemini-2.5-flash — routing, summaries (API key, high quota)
// Tier B: claude-sonnet-4-6 — coding, feature work (Anthropic Vertex, us-east5)
// Tier C: claude-sonnet-4-6 — complex decisions
// =============================================================================
// ---------------------------------------------------------------------------
@@ -288,6 +288,128 @@ function toGeminiContents(messages: LLMMessage[]): any[] {
return contents;
}
// ---------------------------------------------------------------------------
// Anthropic Vertex client
// Used for: claude-* models via Vertex AI (proper Anthropic Messages API)
// Handles tool_calls by converting to/from Anthropic's tool_use blocks.
// ---------------------------------------------------------------------------
export class AnthropicVertexClient implements LLMClient {
  modelId: string;
  private projectId: string;
  private region: string;

  constructor(modelId: string, opts?: { projectId?: string; region?: string }) {
    // Strip the "anthropic/" prefix if present — the SDK uses bare model names.
    const prefix = 'anthropic/';
    this.modelId = modelId.startsWith(prefix) ? modelId.slice(prefix.length) : modelId;
    this.projectId = opts?.projectId ?? process.env.GCP_PROJECT_ID ?? 'master-ai-484822';
    this.region = opts?.region ?? process.env.CLAUDE_REGION ?? 'us-east5';
  }

  /**
   * Build an AnthropicVertex SDK client.
   * Prefers an explicit service-account key from GCP_SA_KEY_BASE64; falls back
   * to application-default credentials (metadata server) when the key is
   * absent or fails to decode/parse.
   */
  private buildClient(): AnthropicVertex {
    const b64Key = process.env.GCP_SA_KEY_BASE64;
    if (b64Key) {
      try {
        const credentials = JSON.parse(Buffer.from(b64Key, 'base64').toString('utf8'));
        return new AnthropicVertex({
          projectId: this.projectId,
          region: this.region,
          // SDK's googleAuth option type lags behind google-auth-library — cast needed.
          googleAuth: new GoogleAuth({
            credentials,
            scopes: ['https://www.googleapis.com/auth/cloud-platform'],
          }) as any,
        });
      } catch {
        console.warn('[llm] AnthropicVertex: SA key decode failed, falling back to metadata server');
      }
    }
    return new AnthropicVertex({ projectId: this.projectId, region: this.region });
  }

  /**
   * Parse a tool-call arguments string, tolerating malformed JSON.
   * Model-emitted arguments are untrusted — a bare JSON.parse here would throw
   * outside the retry loop and abort the whole chat() call.
   */
  private static parseToolArgs(raw: string | undefined): unknown {
    if (!raw) return {};
    try {
      return JSON.parse(raw);
    } catch {
      return {};
    }
  }

  /**
   * Convert OpenAI-format (non-system) messages → Anthropic Messages format.
   * - assistant tool_calls become tool_use content blocks
   * - tool results become tool_result blocks inside a user turn; consecutive
   *   tool results (parallel tool calls) are merged into ONE user message so
   *   the API never sees back-to-back user turns
   */
  private static toAnthropicMessages(nonSystem: LLMMessage[]): any[] {
    const out: any[] = [];
    for (const m of nonSystem) {
      if (m.role === 'assistant') {
        const parts: any[] = [];
        if (m.content) parts.push({ type: 'text', text: m.content });
        for (const tc of m.tool_calls ?? []) {
          parts.push({
            type: 'tool_use',
            id: tc.id,
            name: tc.function.name,
            input: AnthropicVertexClient.parseToolArgs(tc.function.arguments),
          });
        }
        // Plain-text-only assistant turns stay a string for readability.
        out.push({
          role: 'assistant',
          content: parts.length === 1 && parts[0].type === 'text' ? parts[0].text : parts,
        });
        continue;
      }
      if (m.role === 'tool') {
        const block = { type: 'tool_result', tool_use_id: m.tool_call_id, content: m.content ?? '' };
        const prev = out[out.length - 1];
        if (prev?.role === 'user' && Array.isArray(prev.content) && prev.content[0]?.type === 'tool_result') {
          // Parallel tool calls: append to the existing tool-result user turn.
          prev.content.push(block);
        } else {
          out.push({ role: 'user', content: [block] });
        }
        continue;
      }
      out.push({ role: 'user', content: m.content ?? '' });
    }
    return out;
  }

  /**
   * Send a chat request via the Anthropic Messages API on Vertex.
   *
   * @param messages  OpenAI-format conversation (system/user/assistant/tool roles)
   * @param tools     optional OpenAI-format tool declarations
   * @param maxTokens response token cap (default 8192)
   * @returns OpenAI-shaped LLMResponse (content, tool_calls, usage)
   * @throws Error on non-retryable API failure, or after exhausting retries
   *         on 429/503.
   */
  async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 8192): Promise<LLMResponse> {
    const client = this.buildClient();
    // Anthropic takes the system prompt as a top-level field, not a message.
    const system = messages.find(m => m.role === 'system')?.content ?? undefined;
    const anthropicMessages = AnthropicVertexClient.toAnthropicMessages(
      messages.filter(m => m.role !== 'system'),
    );
    const anthropicTools = (tools ?? []).map(t => ({
      name: t.function.name,
      description: t.function.description,
      input_schema: t.function.parameters,
    }));

    const MAX_RETRIES = 4;
    const RETRY_STATUSES = new Set([429, 503]);
    for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
      try {
        // Cast: the SDK's overloaded create() signature fights the dynamic
        // payload shape here; the request body matches the Messages API.
        const response = await (client.messages.create as Function)({
          model: this.modelId,
          max_tokens: maxTokens,
          system: system ?? undefined,
          messages: anthropicMessages,
          tools: anthropicTools.length > 0 ? anthropicTools : undefined,
        });
        // Convert Anthropic content blocks back to the OpenAI shape.
        const textContent = response.content
          .filter((b: any) => b.type === 'text')
          .map((b: any) => b.text)
          .join('') || null;
        const tool_calls: LLMToolCall[] = response.content
          .filter((b: any) => b.type === 'tool_use')
          .map((b: any) => ({
            id: b.id,
            type: 'function' as const,
            function: { name: b.name, arguments: JSON.stringify(b.input ?? {}) },
          }));
        return {
          content: textContent,
          reasoning: null,
          tool_calls,
          finish_reason: response.stop_reason === 'tool_use' ? 'tool_calls' : 'stop',
          usage: response.usage
            ? {
                prompt_tokens: response.usage.input_tokens,
                completion_tokens: response.usage.output_tokens,
                total_tokens: response.usage.input_tokens + response.usage.output_tokens,
              }
            : undefined,
        };
      } catch (err: any) {
        const status = err?.status ?? err?.statusCode ?? 0;
        if (RETRY_STATUSES.has(status) && attempt < MAX_RETRIES) {
          // Exponential backoff with jitter: ~2s, 4s, 8s, 16s — capped at 30s.
          const waitMs = Math.min(2 ** attempt * 2000 + Math.random() * 500, 30_000);
          console.warn(`[llm] Anthropic Vertex ${status} on attempt ${attempt + 1}/${MAX_RETRIES + 1} — retrying in ${Math.round(waitMs / 1000)}s`);
          await new Promise(r => setTimeout(r, waitMs));
          continue;
        }
        throw new Error(`Anthropic Vertex error: ${err?.message ?? String(err)}`);
      }
    }
    throw new Error('Anthropic Vertex: exceeded max retries');
  }
}
// ---------------------------------------------------------------------------
// Factory — createLLM(modelId | tier)
// ---------------------------------------------------------------------------
@@ -296,8 +418,8 @@ export type ModelTier = 'A' | 'B' | 'C';
const TIER_MODELS: Record<ModelTier, string> = {
A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash',
B: process.env.TIER_B_MODEL ?? 'zai-org/glm-5-maas',
C: process.env.TIER_C_MODEL ?? 'zai-org/glm-5-maas'
B: process.env.TIER_B_MODEL ?? 'claude-sonnet-4-6',
C: process.env.TIER_C_MODEL ?? 'claude-sonnet-4-6'
};
export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient {
@@ -309,6 +431,10 @@ export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?
return new GeminiClient(modelId, opts);
}
if (modelId.startsWith('anthropic/') || modelId.startsWith('claude-')) {
return new AnthropicVertexClient(modelId);
}
return new VertexOpenAIClient(modelId, { temperature: opts?.temperature });
}

View File

@@ -550,18 +550,24 @@ app.post('/agent/approve', async (req: Request, res: Response) => {
// ---------------------------------------------------------------------------
// POST /generate — one-shot prompt → completion.
// Body: { prompt: string, model?: string | tier, region?: string }.
// `region` temporarily overrides CLAUDE_REGION so regional Vertex endpoints
// can be tested per-request.
// NOTE(review): the override mutates shared process.env, so concurrent
// requests with different regions can race — acceptable for manual testing,
// not for production traffic; prefer threading region through createLLM.
app.post('/generate', async (req: Request, res: Response) => {
  const { prompt, model, region } = req.body as { prompt?: string; model?: string; region?: string };
  if (!prompt) { res.status(400).json({ error: '"prompt" is required' }); return; }
  const prevRegion = process.env.CLAUDE_REGION;
  if (region) process.env.CLAUDE_REGION = region;
  try {
    const llm = createLLM(model ?? 'A', { temperature: 0.3 });
    const messages: import('./llm').LLMMessage[] = [
      { role: 'user', content: prompt }
    ];
    const response = await llm.chat(messages, [], 8192);
    res.json({ reply: response.content ?? '', model: llm.modelId });
  } catch (err) {
    res.status(500).json({ error: err instanceof Error ? err.message : String(err), model });
  } finally {
    // Restore the previous state exactly. Assigning '' when the variable was
    // unset is a bug: '' is non-nullish, so later `CLAUDE_REGION ?? 'us-east5'`
    // would resolve to '' instead of the default — delete the key instead.
    if (region) {
      if (prevRegion === undefined) delete process.env.CLAUDE_REGION;
      else process.env.CLAUDE_REGION = prevRegion;
    }
  }
});