Files
vibn-agent-runner/src/llm.ts
mawkone 1dafd05371 fix: replace gcloud shell-out with google-auth-library for Vertex AI tokens
gcloud is not available inside the Docker container. Use google-auth-library
instead, which reads credentials from the GCP metadata server (works on any
GCP VM) or GOOGLE_APPLICATION_CREDENTIALS env var. Also rebuilds dist/.

Made-with: Cursor
2026-02-27 19:38:07 -08:00

292 lines
10 KiB
TypeScript

import { GoogleAuth } from 'google-auth-library';
import { GoogleGenAI } from '@google/genai';
import { v4 as uuidv4 } from 'uuid';
// =============================================================================
// Unified LLM client — OpenAI-compatible message format throughout
//
// Two backends:
// VertexOpenAIClient — for GLM-5, Claude Sonnet, etc. via Vertex global endpoint
// GeminiFlashClient — for Gemini Flash/Pro via @google/genai SDK
//
// Model tier defaults (overridable via env):
// Tier A: gemini-2.5-flash ($0.15/$0.60 per 1M) — routing, summaries, log parsing
// Tier B: zai-org/glm-5-maas ($1.00/$3.20 per 1M) — coding, feature work
// Tier C: zai-org/glm-5-maas ($1.00/$3.20 per 1M) — complex decisions, escalation
// =============================================================================
// ---------------------------------------------------------------------------
// Shared message types (OpenAI format — used everywhere internally)
// ---------------------------------------------------------------------------
/** One chat turn in OpenAI chat-completions format (used by all backends). */
export interface LLMMessage {
role: 'system' | 'user' | 'assistant' | 'tool';
content: string | null; // null is allowed (e.g. assistant turns that only carry tool_calls)
tool_calls?: LLMToolCall[]; // assistant turns that request tool invocations
tool_call_id?: string; // set on role=tool messages
name?: string; // function name on role=tool messages
}
/** A single function invocation requested by the model (OpenAI format). */
export interface LLMToolCall {
id: string; // echoed back by the caller via tool_call_id on the role=tool reply
type: 'function';
function: {
name: string;
arguments: string; // JSON-encoded string
};
}
/** Tool (function) declaration advertised to the model (OpenAI format). */
export interface LLMTool {
type: 'function';
function: {
name: string;
description: string;
parameters: Record<string, unknown>; // JSON Schema describing the arguments
};
}
/** Normalized completion result returned by every backend. */
export interface LLMResponse {
content: string | null; // assistant text; null for tool-call-only turns
reasoning: string | null; // GLM-5 chain-of-thought
tool_calls: LLMToolCall[]; // empty array when no tools were invoked
finish_reason: string; // e.g. 'stop' or 'tool_calls'
usage?: { // token accounting; optional (not all paths populate it)
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
};
}
/** Common interface implemented by both backends; instances come from createLLM. */
export interface LLMClient {
modelId: string; // concrete model identifier, e.g. 'gemini-2.5-flash'
chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens?: number): Promise<LLMResponse>;
}
// ---------------------------------------------------------------------------
// Vertex AI OpenAI-compatible client
// Used for: zai-org/glm-5-maas, anthropic/claude-sonnet-4-6, etc.
// ---------------------------------------------------------------------------
// Module-level OAuth token cache shared by all VertexOpenAIClient instances.
let _cachedToken = '';
let _tokenExpiry = 0; // epoch ms after which _cachedToken must be refreshed
const _googleAuth = new GoogleAuth({
  scopes: ['https://www.googleapis.com/auth/cloud-platform']
});

/**
 * Return a valid OAuth2 access token for Vertex AI, cached for 55 minutes.
 * Credentials are resolved by google-auth-library (GOOGLE_APPLICATION_CREDENTIALS
 * or the GCP metadata server — see the commit message above).
 * @throws Error when the auth library yields no token (e.g. no credentials
 *   configured), instead of silently producing an invalid "Bearer null" header.
 */
async function getVertexToken(): Promise<string> {
  const now = Date.now();
  if (_cachedToken && now < _tokenExpiry) return _cachedToken;
  const client = await _googleAuth.getClient();
  const tokenResponse = await client.getAccessToken();
  // FIX: previously `tokenResponse.token!` — a null/undefined token would be
  // returned as-is and sent to the API; fail loudly with a diagnosable error.
  if (!tokenResponse.token) {
    throw new Error('google-auth-library returned no access token — check GCP credentials');
  }
  _cachedToken = tokenResponse.token;
  _tokenExpiry = now + 55 * 60 * 1000; // tokens last 1hr, refresh at 55min
  return _cachedToken;
}
/**
 * OpenAI-compatible chat client backed by Vertex AI's
 * /endpoints/openapi/chat/completions route. Used for partner models such as
 * zai-org/glm-5-maas and anthropic/claude-sonnet-4-6.
 */
export class VertexOpenAIClient implements LLMClient {
  modelId: string;
  private projectId: string;
  private region: string;
  private temperature: number;

  constructor(modelId: string, opts?: { projectId?: string; region?: string; temperature?: number }) {
    this.modelId = modelId;
    this.projectId = opts?.projectId ?? process.env.GCP_PROJECT_ID ?? 'master-ai-484822';
    this.region = opts?.region ?? 'global';
    this.temperature = opts?.temperature ?? 0.3;
  }

  /**
   * Send one non-streaming chat completion request. Messages and tools are
   * already OpenAI-format, so they pass through to the endpoint unchanged.
   * @throws Error carrying the HTTP status and a truncated response body.
   */
  async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 4096): Promise<LLMResponse> {
    const bearer = await getVertexToken();

    // The 'global' location uses the bare hostname; regional locations get a
    // region prefix (e.g. us-central1-aiplatform.googleapis.com).
    const host = this.region === 'global'
      ? 'https://aiplatform.googleapis.com'
      : `https://${this.region}-aiplatform.googleapis.com`;
    const endpoint = `${host}/v1/projects/${this.projectId}/locations/${this.region}/endpoints/openapi/chat/completions`;

    const payload: Record<string, unknown> = {
      model: this.modelId,
      messages,
      max_tokens: maxTokens,
      temperature: this.temperature,
      stream: false
    };
    if (tools && tools.length > 0) {
      payload.tools = tools;
      payload.tool_choice = 'auto';
    }

    const res = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${bearer}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(payload)
    });

    if (!res.ok) {
      const detail = await res.text();
      if (res.status === 401) _tokenExpiry = 0; // Force token refresh on 401
      throw new Error(`Vertex API ${res.status}: ${detail.slice(0, 400)}`);
    }

    const data = (await res.json()) as any;
    const firstChoice = data.choices?.[0];
    const msg = firstChoice?.message ?? {};
    return {
      content: msg.content ?? null,
      reasoning: msg.reasoning_content ?? null, // GLM-5 exposes chain-of-thought here
      tool_calls: msg.tool_calls ?? [],
      finish_reason: firstChoice?.finish_reason ?? 'stop',
      usage: data.usage
    };
  }
}
// ---------------------------------------------------------------------------
// Gemini client via @google/genai SDK
// Used for: Tier A (fast/cheap routing, summaries, log parsing)
// Converts to/from OpenAI message format internally.
// ---------------------------------------------------------------------------
/**
 * Gemini client (Tier A: routing, summaries, log parsing) via the
 * @google/genai SDK. Accepts and returns the shared OpenAI-style message
 * format; conversion to Gemini Content[] happens in toGeminiContents.
 */
export class GeminiClient implements LLMClient {
  modelId: string;
  private temperature: number;

  constructor(modelId = 'gemini-2.5-flash', opts?: { temperature?: number }) {
    this.modelId = modelId;
    this.temperature = opts?.temperature ?? 0.2;
  }

  /**
   * Send one chat turn to Gemini.
   * @throws Error if GOOGLE_API_KEY is unset or Gemini returns no candidates.
   */
  async chat(messages: LLMMessage[], tools?: LLMTool[], maxTokens = 8192): Promise<LLMResponse> {
    const apiKey = process.env.GOOGLE_API_KEY;
    if (!apiKey) throw new Error('GOOGLE_API_KEY not set');
    const genai = new GoogleGenAI({ apiKey });

    // Gemini takes the system prompt as a separate systemInstruction field.
    const systemMsg = messages.find(m => m.role === 'system');
    const nonSystem = messages.filter(m => m.role !== 'system');
    const functionDeclarations = (tools ?? []).map(t => ({
      name: t.function.name,
      description: t.function.description,
      parameters: t.function.parameters as any
    }));

    const response = await genai.models.generateContent({
      model: this.modelId,
      contents: toGeminiContents(nonSystem),
      config: {
        systemInstruction: systemMsg?.content ?? undefined,
        tools: functionDeclarations.length > 0 ? [{ functionDeclarations }] : undefined,
        temperature: this.temperature,
        maxOutputTokens: maxTokens
      }
    });

    const candidate = response.candidates?.[0];
    if (!candidate) throw new Error('No response from Gemini');
    const parts = candidate.content?.parts ?? [];
    const textContent = parts.filter(p => p.text).map(p => p.text).join('') || null;

    // Translate Gemini functionCall parts back into OpenAI-style tool calls,
    // minting synthetic ids (Gemini does not assign them).
    const fnCalls = parts.filter(p => p.functionCall);
    const tool_calls: LLMToolCall[] = fnCalls.map(p => ({
      id: `call_${uuidv4().replace(/-/g, '').slice(0, 12)}`,
      type: 'function' as const,
      function: {
        name: p.functionCall!.name ?? '',
        arguments: JSON.stringify(p.functionCall!.args ?? {})
      }
    }));

    // FIX: surface token usage (previously dropped), so Gemini calls get the
    // same cost accounting as the Vertex path. usage stays undefined when the
    // SDK provides no usageMetadata — backward compatible.
    const meta = response.usageMetadata;
    const usage = meta
      ? {
          prompt_tokens: meta.promptTokenCount ?? 0,
          completion_tokens: meta.candidatesTokenCount ?? 0,
          total_tokens: meta.totalTokenCount ?? 0
        }
      : undefined;

    return {
      content: textContent,
      reasoning: null, // no separate reasoning channel on this path
      tool_calls,
      finish_reason: fnCalls.length > 0 ? 'tool_calls' : 'stop',
      usage
    };
  }
}
/**
 * Convert OpenAI message format → Gemini Content[] format.
 * - assistant  → role 'model' with text and/or functionCall parts
 * - tool       → role 'user' carrying a functionResponse part
 * - user/other → role 'user' text part
 * System messages are expected to be filtered out by the caller.
 */
function toGeminiContents(messages: LLMMessage[]): any[] {
  const contents: any[] = [];
  for (const msg of messages) {
    if (msg.role === 'assistant') {
      const parts: any[] = [];
      if (msg.content) parts.push({ text: msg.content });
      for (const tc of msg.tool_calls ?? []) {
        // FIX: arguments is model-generated and may be malformed JSON; fall
        // back to {} instead of throwing mid-conversion (mirrors the safe
        // parse already done for tool message content below).
        let args: unknown = {};
        try { args = JSON.parse(tc.function.arguments || '{}'); } catch { /* keep {} */ }
        parts.push({
          functionCall: {
            name: tc.function.name,
            args
          }
        });
      }
      contents.push({ role: 'model', parts });
    } else if (msg.role === 'tool') {
      // Parse content back — could be JSON or plain text
      let resultValue: unknown = msg.content;
      try { resultValue = JSON.parse(msg.content ?? 'null'); } catch { /* keep as string */ }
      contents.push({
        role: 'user',
        parts: [{
          functionResponse: {
            name: msg.name ?? 'tool',
            response: { result: resultValue }
          }
        }]
      });
    } else {
      contents.push({ role: 'user', parts: [{ text: msg.content ?? '' }] });
    }
  }
  return contents;
}
// ---------------------------------------------------------------------------
// Factory — createLLM(modelId | tier)
// ---------------------------------------------------------------------------
/** Cost/quality tiers — A: cheap routing, B: coding, C: complex/escalation. */
export type ModelTier = 'A' | 'B' | 'C';
// Default model per tier, overridable via TIER_{A,B,C}_MODEL env vars.
// NOTE(review): env vars are read once at module load — changes after import
// have no effect; confirm that is intended.
const TIER_MODELS: Record<ModelTier, string> = {
A: process.env.TIER_A_MODEL ?? 'gemini-2.5-flash',
B: process.env.TIER_B_MODEL ?? 'zai-org/glm-5-maas',
C: process.env.TIER_C_MODEL ?? 'zai-org/glm-5-maas'
};
/**
 * Factory: accepts either a tier letter ('A' | 'B' | 'C') or a concrete model
 * id, resolves it through TIER_MODELS, and picks the matching backend.
 */
export function createLLM(modelOrTier: string | ModelTier, opts?: { temperature?: number }): LLMClient {
  const isTier = modelOrTier === 'A' || modelOrTier === 'B' || modelOrTier === 'C';
  const modelId = isTier ? TIER_MODELS[modelOrTier] : modelOrTier;
  // Gemini model ids go through the @google/genai SDK; everything else goes
  // through the Vertex OpenAI-compatible endpoint.
  return modelId.startsWith('gemini-')
    ? new GeminiClient(modelId, opts)
    : new VertexOpenAIClient(modelId, { temperature: opts?.temperature });
}
// ---------------------------------------------------------------------------
// Helper — convert our ToolDefinition[] → LLMTool[] (OpenAI format)
// ---------------------------------------------------------------------------
/**
 * Wrap internal tool definitions in the OpenAI function-tool envelope
 * expected by both backends.
 */
export function toOAITools(
  tools: Array<{ name: string; description: string; parameters: Record<string, unknown> }>
): LLMTool[] {
  const wrapped: LLMTool[] = [];
  for (const { name, description, parameters } of tools) {
    wrapped.push({
      type: 'function',
      function: { name, description, parameters }
    });
  }
  return wrapped;
}