feat(telemetry): emit per-turn governor summary (stop_reason, rounds, tool_results) for orchestration diagnostics

2026-06-10 17:07:43 -07:00
parent e2499d70af
commit 6d2ab7bd33
3 changed files with 112 additions and 16 deletions
--- a/app/api/chat/route.ts
+++ b/app/api/chat/route.ts
@@ -109,6 +109,7 @@ import { buildDesignKitPromptSection } from "@/lib/design-kits/for-ai";
 import { buildCodebaseSummary } from "@/lib/ai/codebase-summary";
 import { execInDevContainer } from "@/lib/dev-container";
 import type { ChatMessage, ToolCall } from "@/lib/ai/gemini-chat";
+import { logTurnSummary } from "@/lib/ai/telemetry-db";

 // C-01: Raised to 150. Provides a virtually unlimited, elite engineering runway
 // for complex custom application building, while the State-Based
@@ -1456,6 +1457,46 @@ export async function POST(request: Request) {
          });
        }

+        // ---- Orchestration telemetry: one turn_summary per user turn ----
+        // Records WHY the agent loop ended so we can diagnose and tune the
+        // governor (premature stops, loop cut-offs). Fire-and-forget.
+        try {
+          const stopReason = aborted
+            ? "user_aborted"
+            : loopBreakReason
+              ? `loop_detected:${String(loopBreakReason).slice(0, 160)}`
+              : maxToolRounds > 0 && round >= maxToolRounds
+                ? "round_cap"
+                : lastToolResultsHadFailure(messages)
+                  ? "tool_failure"
+                  : roundsSinceText >= 30
+                    ? "silent_rounds"
+                    : assistantToolCalls.length === 0 &&
+                        assistantText.trim().length === 0
+                      ? "empty_no_tools"
+                      : "completed";
+
+          logTurnSummary({
+            projectId: activeProject?.id,
+            sessionId: thread_id,
+            userMessage: message,
+            model: process.env.VIBN_CHAT_MODEL || "gemini-3.1-pro-preview",
+            response: {
+              text: assistantText,
+              thoughts: "",
+              toolCalls: assistantToolCalls,
+            },
+            toolResults: finalMsg._rawToolResults ?? [],
+            stopReason,
+            rounds: round,
+            toolCallCount: assistantToolCalls.length,
+            turnIntent,
+            chatMode,
+          });
+        } catch {
+          // never let telemetry interfere with the turn
+        }
+
        await query(
          `INSERT INTO fs_chat_messages (thread_id, user_id, data) VALUES ($1, $2, $3)`,
          [thread_id, email, JSON.stringify(finalMsg)],
--- a/lib/ai/telemetry-db.ts
+++ b/lib/ai/telemetry-db.ts
@@ -16,31 +16,51 @@ export interface TelemetryPayload {
  };
 }

-// Fire and forget function to send telemetry to our Coolify Microservice
-export function logTrainingTelemetryDb(data: TelemetryPayload) {
+/**
+ * Turn-level governor summary, emitted once per user turn so orchestration
+ * problems (premature stops, loop cut-offs) can be diagnosed.
+ *
+ * `stopReason` is the key field: it records WHY the agent loop ended.
+ * Every field except `recordType` is optional.
+ */
+export interface TurnSummaryPayload {
+  /** Discriminator marking this record as a turn summary. */
+  recordType: "turn_summary";
+  projectId?: string;
+  /** Chat thread id the turn belongs to. */
+  sessionId?: string;
+  /** The user message that triggered the turn. */
+  userMessage?: string;
+  model?: string;
+  /**
+   * Assistant output for the turn. Tool calls are typed as `unknown` because
+   * this module only serializes them and never inspects their shape.
+   */
+  response?: { text: string; thoughts: string; toolCalls: unknown[] };
+  /**
+   * Tool outputs for the turn (serialized as-is).
+   * NOTE(review): the schema describes `tool_results` as redacted, but the
+   * chat route passes `_rawToolResults` — confirm redaction happens somewhere
+   * before storage.
+   */
+  toolResults?: unknown[];
+  /** Why the agent loop ended, e.g. "completed" or "round_cap". */
+  stopReason?: string;
+  /** How many tool-loop rounds the turn ran. */
+  rounds?: number;
+  /** Number of tool calls reported for the turn. */
+  toolCallCount?: number;
+  turnIntent?: string;
+  chatMode?: string;
+  metrics?: { durationMs: number };
+}
+
+/**
+ * Shared fire-and-forget transport for telemetry records.
+ *
+ * The send is deferred with `setTimeout(..., 0)` so the caller never waits on
+ * it, and every failure is caught and logged rather than thrown. When
+ * `TELEMETRY_SERVICE_URL` is unset the record is dropped without logging.
+ *
+ * @param body - Any JSON-serializable record; POSTed to `<base>/ingest`.
+ */
+function postTelemetry(body: unknown) {
  setTimeout(async () => {
    try {
      const telemetryUrl = process.env.TELEMETRY_SERVICE_URL;
+      if (!telemetryUrl) return; // silently skip when unconfigured

-      if (!telemetryUrl) {
-        console.warn(
-          "[Telemetry] TELEMETRY_SERVICE_URL is not set. Skipping log.",
-        );
-        return;
-      }
-
-      await fetch(`${telemetryUrl}/ingest`, {
+      // Strip every trailing slash so a base URL like "https://host/" does
+      // not produce "https://host//ingest".
+      const ingestUrl = `${telemetryUrl.replace(/\/+$/, "")}/ingest`;
+      const response = await fetch(ingestUrl, {
        method: "POST",
-        headers: {
-          "Content-Type": "application/json",
-        },
-        body: JSON.stringify(data),
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify(body),
      });
+      // fetch() only rejects on network errors; an HTTP 4xx/5xx resolves
+      // normally, so report rejected ingests explicitly.
+      if (!response.ok) {
+        console.error(
+          `[Telemetry] Ingest rejected with HTTP ${response.status} ${response.statusText}`,
+        );
+      }
    } catch (error) {
      console.error(
-        "[Telemetry] Failed to send training data to microservice:",
-        error,
+        "[Telemetry] Failed to send data to microservice:",
+        error instanceof Error ? error.message : String(error),
      );
    }
  }, 0);
 }
+
+/**
+ * Queues one training-data record (one row per LLM call) for delivery to the
+ * telemetry service. Fire and forget: returns immediately and never throws
+ * on delivery failure.
+ */
+export function logTrainingTelemetryDb(data: TelemetryPayload): void {
+  postTelemetry(data);
+}
+
+/**
+ * Queues one orchestration-diagnostics record (one row per user turn) for
+ * delivery to the telemetry service. Fire and forget.
+ *
+ * @param data - Turn summary fields; `recordType` is stamped here.
+ */
+export function logTurnSummary(data: Omit<TurnSummaryPayload, "recordType">) {
+  // `Omit` is compile-time only: spread first so a stray runtime `recordType`
+  // on `data` can never override the discriminator.
+  postTelemetry({ ...data, recordType: "turn_summary" });
+}
--- a/lib/db/agent-telemetry-schema.sql
+++ b/lib/db/agent-telemetry-schema.sql
@@ -13,13 +13,25 @@
 CREATE TABLE IF NOT EXISTS agent_telemetry (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+  -- 'llm_call'  = one row per model call (training data)
+  -- 'turn_summary' = one row per user turn (orchestration diagnostics)
+  record_type TEXT NOT NULL DEFAULT 'llm_call',
  project_id VARCHAR(255),
+  session_id TEXT,            -- chat thread id; groups all calls of a conversation
+  user_message TEXT,         -- the user message that triggered the turn
  model_used VARCHAR(255) NOT NULL,
  system_prompt TEXT NOT NULL,
  chat_history JSONB NOT NULL,
  response_text TEXT,
  response_thoughts TEXT,
  tool_calls JSONB,
+  tool_results JSONB,        -- redacted tool outputs (turn_summary)
+  -- Orchestration / governor diagnostics (turn_summary rows)
+  stop_reason TEXT,          -- completed | round_cap | loop_detected:* | tool_failure | silent_rounds | user_aborted | empty_no_tools
+  rounds INTEGER,            -- how many tool-loop rounds the turn ran
+  tool_call_count INTEGER,   -- total tool calls executed in the turn
+  turn_intent TEXT,          -- conversational vs action intent classification
+  chat_mode TEXT,            -- vibe | collaborate | delegate
  prompt_tokens INTEGER,
  completion_tokens INTEGER,
  total_tokens INTEGER,
@@ -30,3 +42,26 @@ CREATE TABLE IF NOT EXISTS agent_telemetry (
 CREATE INDEX IF NOT EXISTS idx_agent_telemetry_project ON agent_telemetry(project_id);
 -- Index for chronological sorting
 CREATE INDEX IF NOT EXISTS idx_agent_telemetry_created_at ON agent_telemetry(created_at DESC);
+-- Upgrade path for existing databases: CREATE TABLE IF NOT EXISTS above is a
+-- no-op when agent_telemetry already exists, so the new columns must be added
+-- explicitly or the CREATE INDEX statements below fail on a missing column.
+-- Idempotent; on a fresh database every clause is skipped.
+ALTER TABLE agent_telemetry
+  ADD COLUMN IF NOT EXISTS record_type TEXT NOT NULL DEFAULT 'llm_call',
+  ADD COLUMN IF NOT EXISTS session_id TEXT,
+  ADD COLUMN IF NOT EXISTS user_message TEXT,
+  ADD COLUMN IF NOT EXISTS tool_results JSONB,
+  ADD COLUMN IF NOT EXISTS stop_reason TEXT,
+  ADD COLUMN IF NOT EXISTS rounds INTEGER,
+  ADD COLUMN IF NOT EXISTS tool_call_count INTEGER,
+  ADD COLUMN IF NOT EXISTS turn_intent TEXT,
+  ADD COLUMN IF NOT EXISTS chat_mode TEXT;
+-- NOTE(review): turn_summary payloads carry no system prompt or chat history,
+-- yet system_prompt and chat_history are NOT NULL above — confirm the ingest
+-- service supplies defaults for those rows, otherwise the inserts will fail.
+
+-- Diagnostic indexes
+CREATE INDEX IF NOT EXISTS idx_agent_telemetry_session ON agent_telemetry(session_id);
+CREATE INDEX IF NOT EXISTS idx_agent_telemetry_stop ON agent_telemetry(stop_reason);
+CREATE INDEX IF NOT EXISTS idx_agent_telemetry_record_type ON agent_telemetry(record_type);
+
+-- =====================================================================
+-- Example diagnostic queries (the whole point of this instrumentation)
+-- =====================================================================
+--
+-- Distribution of how turns end (find premature-stop problems):
+--   SELECT stop_reason, COUNT(*), ROUND(AVG(rounds),1) AS avg_rounds
+--   FROM agent_telemetry WHERE record_type='turn_summary'
+--   GROUP BY stop_reason ORDER BY 2 DESC;
+--
+-- Turns the governor cut off early (rounds < 3 but tools were running):
+--   SELECT created_at, project_id, user_message, stop_reason, rounds, tool_call_count
+--   FROM agent_telemetry
+--   WHERE record_type='turn_summary' AND stop_reason <> 'completed'
+--     AND rounds < 3 AND tool_call_count > 0
+--   ORDER BY created_at DESC;
+--
+-- Replay the full trajectory of one conversation (every row recorded for the thread):
+--   SELECT created_at, record_type, response_text, tool_calls
+--   FROM agent_telemetry WHERE session_id = '<thread_id>' ORDER BY created_at ASC;