Implement accumulate-then-act streaming for thinking models

2026-06-15 12:13:46 -07:00
parent a0e2364481
commit 86da778721
5 changed files with 257 additions and 44 deletions
--- a/vibn-frontend/app/api/chat/route.ts
+++ b/vibn-frontend/app/api/chat/route.ts
@@ -17,7 +17,7 @@
 import { NextResponse } from "next/server";
 import { requireWorkspacePrincipal } from "@/lib/auth/workspace-auth";
 import { query, queryOne } from "@/lib/db-postgres";
-import { callVibnChat } from "@/lib/ai/vibn-chat-model";
+import { callVibnChat, streamVibnChat } from "@/lib/ai/vibn-chat-model";
 import {
  VIBN_TOOL_DEFINITIONS,
  executeMcpTool,
@@ -1165,8 +1165,8 @@ export async function POST(request: Request) {
            extraSystem += `\n\n[WARNING] You only have ${maxToolRounds - round} tool calls left before you are forcefully terminated. Stop exploring, make your final edits, and write your final response to the user NOW.`;
          }

-          // Execute tool calls and add results. OpenAI-compatible APIs
-          const resp = await callVibnChat({
+          // Call the model as a stream; deltas are accumulated into `resp` below.
+          const stream = streamVibnChat({
            systemPrompt: systemPrompt + extraSystem,
            messages,
            tools: toolDefs,
@@ -1175,14 +1175,48 @@ export async function POST(request: Request) {
            signal: clientSignal,
          });

+          const resp = {
+            text: "",
+            thoughts: "",
+            toolCalls: [] as any[],
+            error: undefined as string | undefined,
+          };
+
+          for await (const chunk of stream) {
+            if (aborted) break;
+            
+            if (chunk.type === "thinking_delta" && chunk.text) {
+              resp.thoughts += chunk.text;
+              emit({ type: "thinking_delta", text: chunk.text });
+            } else if (chunk.type === "text_delta" && chunk.text) {
+              resp.text += chunk.text;
+              emit({ type: "text_delta", text: chunk.text });
+            } else if (chunk.type === "tool_calls" && chunk.toolCalls) {
+              resp.toolCalls = chunk.toolCalls;
+            } else if (chunk.type === "error" && chunk.error) {
+              resp.error = chunk.error;
+            }
+          }
+
+          // If the model produced any thoughts or text, record them in the timeline once the stream is complete.
+          // (The UI renders the deltas live; the complete text is what gets saved to Postgres.)
+          if (resp.thoughts) {
+            assistantTimeline.push({ kind: "thought", text: resp.thoughts });
+          }
+          if (resp.text) {
+            assistantText += (assistantText ? "\n\n" : "") + resp.text;
+            assistantTextSegments.push(resp.text);
+            assistantTimeline.push({ kind: "text", text: resp.text });
+            roundsSinceText = 0;
+            toolCallsSinceText = 0;
+          } else if (resp.toolCalls.length) {
+            roundsSinceText++;
+            toolCallsSinceText += resp.toolCalls.length;
+          }
+
          // When the model first reaches for a mutation, advance the phase so
-          // the UI reflects "Executing Code Edits". We deliberately do NOT force
-          // a separate planning round or discard the edit (the old "C-08
-          // checkpoint" dance) — that made the model plan, stall on an empty
-          // turn, and never execute, and it seeded scope-creep via the forced
-          // "verification plan". The agent edits directly; the post-loop
-          // verification layer checks the result and drives any fixes.
-          const requestedMutations = resp.toolCalls.filter((tc) =>
+          // the UI reflects "Executing Code Edits".
+          const requestedMutations = resp.toolCalls.filter((tc: any) =>
            [
              "fs_write",
              "fs_edit",
@@ -1197,10 +1231,7 @@ export async function POST(request: Request) {
            emit({ type: "phase", phase, label: "Executing Code Edits" });
          }

-          // A Stop click aborts the in-flight generation, which surfaces here
-          // as resp.error === "aborted". Treat it as a clean user stop (break to
-          // the post-loop abort handling that persists the partial reply),
-          // NOT as a fatal error shown to the user.
+          // A Stop click aborts the in-flight generation; break out of the loop as a user stop.
          if (resp.error === "aborted" || aborted) {
            aborted = true;
            break;
@@ -1212,28 +1243,6 @@ export async function POST(request: Request) {
            return;
          }

-          // Stream the model's reasoning narration as a separate SSE
-          // event type. We pay for thinking tokens whether or not we
-          // ask for them, so making them visible is free transparency
-          // — and it cures the "tool tray with no narrative" feel.
-          if (resp.thoughts) {
-            assistantTimeline.push({ kind: "thought", text: resp.thoughts });
-            emit({ type: "thinking", text: resp.thoughts });
-          }
-
-          // Stream user-facing text to client.
-          if (resp.text) {
-            assistantText += (assistantText ? "\n\n" : "") + resp.text;
-            assistantTextSegments.push(resp.text);
-            assistantTimeline.push({ kind: "text", text: resp.text });
-            emit({ type: "text", text: resp.text });
-            roundsSinceText = 0;
-            toolCallsSinceText = 0;
-          } else if (resp.toolCalls.length) {
-            roundsSinceText++;
-            toolCallsSinceText += resp.toolCalls.length;
-          }
-
          // Announce tool calls
          for (const tc of resp.toolCalls) {
            assistantToolCalls.push(tc);