Implement accumulate-then-act streaming for thinking models
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
import { NextResponse } from "next/server";
|
||||
import { requireWorkspacePrincipal } from "@/lib/auth/workspace-auth";
|
||||
import { query, queryOne } from "@/lib/db-postgres";
|
||||
import { callVibnChat } from "@/lib/ai/vibn-chat-model";
|
||||
import { callVibnChat, streamVibnChat } from "@/lib/ai/vibn-chat-model";
|
||||
import {
|
||||
VIBN_TOOL_DEFINITIONS,
|
||||
executeMcpTool,
|
||||
@@ -1165,8 +1165,8 @@ export async function POST(request: Request) {
|
||||
extraSystem += `\n\n[WARNING] You only have ${maxToolRounds - round} tool calls left before you are forcefully terminated. Stop exploring, make your final edits, and write your final response to the user NOW.`;
|
||||
}
|
||||
|
||||
// Execute tool calls and add results. OpenAI-compatible APIs
|
||||
const resp = await callVibnChat({
|
||||
// Execute tool calls and add results using an accumulating stream.
|
||||
const stream = streamVibnChat({
|
||||
systemPrompt: systemPrompt + extraSystem,
|
||||
messages,
|
||||
tools: toolDefs,
|
||||
@@ -1175,14 +1175,48 @@ export async function POST(request: Request) {
|
||||
signal: clientSignal,
|
||||
});
|
||||
|
||||
const resp = {
|
||||
text: "",
|
||||
thoughts: "",
|
||||
toolCalls: [] as any[],
|
||||
error: undefined as string | undefined,
|
||||
};
|
||||
|
||||
for await (const chunk of stream) {
|
||||
if (aborted) break;
|
||||
|
||||
if (chunk.type === "thinking_delta" && chunk.text) {
|
||||
resp.thoughts += chunk.text;
|
||||
emit({ type: "thinking_delta", text: chunk.text });
|
||||
} else if (chunk.type === "text_delta" && chunk.text) {
|
||||
resp.text += chunk.text;
|
||||
emit({ type: "text_delta", text: chunk.text });
|
||||
} else if (chunk.type === "tool_calls" && chunk.toolCalls) {
|
||||
resp.toolCalls = chunk.toolCalls;
|
||||
} else if (chunk.type === "error" && chunk.error) {
|
||||
resp.error = chunk.error;
|
||||
}
|
||||
}
|
||||
|
||||
// If the model produced any thoughts or text, record them in the timeline once the stream is complete.
|
||||
// (The UI handles the delta-rendering live, but we save the complete chunk to Postgres).
|
||||
if (resp.thoughts) {
|
||||
assistantTimeline.push({ kind: "thought", text: resp.thoughts });
|
||||
}
|
||||
if (resp.text) {
|
||||
assistantText += (assistantText ? "\n\n" : "") + resp.text;
|
||||
assistantTextSegments.push(resp.text);
|
||||
assistantTimeline.push({ kind: "text", text: resp.text });
|
||||
roundsSinceText = 0;
|
||||
toolCallsSinceText = 0;
|
||||
} else if (resp.toolCalls.length) {
|
||||
roundsSinceText++;
|
||||
toolCallsSinceText += resp.toolCalls.length;
|
||||
}
|
||||
|
||||
// When the model first reaches for a mutation, advance the phase so
|
||||
// the UI reflects "Executing Code Edits". We deliberately do NOT force
|
||||
// a separate planning round or discard the edit (the old "C-08
|
||||
// checkpoint" dance) — that made the model plan, stall on an empty
|
||||
// turn, and never execute, and it seeded scope-creep via the forced
|
||||
// "verification plan". The agent edits directly; the post-loop
|
||||
// verification layer checks the result and drives any fixes.
|
||||
const requestedMutations = resp.toolCalls.filter((tc) =>
|
||||
// the UI reflects "Executing Code Edits".
|
||||
const requestedMutations = resp.toolCalls.filter((tc: any) =>
|
||||
[
|
||||
"fs_write",
|
||||
"fs_edit",
|
||||
@@ -1197,10 +1231,7 @@ export async function POST(request: Request) {
|
||||
emit({ type: "phase", phase, label: "Executing Code Edits" });
|
||||
}
|
||||
|
||||
// A Stop click aborts the in-flight generation, which surfaces here
|
||||
// as resp.error === "aborted". Treat it as a clean user stop (break to
|
||||
// the post-loop abort handling that persists the partial reply),
|
||||
// NOT as a fatal error shown to the user.
|
||||
// A Stop click aborts the in-flight generation; treat it as a user stop and break out of the loop.
|
||||
if (resp.error === "aborted" || aborted) {
|
||||
aborted = true;
|
||||
break;
|
||||
@@ -1212,28 +1243,6 @@ export async function POST(request: Request) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Stream the model's reasoning narration as a separate SSE
|
||||
// event type. We pay for thinking tokens whether or not we
|
||||
// ask for them, so making them visible is free transparency
|
||||
// — and it cures the "tool tray with no narrative" feel.
|
||||
if (resp.thoughts) {
|
||||
assistantTimeline.push({ kind: "thought", text: resp.thoughts });
|
||||
emit({ type: "thinking", text: resp.thoughts });
|
||||
}
|
||||
|
||||
// Stream user-facing text to client.
|
||||
if (resp.text) {
|
||||
assistantText += (assistantText ? "\n\n" : "") + resp.text;
|
||||
assistantTextSegments.push(resp.text);
|
||||
assistantTimeline.push({ kind: "text", text: resp.text });
|
||||
emit({ type: "text", text: resp.text });
|
||||
roundsSinceText = 0;
|
||||
toolCallsSinceText = 0;
|
||||
} else if (resp.toolCalls.length) {
|
||||
roundsSinceText++;
|
||||
toolCallsSinceText += resp.toolCalls.length;
|
||||
}
|
||||
|
||||
// Announce tool calls
|
||||
for (const tc of resp.toolCalls) {
|
||||
assistantToolCalls.push(tc);
|
||||
|
||||
Reference in New Issue
Block a user