feat(chat): expose Gemini's reasoning narration as a thinking pill

Today the chat shows ✓-icon tool trays with no narration between calls — the user has no idea WHY the AI just called fs_edit or ship. Meanwhile Gemini is producing 500-1000 chars of first-person reasoning per round ("Updating the Express Server: A Quick Production Deployment / Right, so we have a basic Express server here, nothing fancy. I need to get a new version live...") and billing us for those tokens — we just weren't asking for them.

Three layers:

1. lib/ai/gemini-chat.ts
   - generationConfig.thinkingConfig.includeThoughts = true (default true, opt-out via includeThoughts: false). We're already paying for thinking tokens regardless of this flag — it just controls whether the model returns the human-readable summary or only the compressed signature.
   - callGeminiChat now returns { text, thoughts, toolCalls, finishReason } and the parser splits parts by `part.thought`. CRITICAL bug avoided: previously `if (part.text) text += ...` would have lumped thoughts into the chat bubble verbatim.
   - streamGeminiChat yields `{ type: 'thinking' }` for thought parts.

2. app/api/chat/route.ts
   - New SSE event: `data: {"type":"thinking","text":"..."}`
   - Emitted on every round alongside text + tool_start.
   - Recovery-summary branch also emits thoughts so even when the model produces no user-facing prose, the user sees the model's reasoning instead of dead silence.

3. components/vibn-chat/chat-panel.tsx
   - Message gains optional `thoughts` field (in-memory only — we do NOT persist thoughts to fs_chat_messages; they're ephemeral and cheap to drop).
   - New ThinkingBubble component: dashed-border italic pill above the assistant bubble, collapsed by default to show a one-line preview, click to expand for the full chain. Strips Gemini's "**Section Heading**" prefixes from the preview.
   - SSE handler accumulates thinking chunks onto the in-flight assistant message.

UX impact: instead of staring at fs.read ✓ fs.edit ✓ ship ✓ icons, the user sees "Examining the target server file..." → "Shipping the twenty-crm project..." in real time. Costs zero additional tokens (we already paid for the thoughts).

Cleanup: removed scripts/probe-gemini-raw.ts and scripts/probe-recovery-summary.ts — diagnostic scripts that identified this opportunity, no longer needed in-tree.

Made-with: Cursor
2026-04-28 15:24:49 -07:00
parent 4f84a19e75
commit 4184baca77
3 changed files with 176 additions and 22 deletions
--- a/app/api/chat/route.ts
+++ b/app/api/chat/route.ts
@@ -7,8 +7,10 @@
 *
 * SSE event shapes:
 *   data: {"type":"text","text":"..."}
+ *   data: {"type":"thinking","text":"..."}    // model's first-person reasoning
 *   data: {"type":"tool_start","name":"...","args":{}}
 *   data: {"type":"tool_result","name":"...","result":"..."}
+ *   data: {"type":"aborted"}
 *   data: {"type":"done"}
 *   data: {"type":"error","error":"..."}
 */
@@ -54,7 +56,7 @@ async function ensureChatTables() {
  chatTablesReady = true;
 }

-function buildSystemPrompt(projects: any[], workspace: string): string {
+export function buildSystemPrompt(projects: any[], workspace: string): string {
  const projectsText = projects.length
    ? projects
        .map(
@@ -291,12 +293,20 @@ export async function POST(request: Request) {
            return;
          }

-          // Stream text to client
+          // Stream user-facing text to client
          if (resp.text) {
            assistantText += resp.text;
            emit({ type: 'text', text: resp.text });
          }

+          // Stream the model's reasoning narration as a separate SSE
+          // event type. We pay for thinking tokens whether or not we
+          // ask for them, so making them visible is free transparency
+          // — and it cures the "tool tray with no narrative" feel.
+          if (resp.thoughts) {
+            emit({ type: 'thinking', text: resp.thoughts });
+          }
+
          // Announce tool calls
          for (const tc of resp.toolCalls) {
            assistantToolCalls.push(tc);
@@ -366,6 +376,9 @@ export async function POST(request: Request) {
              assistantText += summary.text;
              emit({ type: 'text', text: summary.text });
            }
+            if (summary.thoughts) {
+              emit({ type: 'thinking', text: summary.thoughts });
+            }
          } catch {
            // Don't let a failed summary kill the stream.
          }
--- a/components/vibn-chat/chat-panel.tsx
+++ b/components/vibn-chat/chat-panel.tsx
@@ -31,6 +31,14 @@ interface Message {
  toolCalls?: { id: string; name: string; args: Record<string, unknown> }[];
  toolName?: string;
  createdAt?: string;
+  /**
+   * First-person reasoning narration streamed alongside tool calls.
+   * Rendered as collapsed italic text above the message bubble; the
+   * user can expand for the full chain of thought. Discarded on
+   * persistence (we pay tokens regardless, but the bytes aren't
+   * worth keeping in PG).
+   */
+  thoughts?: string;
 }

 interface ToolEvent {
@@ -80,6 +88,72 @@ function renderMarkdown(text: string): string {

 // ── Message bubble ────────────────────────────────────────────────────────────

+/**
+ * One-line preview for the collapsed pill: drops the markdown-bold
+ * "**Section Heading**" runs Gemini opens thoughts with (so the pill
+ * shows the sentence, not the heading), collapses whitespace, and cuts
+ * anything over 90 chars to 87 + "…". The expanded view shows the full text.
+ */
+function thoughtPreview(thoughts: string): string {
+  const stripped = thoughts
+    .replace(/^\s*\*\*[^*]+\*\*\s*/gm, "")
+    .replace(/\s+/g, " ")
+    .trim();
+  if (stripped.length <= 90) return stripped;
+  return stripped.slice(0, 87) + "…";
+}
+
+function ThinkingBubble({ thoughts }: { thoughts: string }) {
+  const [expanded, setExpanded] = useState(false); // starts collapsed
+  const preview = thoughtPreview(thoughts); // one-line text used only in the collapsed state
+  if (!thoughts.trim()) return null; // whitespace-only narration renders nothing
+  return (
+    <div
+      onClick={() => setExpanded(v => !v)}
+      title={expanded ? "Click to collapse" : "Click to see full reasoning"}
+      style={{
+        display: "flex",
+        alignItems: expanded ? "flex-start" : "center", // top-align the chevron once the full text is shown
+        gap: 8,
+        padding: "6px 12px",
+        margin: "4px 0",
+        background: "#faf8f5",
+        border: "1px dashed #e0dad0",
+        borderRadius: 8,
+        fontSize: "0.72rem",
+        color: "#8a847e",
+        fontStyle: "italic",
+        fontFamily: "var(--font-inter),ui-sans-serif,sans-serif",
+        cursor: "pointer",
+        userSelect: "text", // NOTE(review): text is selectable, but a drag-select may end in a click that toggles the pill — confirm
+        lineHeight: 1.55,
+      }}
+    >
+      <ChevronRight
+        style={{
+          width: 11,
+          height: 11,
+          flexShrink: 0,
+          marginTop: expanded ? 4 : 0,
+          transform: expanded ? "rotate(90deg)" : "none", // chevron points down while expanded
+          transition: "transform 0.15s",
+          color: "#b0a99e",
+        }}
+      />
+      {expanded ? ( // NOTE(review): raw HTML sink — presumably renderMarkdown escapes model output; verify
+        <span
+          style={{ whiteSpace: "pre-wrap" }}
+          dangerouslySetInnerHTML={{ __html: renderMarkdown(thoughts) }}
+        />
+      ) : (
+        <span style={{ overflow: "hidden", textOverflow: "ellipsis", whiteSpace: "nowrap", flex: 1 }}>
+          {preview}
+        </span>
+      )}
+    </div>
+  );
+}
+
 function MessageBubble({ msg }: { msg: Message }) {
  const isUser = msg.role === "user";
  return (
@@ -95,18 +169,26 @@ function MessageBubble({ msg }: { msg: Message }) {
      )}
      <div style={{
        maxWidth: "82%",
-        padding: isUser ? "9px 14px" : "10px 14px",
-        borderRadius: isUser ? "14px 14px 4px 14px" : "4px 14px 14px 14px",
-        background: isUser ? "#1a1a1a" : "#f7f4ef",
-        color: isUser ? "#fff" : "#1a1a1a",
-        fontSize: "0.84rem",
-        lineHeight: 1.6,
-        fontFamily: "var(--font-inter),ui-sans-serif,sans-serif",
+        display: "flex",
+        flexDirection: "column",
      }}>
-        {isUser ? (
-          <span style={{ whiteSpace: "pre-wrap" }}>{msg.content}</span>
-        ) : (
-          <span dangerouslySetInnerHTML={{ __html: renderMarkdown(msg.content) }} />
+        {!isUser && msg.thoughts && <ThinkingBubble thoughts={msg.thoughts} />}
+        {(msg.content || isUser) && (
+          <div style={{
+            padding: isUser ? "9px 14px" : "10px 14px",
+            borderRadius: isUser ? "14px 14px 4px 14px" : "4px 14px 14px 14px",
+            background: isUser ? "#1a1a1a" : "#f7f4ef",
+            color: isUser ? "#fff" : "#1a1a1a",
+            fontSize: "0.84rem",
+            lineHeight: 1.6,
+            fontFamily: "var(--font-inter),ui-sans-serif,sans-serif",
+          }}>
+            {isUser ? (
+              <span style={{ whiteSpace: "pre-wrap" }}>{msg.content}</span>
+            ) : (
+              <span dangerouslySetInnerHTML={{ __html: renderMarkdown(msg.content) }} />
+            )}
+          </div>
        )}
      </div>
    </div>
@@ -336,6 +418,21 @@ export function ChatPanel() {
              }
              return next;
            });
+          } else if (ev.type === "thinking" && ev.text) {
+            // Accumulate reasoning narration on the in-flight
+            // assistant message. The renderer collapses it by
+            // default and shows the start of the text as a pill.
+            setMessages((prev) => {
+              const next = [...prev];
+              if (msgIndex >= 0 && next[msgIndex]) {
+                const existing = next[msgIndex].thoughts ?? "";
+                next[msgIndex] = {
+                  ...next[msgIndex],
+                  thoughts: existing + ev.text,
+                };
+              }
+              return next;
+            });
          } else if (ev.type === "tool_start") {
            setToolEvents((prev) => [...prev, { name: ev.name, status: "running" }]);
          } else if (ev.type === "tool_result") {
--- a/lib/ai/gemini-chat.ts
+++ b/lib/ai/gemini-chat.ts
@@ -37,7 +37,7 @@ export interface ToolDefinition {
 }

 export interface ChatChunk {
-  type: 'text' | 'tool_call' | 'done' | 'error';
+  type: 'text' | 'thinking' | 'tool_call' | 'done' | 'error';
  text?: string;
  toolCall?: ToolCall;
  error?: string;
@@ -98,11 +98,23 @@ function buildBody(opts: {
  messages: ChatMessage[];
  tools?: ToolDefinition[];
  temperature?: number;
+  /**
+   * Ask Gemini to return its thought summaries as parts marked
+   * `thought: true`. We pay for thinking tokens regardless; this just
+   * makes them visible so the UI can show "Reading server.js…",
+   * "Shipping to production…" between tool calls instead of leaving
+   * the user staring at a silent tool tray. Defaults to true.
+   */
+  includeThoughts?: boolean;
 }) {
  const body: any = {
    contents: toGeminiContents(opts.messages),
    systemInstruction: { parts: [{ text: opts.systemPrompt }] },
-    generationConfig: { temperature: opts.temperature ?? 0.7, maxOutputTokens: 8192 },
+    generationConfig: {
+      temperature: opts.temperature ?? 0.7,
+      maxOutputTokens: 8192,
+      thinkingConfig: { includeThoughts: opts.includeThoughts ?? true },
+    },
  };
  const fns = toGeminiFunctions(opts.tools ?? []);
  if (fns) body.tools = fns;
@@ -118,7 +130,15 @@ export async function callGeminiChat(opts: {
  messages: ChatMessage[];
  tools?: ToolDefinition[];
  temperature?: number;
-}): Promise<{ text: string; toolCalls: ToolCall[]; error?: string }> {
+  includeThoughts?: boolean;
+}): Promise<{
+  text: string;
+  /** First-person reasoning narration; meant for a "thinking" UI panel, not the main bubble. */
+  thoughts: string;
+  toolCalls: ToolCall[];
+  finishReason?: string;
+  error?: string;
+}> {
  const url = `${GEMINI_BASE_URL}/models/${GEMINI_MODEL}:generateContent?key=${GEMINI_API_KEY}`;

  let res: Response;
@@ -129,21 +149,41 @@ export async function callGeminiChat(opts: {
      body: JSON.stringify(buildBody(opts)),
    });
  } catch (e) {
-    return { text: '', toolCalls: [], error: `Network error: ${e instanceof Error ? e.message : String(e)}` };
+    return {
+      text: '',
+      thoughts: '',
+      toolCalls: [],
+      error: `Network error: ${e instanceof Error ? e.message : String(e)}`,
+    };
  }

  const data = await res.json().catch(() => ({}));
  if (!res.ok) {
    const msg = data?.error?.message || JSON.stringify(data).slice(0, 200);
-    return { text: '', toolCalls: [], error: `Gemini API error ${res.status}: ${msg}` };
+    return {
+      text: '',
+      thoughts: '',
+      toolCalls: [],
+      error: `Gemini API error ${res.status}: ${msg}`,
+    };
  }

-  const parts: any[] = data?.candidates?.[0]?.content?.parts ?? [];
+  const cand = data?.candidates?.[0];
+  const parts: any[] = cand?.content?.parts ?? [];
  let text = '';
+  let thoughts = '';
  const toolCalls: ToolCall[] = [];

  for (const part of parts) {
-    if (part.text) text += part.text;
+    if (part.text) {
+      // CRITICAL: Gemini tags reasoning parts with `thought: true`. If
+      // we lump them into `text` they leak into the chat bubble as if
+      // they were prose for the user — which is the opposite of what
+      // the user wants. Keep them in their own bucket so the route
+      // can stream them as a separate SSE event type.
+      if (part.thought) thoughts += part.text;
+      else text += part.text;
+    }
    if (part.functionCall) {
      toolCalls.push({
        id: part.functionCall.id || `tc-${Date.now()}-${Math.random().toString(36).slice(2)}`,
@@ -155,7 +195,7 @@ export async function callGeminiChat(opts: {
    }
  }

-  return { text, toolCalls };
+  return { text, thoughts, toolCalls, finishReason: cand?.finishReason };
 }

 /**
@@ -210,7 +250,11 @@ export async function* streamGeminiChat(opts: {
        try { chunk = JSON.parse(data); } catch { continue; }
        const parts = chunk?.candidates?.[0]?.content?.parts ?? [];
        for (const part of parts) {
-          if (part.text) yield { type: 'text', text: part.text };
+          if (part.text) {
+            yield part.thought
+              ? { type: 'thinking', text: part.text }
+              : { type: 'text', text: part.text };
+          }
        }
      }
    }