VIBN Frontend for Coolify deployment
This commit is contained in:
297
lib/ai/chunking.ts
Normal file
297
lib/ai/chunking.ts
Normal file
@@ -0,0 +1,297 @@
/**
 * Text chunking for semantic search
 *
 * Splits large documents into smaller, semantically coherent chunks
 * suitable for vector embedding and retrieval.
 */

/** A single chunk of text produced by the chunking functions. */
export interface TextChunk {
  /** Index of this chunk (0-based) */
  index: number;

  /** The chunked text content */
  text: string;

  /** Approximate token count (for reference; see the estimateTokens heuristic) */
  estimatedTokens: number;
}
|
||||
|
||||
/** Tuning options for the chunking functions; unset fields fall back to DEFAULT_OPTIONS. */
export interface ChunkingOptions {
  /** Target maximum tokens per chunk (approximate; default 800) */
  maxTokens?: number;

  /** Target maximum characters per chunk (fallback if no tokenizer; default 3000) */
  maxChars?: number;

  /** Overlap between chunks (in characters; default 200) */
  overlapChars?: number;

  /** Whether to try preserving paragraph boundaries (default true) */
  preserveParagraphs?: boolean;
}
|
||||
|
||||
/** Defaults applied when a caller omits a ChunkingOptions field. */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxTokens: 800,
  maxChars: 3000, // Rough approximation: ~4 chars per token
  overlapChars: 200,
  preserveParagraphs: true,
};
|
||||
|
||||
/**
|
||||
* Estimate token count from character count
|
||||
*
|
||||
* Uses a rough heuristic: 1 token ≈ 4 characters for English text.
|
||||
* For more accuracy, integrate a real tokenizer (e.g., tiktoken).
|
||||
*/
|
||||
function estimateTokens(text: string): number {
|
||||
return Math.ceil(text.length / 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Split text into paragraphs, preserving empty lines as separators
|
||||
*/
|
||||
function splitIntoParagraphs(text: string): string[] {
|
||||
return text.split(/\n\n+/).filter((p) => p.trim().length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Split text into sentences (simple heuristic)
|
||||
*/
|
||||
function splitIntoSentences(text: string): string[] {
|
||||
// Simple sentence boundary detection
|
||||
return text
|
||||
.split(/[.!?]+\s+/)
|
||||
.map((s) => s.trim())
|
||||
.filter((s) => s.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk text into semantic pieces suitable for embedding
|
||||
*
|
||||
* Strategy:
|
||||
* 1. Split by paragraphs (if preserveParagraphs = true)
|
||||
* 2. Group paragraphs/sentences until reaching maxTokens/maxChars
|
||||
* 3. Add overlap between chunks for context continuity
|
||||
*
|
||||
* @param content - Text to chunk
|
||||
* @param options - Chunking options
|
||||
* @returns Array of text chunks with metadata
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const chunks = chunkText(longDocument, { maxTokens: 500, overlapChars: 100 });
|
||||
* for (const chunk of chunks) {
|
||||
* console.log(`Chunk ${chunk.index}: ${chunk.estimatedTokens} tokens`);
|
||||
* await embedText(chunk.text);
|
||||
* }
|
||||
* ```
|
||||
*/
|
||||
export function chunkText(
|
||||
content: string,
|
||||
options: ChunkingOptions = {}
|
||||
): TextChunk[] {
|
||||
const opts = { ...DEFAULT_OPTIONS, ...options };
|
||||
const chunks: TextChunk[] = [];
|
||||
|
||||
if (!content || content.trim().length === 0) {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// Clean up content
|
||||
const cleanedContent = content.trim();
|
||||
|
||||
// If content is small enough, return as single chunk
|
||||
if (estimateTokens(cleanedContent) <= opts.maxTokens) {
|
||||
return [
|
||||
{
|
||||
index: 0,
|
||||
text: cleanedContent,
|
||||
estimatedTokens: estimateTokens(cleanedContent),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
// Split into paragraphs or sentences
|
||||
const units = opts.preserveParagraphs
|
||||
? splitIntoParagraphs(cleanedContent)
|
||||
: splitIntoSentences(cleanedContent);
|
||||
|
||||
if (units.length === 0) {
|
||||
return [
|
||||
{
|
||||
index: 0,
|
||||
text: cleanedContent,
|
||||
estimatedTokens: estimateTokens(cleanedContent),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
let currentChunk = '';
|
||||
let chunkIndex = 0;
|
||||
let previousOverlap = '';
|
||||
|
||||
for (let i = 0; i < units.length; i++) {
|
||||
const unit = units[i];
|
||||
const potentialChunk = currentChunk
|
||||
? `${currentChunk}\n\n${unit}`
|
||||
: `${previousOverlap}${unit}`;
|
||||
|
||||
const potentialTokens = estimateTokens(potentialChunk);
|
||||
const potentialChars = potentialChunk.length;
|
||||
|
||||
// Check if adding this unit would exceed limits
|
||||
if (
|
||||
potentialTokens > opts.maxTokens ||
|
||||
potentialChars > opts.maxChars
|
||||
) {
|
||||
// Save current chunk if it has content
|
||||
if (currentChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentChunk,
|
||||
estimatedTokens: estimateTokens(currentChunk),
|
||||
});
|
||||
|
||||
// Prepare overlap for next chunk
|
||||
const overlapStart = Math.max(
|
||||
0,
|
||||
currentChunk.length - opts.overlapChars
|
||||
);
|
||||
previousOverlap = currentChunk.substring(overlapStart);
|
||||
if (previousOverlap.length > 0 && !previousOverlap.endsWith(' ')) {
|
||||
// Try to start overlap at a word boundary
|
||||
const spaceIndex = previousOverlap.indexOf(' ');
|
||||
if (spaceIndex > 0) {
|
||||
previousOverlap = previousOverlap.substring(spaceIndex + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start new chunk with current unit
|
||||
currentChunk = `${previousOverlap}${unit}`;
|
||||
} else {
|
||||
// Add unit to current chunk
|
||||
currentChunk = potentialChunk;
|
||||
}
|
||||
}
|
||||
|
||||
// Add final chunk if it has content
|
||||
if (currentChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentChunk,
|
||||
estimatedTokens: estimateTokens(currentChunk),
|
||||
});
|
||||
}
|
||||
|
||||
console.log(
|
||||
`[Chunking] Split ${cleanedContent.length} chars into ${chunks.length} chunks`
|
||||
);
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk text with code-aware splitting
|
||||
*
|
||||
* Preserves code blocks and tries to keep them intact.
|
||||
* Useful for chunking AI chat transcripts that contain code snippets.
|
||||
*/
|
||||
export function chunkTextWithCodeAwareness(
|
||||
content: string,
|
||||
options: ChunkingOptions = {}
|
||||
): TextChunk[] {
|
||||
const opts = { ...DEFAULT_OPTIONS, ...options };
|
||||
|
||||
// Detect code blocks (triple backticks)
|
||||
const codeBlockRegex = /```[\s\S]*?```/g;
|
||||
const codeBlocks: { start: number; end: number; content: string }[] = [];
|
||||
let match;
|
||||
|
||||
while ((match = codeBlockRegex.exec(content)) !== null) {
|
||||
codeBlocks.push({
|
||||
start: match.index,
|
||||
end: match.index + match[0].length,
|
||||
content: match[0],
|
||||
});
|
||||
}
|
||||
|
||||
// If no code blocks, use standard chunking
|
||||
if (codeBlocks.length === 0) {
|
||||
return chunkText(content, options);
|
||||
}
|
||||
|
||||
// Split content around code blocks
|
||||
const chunks: TextChunk[] = [];
|
||||
let chunkIndex = 0;
|
||||
let currentPosition = 0;
|
||||
|
||||
for (const codeBlock of codeBlocks) {
|
||||
// Chunk text before code block
|
||||
const textBefore = content.substring(currentPosition, codeBlock.start);
|
||||
if (textBefore.trim().length > 0) {
|
||||
const textChunks = chunkText(textBefore, opts);
|
||||
for (const chunk of textChunks) {
|
||||
chunks.push({
|
||||
...chunk,
|
||||
index: chunkIndex++,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add code block as its own chunk (or split if too large)
|
||||
const codeTokens = estimateTokens(codeBlock.content);
|
||||
if (codeTokens <= opts.maxTokens) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: codeBlock.content,
|
||||
estimatedTokens: codeTokens,
|
||||
});
|
||||
} else {
|
||||
// Code block is too large, split by lines
|
||||
const codeLines = codeBlock.content.split('\n');
|
||||
let currentCodeChunk = '';
|
||||
for (const line of codeLines) {
|
||||
const potentialChunk = currentCodeChunk
|
||||
? `${currentCodeChunk}\n${line}`
|
||||
: line;
|
||||
if (estimateTokens(potentialChunk) > opts.maxTokens) {
|
||||
if (currentCodeChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentCodeChunk,
|
||||
estimatedTokens: estimateTokens(currentCodeChunk),
|
||||
});
|
||||
}
|
||||
currentCodeChunk = line;
|
||||
} else {
|
||||
currentCodeChunk = potentialChunk;
|
||||
}
|
||||
}
|
||||
if (currentCodeChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentCodeChunk,
|
||||
estimatedTokens: estimateTokens(currentCodeChunk),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
currentPosition = codeBlock.end;
|
||||
}
|
||||
|
||||
// Chunk remaining text after last code block
|
||||
const textAfter = content.substring(currentPosition);
|
||||
if (textAfter.trim().length > 0) {
|
||||
const textChunks = chunkText(textAfter, opts);
|
||||
for (const chunk of textChunks) {
|
||||
chunks.push({
|
||||
...chunk,
|
||||
index: chunkIndex++,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user