VIBN Frontend for Coolify deployment
This commit is contained in:
206
lib/utils/document-chunker.ts
Normal file
206
lib/utils/document-chunker.ts
Normal file
@@ -0,0 +1,206 @@
|
||||
/**
|
||||
* Document Chunking Utility
|
||||
*
|
||||
* Splits large documents into manageable chunks for AI processing.
|
||||
* Uses semantic chunking with configurable overlap for better context.
|
||||
*/
|
||||
|
||||
/**
 * Positional and sizing information attached to each chunk.
 */
export interface ChunkMetadata {
  /** Zero-based index of this chunk within its document. */
  chunkIndex: number;
  /** Total number of chunks produced from the document (filled in by chunkDocument). */
  totalChunks: number;
  /** Character offset where this chunk starts in the processed text. */
  startChar: number;
  /** Character offset where this chunk ends (exclusive) in the processed text. */
  endChar: number;
  /** Estimated token count (~4 characters per token, see estimateTokens). */
  tokenCount: number;
}
|
||||
|
||||
/**
 * A single chunk of a document: its text plus positional metadata.
 */
export interface DocumentChunk {
  /** The chunk's text content (trimmed; code blocks restored when preserved). */
  content: string;
  /** Position and size information for this chunk. */
  metadata: ChunkMetadata;
}
|
||||
|
||||
/**
 * Tuning options for chunkDocument / chunkDocuments.
 * Any omitted field falls back to DEFAULT_OPTIONS.
 */
export interface ChunkingOptions {
  /** Maximum characters per chunk (default: 2000). */
  maxChunkSize?: number;
  /** Overlap between consecutive chunks, in characters (default: 200). */
  chunkOverlap?: number;
  /**
   * Try to keep paragraphs intact (default: true).
   * NOTE(review): this flag is never read by chunkDocument or findSplitPoint
   * in this file — paragraph-aware splitting always happens; confirm intent.
   */
  preserveParagraphs?: boolean;
  /** Keep fenced ``` code blocks together across splits (default: true). */
  preserveCodeBlocks?: boolean;
}
|
||||
|
||||
/** Defaults applied by chunkDocument when an option is omitted. */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxChunkSize: 2000,
  chunkOverlap: 200,
  preserveParagraphs: true,
  preserveCodeBlocks: true,
};
|
||||
|
||||
/**
|
||||
* Estimate token count (rough approximation: 1 token ≈ 4 characters)
|
||||
*/
|
||||
function estimateTokens(text: string): number {
|
||||
return Math.ceil(text.length / 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find good split points (paragraph breaks, sentence boundaries)
|
||||
*/
|
||||
function findSplitPoint(text: string, idealSplit: number): number {
|
||||
// Try to split at paragraph break first
|
||||
const paragraphBreak = text.lastIndexOf('\n\n', idealSplit);
|
||||
if (paragraphBreak > idealSplit - 500 && paragraphBreak > 0) {
|
||||
return paragraphBreak + 2;
|
||||
}
|
||||
|
||||
// Try sentence boundary
|
||||
const sentenceEnd = text.lastIndexOf('. ', idealSplit);
|
||||
if (sentenceEnd > idealSplit - 300 && sentenceEnd > 0) {
|
||||
return sentenceEnd + 2;
|
||||
}
|
||||
|
||||
// Try any newline
|
||||
const newline = text.lastIndexOf('\n', idealSplit);
|
||||
if (newline > idealSplit - 200 && newline > 0) {
|
||||
return newline + 1;
|
||||
}
|
||||
|
||||
// Last resort: split at space
|
||||
const space = text.lastIndexOf(' ', idealSplit);
|
||||
return space > 0 ? space + 1 : idealSplit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract code blocks to preserve them
|
||||
*/
|
||||
function extractCodeBlocks(text: string): { text: string; codeBlocks: Map<string, string> } {
|
||||
const codeBlocks = new Map<string, string>();
|
||||
let counter = 0;
|
||||
|
||||
const processedText = text.replace(/```[\s\S]*?```/g, (match) => {
|
||||
const placeholder = `__CODE_BLOCK_${counter}__`;
|
||||
codeBlocks.set(placeholder, match);
|
||||
counter++;
|
||||
return placeholder;
|
||||
});
|
||||
|
||||
return { text: processedText, codeBlocks };
|
||||
}
|
||||
|
||||
/**
|
||||
* Restore code blocks
|
||||
*/
|
||||
function restoreCodeBlocks(text: string, codeBlocks: Map<string, string>): string {
|
||||
let result = text;
|
||||
codeBlocks.forEach((code, placeholder) => {
|
||||
result = result.replace(placeholder, code);
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split a document into semantic chunks
|
||||
*/
|
||||
export function chunkDocument(content: string, options: ChunkingOptions = {}): DocumentChunk[] {
|
||||
const opts = { ...DEFAULT_OPTIONS, ...options };
|
||||
const chunks: DocumentChunk[] = [];
|
||||
|
||||
// Handle empty content
|
||||
if (!content || content.trim().length === 0) {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// Extract code blocks if preserving them
|
||||
let processedContent = content;
|
||||
let codeBlocks = new Map<string, string>();
|
||||
|
||||
if (opts.preserveCodeBlocks) {
|
||||
const extracted = extractCodeBlocks(content);
|
||||
processedContent = extracted.text;
|
||||
codeBlocks = extracted.codeBlocks;
|
||||
}
|
||||
|
||||
let position = 0;
|
||||
let chunkIndex = 0;
|
||||
|
||||
while (position < processedContent.length) {
|
||||
const remainingLength = processedContent.length - position;
|
||||
|
||||
// If remaining content fits in one chunk, take it all
|
||||
if (remainingLength <= opts.maxChunkSize) {
|
||||
const chunkContent = processedContent.substring(position);
|
||||
const finalContent = opts.preserveCodeBlocks
|
||||
? restoreCodeBlocks(chunkContent, codeBlocks)
|
||||
: chunkContent;
|
||||
|
||||
chunks.push({
|
||||
content: finalContent.trim(),
|
||||
metadata: {
|
||||
chunkIndex,
|
||||
totalChunks: 0, // Will be updated after loop
|
||||
startChar: position,
|
||||
endChar: processedContent.length,
|
||||
tokenCount: estimateTokens(finalContent),
|
||||
},
|
||||
});
|
||||
break;
|
||||
}
|
||||
|
||||
// Find a good split point
|
||||
const idealEnd = position + opts.maxChunkSize;
|
||||
const actualEnd = findSplitPoint(processedContent, idealEnd);
|
||||
|
||||
const chunkContent = processedContent.substring(position, actualEnd);
|
||||
const finalContent = opts.preserveCodeBlocks
|
||||
? restoreCodeBlocks(chunkContent, codeBlocks)
|
||||
: chunkContent;
|
||||
|
||||
chunks.push({
|
||||
content: finalContent.trim(),
|
||||
metadata: {
|
||||
chunkIndex,
|
||||
totalChunks: 0, // Will be updated after loop
|
||||
startChar: position,
|
||||
endChar: actualEnd,
|
||||
tokenCount: estimateTokens(finalContent),
|
||||
},
|
||||
});
|
||||
|
||||
// Move position forward with overlap
|
||||
position = actualEnd - opts.chunkOverlap;
|
||||
chunkIndex++;
|
||||
}
|
||||
|
||||
// Update totalChunks in all metadata
|
||||
const totalChunks = chunks.length;
|
||||
chunks.forEach((chunk) => {
|
||||
chunk.metadata.totalChunks = totalChunks;
|
||||
});
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
 * A document chunk annotated with the source file it was cut from,
 * as produced by chunkDocuments.
 */
export interface SourcedChunk extends DocumentChunk {
  /** Filename of the document this chunk came from. */
  sourceFilename: string;
  /** MIME type of the source document, when the caller supplied one. */
  sourceMimeType?: string;
}
|
||||
|
||||
export function chunkDocuments(
|
||||
documents: Array<{ filename: string; content: string; mimeType?: string }>,
|
||||
options: ChunkingOptions = {}
|
||||
): SourcedChunk[] {
|
||||
const allChunks: SourcedChunk[] = [];
|
||||
|
||||
documents.forEach((doc) => {
|
||||
const chunks = chunkDocument(doc.content, options);
|
||||
chunks.forEach((chunk) => {
|
||||
allChunks.push({
|
||||
...chunk,
|
||||
sourceFilename: doc.filename,
|
||||
sourceMimeType: doc.mimeType,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
return allChunks;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user