207 lines
5.5 KiB
TypeScript
207 lines
5.5 KiB
TypeScript
/**
 * Document Chunking Utility
 *
 * Splits large documents into manageable chunks for AI processing.
 * Uses semantic chunking with configurable overlap for better context.
 */
|
|
|
|
/** Positional and sizing metadata attached to each produced chunk. */
export interface ChunkMetadata {
  /** Zero-based index of this chunk within its document. */
  chunkIndex: number;
  /** Total number of chunks produced for the document (filled in after chunking). */
  totalChunks: number;
  /** Offset of the chunk's first character in the processed text. */
  startChar: number;
  /** Offset one past the chunk's last character in the processed text. */
  endChar: number;
  /** Rough token estimate for the chunk (~4 characters per token). */
  tokenCount: number;
}
|
|
|
|
/** A single chunk of a document: its text plus positional metadata. */
export interface DocumentChunk {
  /** The chunk's text content (trimmed, with code blocks restored). */
  content: string;
  /** Position and size information for this chunk. */
  metadata: ChunkMetadata;
}
|
|
|
|
/** Tunable parameters for document chunking; see DEFAULT_OPTIONS for fallbacks. */
export interface ChunkingOptions {
  maxChunkSize?: number; // Maximum characters per chunk (default: 2000)
  chunkOverlap?: number; // Overlap between chunks (default: 200)
  preserveParagraphs?: boolean; // Try to keep paragraphs intact (default: true)
  preserveCodeBlocks?: boolean; // Keep code blocks together (default: true)
}
|
|
|
|
/**
 * Fallback values applied when a caller omits a ChunkingOptions field.
 * NOTE(review): `preserveParagraphs` is never read by the visible chunking
 * code (findSplitPoint always tries paragraph breaks first) — confirm whether
 * it is consumed elsewhere or is dead configuration.
 */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxChunkSize: 2000,
  chunkOverlap: 200,
  preserveParagraphs: true,
  preserveCodeBlocks: true,
};
|
|
|
|
/**
|
|
* Estimate token count (rough approximation: 1 token ≈ 4 characters)
|
|
*/
|
|
function estimateTokens(text: string): number {
|
|
return Math.ceil(text.length / 4);
|
|
}
|
|
|
|
/**
|
|
* Find good split points (paragraph breaks, sentence boundaries)
|
|
*/
|
|
function findSplitPoint(text: string, idealSplit: number): number {
|
|
// Try to split at paragraph break first
|
|
const paragraphBreak = text.lastIndexOf('\n\n', idealSplit);
|
|
if (paragraphBreak > idealSplit - 500 && paragraphBreak > 0) {
|
|
return paragraphBreak + 2;
|
|
}
|
|
|
|
// Try sentence boundary
|
|
const sentenceEnd = text.lastIndexOf('. ', idealSplit);
|
|
if (sentenceEnd > idealSplit - 300 && sentenceEnd > 0) {
|
|
return sentenceEnd + 2;
|
|
}
|
|
|
|
// Try any newline
|
|
const newline = text.lastIndexOf('\n', idealSplit);
|
|
if (newline > idealSplit - 200 && newline > 0) {
|
|
return newline + 1;
|
|
}
|
|
|
|
// Last resort: split at space
|
|
const space = text.lastIndexOf(' ', idealSplit);
|
|
return space > 0 ? space + 1 : idealSplit;
|
|
}
|
|
|
|
/**
|
|
* Extract code blocks to preserve them
|
|
*/
|
|
function extractCodeBlocks(text: string): { text: string; codeBlocks: Map<string, string> } {
|
|
const codeBlocks = new Map<string, string>();
|
|
let counter = 0;
|
|
|
|
const processedText = text.replace(/```[\s\S]*?```/g, (match) => {
|
|
const placeholder = `__CODE_BLOCK_${counter}__`;
|
|
codeBlocks.set(placeholder, match);
|
|
counter++;
|
|
return placeholder;
|
|
});
|
|
|
|
return { text: processedText, codeBlocks };
|
|
}
|
|
|
|
/**
|
|
* Restore code blocks
|
|
*/
|
|
function restoreCodeBlocks(text: string, codeBlocks: Map<string, string>): string {
|
|
let result = text;
|
|
codeBlocks.forEach((code, placeholder) => {
|
|
result = result.replace(placeholder, code);
|
|
});
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Split a document into semantic chunks
|
|
*/
|
|
export function chunkDocument(content: string, options: ChunkingOptions = {}): DocumentChunk[] {
|
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
const chunks: DocumentChunk[] = [];
|
|
|
|
// Handle empty content
|
|
if (!content || content.trim().length === 0) {
|
|
return chunks;
|
|
}
|
|
|
|
// Extract code blocks if preserving them
|
|
let processedContent = content;
|
|
let codeBlocks = new Map<string, string>();
|
|
|
|
if (opts.preserveCodeBlocks) {
|
|
const extracted = extractCodeBlocks(content);
|
|
processedContent = extracted.text;
|
|
codeBlocks = extracted.codeBlocks;
|
|
}
|
|
|
|
let position = 0;
|
|
let chunkIndex = 0;
|
|
|
|
while (position < processedContent.length) {
|
|
const remainingLength = processedContent.length - position;
|
|
|
|
// If remaining content fits in one chunk, take it all
|
|
if (remainingLength <= opts.maxChunkSize) {
|
|
const chunkContent = processedContent.substring(position);
|
|
const finalContent = opts.preserveCodeBlocks
|
|
? restoreCodeBlocks(chunkContent, codeBlocks)
|
|
: chunkContent;
|
|
|
|
chunks.push({
|
|
content: finalContent.trim(),
|
|
metadata: {
|
|
chunkIndex,
|
|
totalChunks: 0, // Will be updated after loop
|
|
startChar: position,
|
|
endChar: processedContent.length,
|
|
tokenCount: estimateTokens(finalContent),
|
|
},
|
|
});
|
|
break;
|
|
}
|
|
|
|
// Find a good split point
|
|
const idealEnd = position + opts.maxChunkSize;
|
|
const actualEnd = findSplitPoint(processedContent, idealEnd);
|
|
|
|
const chunkContent = processedContent.substring(position, actualEnd);
|
|
const finalContent = opts.preserveCodeBlocks
|
|
? restoreCodeBlocks(chunkContent, codeBlocks)
|
|
: chunkContent;
|
|
|
|
chunks.push({
|
|
content: finalContent.trim(),
|
|
metadata: {
|
|
chunkIndex,
|
|
totalChunks: 0, // Will be updated after loop
|
|
startChar: position,
|
|
endChar: actualEnd,
|
|
tokenCount: estimateTokens(finalContent),
|
|
},
|
|
});
|
|
|
|
// Move position forward with overlap
|
|
position = actualEnd - opts.chunkOverlap;
|
|
chunkIndex++;
|
|
}
|
|
|
|
// Update totalChunks in all metadata
|
|
const totalChunks = chunks.length;
|
|
chunks.forEach((chunk) => {
|
|
chunk.metadata.totalChunks = totalChunks;
|
|
});
|
|
|
|
return chunks;
|
|
}
|
|
|
|
/**
 * A DocumentChunk tagged with the document it came from, for use when
 * chunking multiple documents together (see `chunkDocuments`).
 */
export interface SourcedChunk extends DocumentChunk {
  /** Filename of the source document this chunk was cut from. */
  sourceFilename: string;
  /** MIME type of the source document, when the caller provided one. */
  sourceMimeType?: string;
}
|
|
|
|
export function chunkDocuments(
|
|
documents: Array<{ filename: string; content: string; mimeType?: string }>,
|
|
options: ChunkingOptions = {}
|
|
): SourcedChunk[] {
|
|
const allChunks: SourcedChunk[] = [];
|
|
|
|
documents.forEach((doc) => {
|
|
const chunks = chunkDocument(doc.content, options);
|
|
chunks.forEach((chunk) => {
|
|
allChunks.push({
|
|
...chunk,
|
|
sourceFilename: doc.filename,
|
|
sourceMimeType: doc.mimeType,
|
|
});
|
|
});
|
|
});
|
|
|
|
return allChunks;
|
|
}
|
|
|