/**
 * Text chunking for semantic search
 *
 * Splits large documents into smaller, semantically coherent chunks
 * suitable for vector embedding and retrieval.
 */

export interface TextChunk {
|
|
/** Index of this chunk (0-based) */
|
|
index: number;
|
|
|
|
/** The chunked text content */
|
|
text: string;
|
|
|
|
/** Approximate token count (for reference) */
|
|
estimatedTokens: number;
|
|
}
|
|
|
|
export interface ChunkingOptions {
|
|
/** Target maximum tokens per chunk (approximate) */
|
|
maxTokens?: number;
|
|
|
|
/** Target maximum characters per chunk (fallback if no tokenizer) */
|
|
maxChars?: number;
|
|
|
|
/** Overlap between chunks (in characters) */
|
|
overlapChars?: number;
|
|
|
|
/** Whether to try preserving paragraph boundaries */
|
|
preserveParagraphs?: boolean;
|
|
}
|
|
|
|
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
|
|
maxTokens: 800,
|
|
maxChars: 3000, // Rough approximation: ~4 chars per token
|
|
overlapChars: 200,
|
|
preserveParagraphs: true,
|
|
};
|
|
|
|
/**
|
|
* Estimate token count from character count
|
|
*
|
|
* Uses a rough heuristic: 1 token ≈ 4 characters for English text.
|
|
* For more accuracy, integrate a real tokenizer (e.g., tiktoken).
|
|
*/
|
|
function estimateTokens(text: string): number {
|
|
return Math.ceil(text.length / 4);
|
|
}
|
|
|
|
/**
|
|
* Split text into paragraphs, preserving empty lines as separators
|
|
*/
|
|
function splitIntoParagraphs(text: string): string[] {
|
|
return text.split(/\n\n+/).filter((p) => p.trim().length > 0);
|
|
}
|
|
|
|
/**
|
|
* Split text into sentences (simple heuristic)
|
|
*/
|
|
function splitIntoSentences(text: string): string[] {
|
|
// Simple sentence boundary detection
|
|
return text
|
|
.split(/[.!?]+\s+/)
|
|
.map((s) => s.trim())
|
|
.filter((s) => s.length > 0);
|
|
}
|
|
|
|
/**
|
|
* Chunk text into semantic pieces suitable for embedding
|
|
*
|
|
* Strategy:
|
|
* 1. Split by paragraphs (if preserveParagraphs = true)
|
|
* 2. Group paragraphs/sentences until reaching maxTokens/maxChars
|
|
* 3. Add overlap between chunks for context continuity
|
|
*
|
|
* @param content - Text to chunk
|
|
* @param options - Chunking options
|
|
* @returns Array of text chunks with metadata
|
|
*
|
|
* @example
|
|
* ```typescript
|
|
* const chunks = chunkText(longDocument, { maxTokens: 500, overlapChars: 100 });
|
|
* for (const chunk of chunks) {
|
|
* console.log(`Chunk ${chunk.index}: ${chunk.estimatedTokens} tokens`);
|
|
* await embedText(chunk.text);
|
|
* }
|
|
* ```
|
|
*/
|
|
export function chunkText(
|
|
content: string,
|
|
options: ChunkingOptions = {}
|
|
): TextChunk[] {
|
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
const chunks: TextChunk[] = [];
|
|
|
|
if (!content || content.trim().length === 0) {
|
|
return chunks;
|
|
}
|
|
|
|
// Clean up content
|
|
const cleanedContent = content.trim();
|
|
|
|
// If content is small enough, return as single chunk
|
|
if (estimateTokens(cleanedContent) <= opts.maxTokens) {
|
|
return [
|
|
{
|
|
index: 0,
|
|
text: cleanedContent,
|
|
estimatedTokens: estimateTokens(cleanedContent),
|
|
},
|
|
];
|
|
}
|
|
|
|
// Split into paragraphs or sentences
|
|
const units = opts.preserveParagraphs
|
|
? splitIntoParagraphs(cleanedContent)
|
|
: splitIntoSentences(cleanedContent);
|
|
|
|
if (units.length === 0) {
|
|
return [
|
|
{
|
|
index: 0,
|
|
text: cleanedContent,
|
|
estimatedTokens: estimateTokens(cleanedContent),
|
|
},
|
|
];
|
|
}
|
|
|
|
let currentChunk = '';
|
|
let chunkIndex = 0;
|
|
let previousOverlap = '';
|
|
|
|
for (let i = 0; i < units.length; i++) {
|
|
const unit = units[i];
|
|
const potentialChunk = currentChunk
|
|
? `${currentChunk}\n\n${unit}`
|
|
: `${previousOverlap}${unit}`;
|
|
|
|
const potentialTokens = estimateTokens(potentialChunk);
|
|
const potentialChars = potentialChunk.length;
|
|
|
|
// Check if adding this unit would exceed limits
|
|
if (
|
|
potentialTokens > opts.maxTokens ||
|
|
potentialChars > opts.maxChars
|
|
) {
|
|
// Save current chunk if it has content
|
|
if (currentChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentChunk,
|
|
estimatedTokens: estimateTokens(currentChunk),
|
|
});
|
|
|
|
// Prepare overlap for next chunk
|
|
const overlapStart = Math.max(
|
|
0,
|
|
currentChunk.length - opts.overlapChars
|
|
);
|
|
previousOverlap = currentChunk.substring(overlapStart);
|
|
if (previousOverlap.length > 0 && !previousOverlap.endsWith(' ')) {
|
|
// Try to start overlap at a word boundary
|
|
const spaceIndex = previousOverlap.indexOf(' ');
|
|
if (spaceIndex > 0) {
|
|
previousOverlap = previousOverlap.substring(spaceIndex + 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Start new chunk with current unit
|
|
currentChunk = `${previousOverlap}${unit}`;
|
|
} else {
|
|
// Add unit to current chunk
|
|
currentChunk = potentialChunk;
|
|
}
|
|
}
|
|
|
|
// Add final chunk if it has content
|
|
if (currentChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentChunk,
|
|
estimatedTokens: estimateTokens(currentChunk),
|
|
});
|
|
}
|
|
|
|
console.log(
|
|
`[Chunking] Split ${cleanedContent.length} chars into ${chunks.length} chunks`
|
|
);
|
|
|
|
return chunks;
|
|
}
|
|
|
|
/**
|
|
* Chunk text with code-aware splitting
|
|
*
|
|
* Preserves code blocks and tries to keep them intact.
|
|
* Useful for chunking AI chat transcripts that contain code snippets.
|
|
*/
|
|
export function chunkTextWithCodeAwareness(
|
|
content: string,
|
|
options: ChunkingOptions = {}
|
|
): TextChunk[] {
|
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
|
|
// Detect code blocks (triple backticks)
|
|
const codeBlockRegex = /```[\s\S]*?```/g;
|
|
const codeBlocks: { start: number; end: number; content: string }[] = [];
|
|
let match;
|
|
|
|
while ((match = codeBlockRegex.exec(content)) !== null) {
|
|
codeBlocks.push({
|
|
start: match.index,
|
|
end: match.index + match[0].length,
|
|
content: match[0],
|
|
});
|
|
}
|
|
|
|
// If no code blocks, use standard chunking
|
|
if (codeBlocks.length === 0) {
|
|
return chunkText(content, options);
|
|
}
|
|
|
|
// Split content around code blocks
|
|
const chunks: TextChunk[] = [];
|
|
let chunkIndex = 0;
|
|
let currentPosition = 0;
|
|
|
|
for (const codeBlock of codeBlocks) {
|
|
// Chunk text before code block
|
|
const textBefore = content.substring(currentPosition, codeBlock.start);
|
|
if (textBefore.trim().length > 0) {
|
|
const textChunks = chunkText(textBefore, opts);
|
|
for (const chunk of textChunks) {
|
|
chunks.push({
|
|
...chunk,
|
|
index: chunkIndex++,
|
|
});
|
|
}
|
|
}
|
|
|
|
// Add code block as its own chunk (or split if too large)
|
|
const codeTokens = estimateTokens(codeBlock.content);
|
|
if (codeTokens <= opts.maxTokens) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: codeBlock.content,
|
|
estimatedTokens: codeTokens,
|
|
});
|
|
} else {
|
|
// Code block is too large, split by lines
|
|
const codeLines = codeBlock.content.split('\n');
|
|
let currentCodeChunk = '';
|
|
for (const line of codeLines) {
|
|
const potentialChunk = currentCodeChunk
|
|
? `${currentCodeChunk}\n${line}`
|
|
: line;
|
|
if (estimateTokens(potentialChunk) > opts.maxTokens) {
|
|
if (currentCodeChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentCodeChunk,
|
|
estimatedTokens: estimateTokens(currentCodeChunk),
|
|
});
|
|
}
|
|
currentCodeChunk = line;
|
|
} else {
|
|
currentCodeChunk = potentialChunk;
|
|
}
|
|
}
|
|
if (currentCodeChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentCodeChunk,
|
|
estimatedTokens: estimateTokens(currentCodeChunk),
|
|
});
|
|
}
|
|
}
|
|
|
|
currentPosition = codeBlock.end;
|
|
}
|
|
|
|
// Chunk remaining text after last code block
|
|
const textAfter = content.substring(currentPosition);
|
|
if (textAfter.trim().length > 0) {
|
|
const textChunks = chunkText(textAfter, opts);
|
|
for (const chunk of textChunks) {
|
|
chunks.push({
|
|
...chunk,
|
|
index: chunkIndex++,
|
|
});
|
|
}
|
|
}
|
|
|
|
return chunks;
|
|
}
|
|
|