/**
 * Text chunking for semantic search
 *
 * Splits large documents into smaller, semantically coherent chunks
 * suitable for vector embedding and retrieval.
 */

export interface TextChunk {
  /** Index of this chunk (0-based) */
  index: number;
  /** The chunked text content */
  text: string;
  /** Approximate token count (for reference) */
  estimatedTokens: number;
}

export interface ChunkingOptions {
  /** Target maximum tokens per chunk (approximate) */
  maxTokens?: number;
  /** Target maximum characters per chunk (fallback if no tokenizer) */
  maxChars?: number;
  /** Overlap between chunks (in characters) */
  overlapChars?: number;
  /** Whether to try preserving paragraph boundaries */
  preserveParagraphs?: boolean;
}

/** Heuristic used by the token estimate: 1 token ≈ 4 characters of English text. */
const CHARS_PER_TOKEN = 4;

const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxTokens: 800,
  maxChars: 3000, // Rough approximation: ~4 chars per token
  overlapChars: 200,
  preserveParagraphs: true,
};

/** A newline followed by one or more blank (possibly whitespace-only) lines; LF or CRLF. */
const PARAGRAPH_BREAK = /\r?\n(?:[ \t]*\r?\n)+/;

/** Whitespace that follows sentence-ending punctuation (the punctuation itself is kept). */
const SENTENCE_BREAK = /(?<=[.!?])\s+/;

/** Single whitespace character (no `g` flag, so `test` is stateless). */
const WHITESPACE = /\s/;

/** Fenced code block delimited by triple backticks (non-greedy). */
const CODE_FENCE = /```[\s\S]*?```/g;

/**
 * Estimate token count from character count
 *
 * Uses a rough heuristic: 1 token ≈ 4 characters for English text.
 * For more accuracy, integrate a real tokenizer (e.g., tiktoken).
 */
function estimateTokens(text: string): number {
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}

/** Build a chunk record, deriving the token estimate from the text. */
function makeChunk(index: number, text: string): TextChunk {
  return { index, text, estimatedTokens: estimateTokens(text) };
}

/** True for real numbers greater than zero (rejects `undefined` and `NaN`). */
function isPositive(value: number | undefined): value is number {
  return typeof value === 'number' && value > 0;
}

/**
 * Merge caller options with the defaults.
 *
 * Unlike a plain object spread, an explicit `undefined`, `NaN` or
 * non-positive limit does not clobber the default. A negative overlap is
 * treated as "no overlap".
 */
function resolveOptions(options: ChunkingOptions): Required<ChunkingOptions> {
  const overlap = options.overlapChars;
  return {
    maxTokens: isPositive(options.maxTokens)
      ? options.maxTokens
      : DEFAULT_OPTIONS.maxTokens,
    maxChars: isPositive(options.maxChars)
      ? options.maxChars
      : DEFAULT_OPTIONS.maxChars,
    overlapChars:
      typeof overlap === 'number' && !Number.isNaN(overlap)
        ? Math.max(0, Math.floor(overlap))
        : DEFAULT_OPTIONS.overlapChars,
    preserveParagraphs:
      options.preserveParagraphs ?? DEFAULT_OPTIONS.preserveParagraphs,
  };
}

/**
 * Collapse `maxTokens` and `maxChars` into one character budget.
 *
 * Because the token estimate is `ceil(length / 4)`, the condition
 * `estimateTokens(text) <= maxTokens` is equivalent to
 * `text.length <= floor(maxTokens) * 4`, so the stricter of the two limits
 * can be expressed as a single length. The result is always at least 1 so
 * that splitting is guaranteed to make progress.
 */
function charBudget(opts: Required<ChunkingOptions>): number {
  const tokenChars = Math.floor(opts.maxTokens) * CHARS_PER_TOKEN;
  return Math.max(1, Math.floor(Math.min(opts.maxChars, tokenChars)));
}

/**
 * Split text into paragraphs.
 *
 * Paragraphs are separated by one or more blank lines (LF or CRLF; a line
 * containing only spaces/tabs counts as blank). The separators themselves
 * are dropped and whitespace-only paragraphs are discarded.
 */
function splitIntoParagraphs(text: string): string[] {
  return text.split(PARAGRAPH_BREAK).filter((p) => p.trim().length > 0);
}

/**
 * Split text into sentences (simple heuristic)
 *
 * Breaks on whitespace that follows `.`, `!` or `?`. The punctuation stays
 * attached to its sentence. Abbreviations such as "e.g. " are still treated
 * as sentence ends.
 */
function splitIntoSentences(text: string): string[] {
  return text
    .split(SENTENCE_BREAK)
    .map((s) => s.trim())
    .filter((s) => s.length > 0);
}

/**
 * Cut text into windows of at most `limit` characters.
 *
 * Prefers to cut at the last whitespace inside the window; falls back to a
 * hard cut when the window contains none. A hard cut never separates a
 * UTF-16 surrogate pair unless `limit` is 1.
 */
function hardSplit(text: string, limit: number): string[] {
  const windows: string[] = [];
  let rest = text.trim();

  while (rest.length > limit) {
    let cut = limit;
    let atWhitespace = false;
    for (let i = limit; i > 0; i--) {
      if (WHITESPACE.test(rest.charAt(i))) {
        cut = i;
        atWhitespace = true;
        break;
      }
    }

    if (!atWhitespace && cut > 1) {
      const code = rest.charCodeAt(cut - 1);
      // Do not strand a high surrogate at the end of a window.
      if (code >= 0xd800 && code <= 0xdbff) {
        cut -= 1;
      }
    }

    // `rest` never starts with whitespace and `cut >= 1`, so each window is
    // non-empty and `rest` strictly shrinks.
    windows.push(rest.slice(0, cut).trimEnd());
    rest = rest.slice(cut).trimStart();
  }

  if (rest.length > 0) {
    windows.push(rest);
  }
  return windows;
}

/**
 * Break a unit that exceeds `budget` into pieces of at most `pieceLimit`.
 *
 * Sentences are packed greedily (joined by a single space); a sentence that
 * is itself too long is cut with {@link hardSplit}. Units that already fit
 * the budget are returned untouched.
 */
function splitOversizedUnit(
  unit: string,
  budget: number,
  pieceLimit: number
): string[] {
  if (unit.length <= budget) {
    return [unit];
  }

  const pieces: string[] = [];
  let buffer = '';

  for (const sentence of splitIntoSentences(unit)) {
    const candidate = buffer ? `${buffer} ${sentence}` : sentence;
    if (candidate.length <= pieceLimit) {
      buffer = candidate;
      continue;
    }

    if (buffer.length > 0) {
      pieces.push(buffer);
    }

    if (sentence.length <= pieceLimit) {
      buffer = sentence;
    } else {
      const windows = hardSplit(sentence, pieceLimit);
      // Keep the last window open so following sentences can join it.
      buffer = windows.pop() ?? '';
      pieces.push(...windows);
    }
  }

  if (buffer.length > 0) {
    pieces.push(buffer);
  }
  return pieces;
}

/**
 * Take up to `size` trailing characters of `previous` as overlap text.
 *
 * When the cut lands in the middle of a word, the partial word is dropped so
 * the overlap starts at a word boundary. If the tail is one unbroken word it
 * is kept as-is rather than discarding the overlap entirely.
 */
function takeOverlap(previous: string, size: number): string {
  if (size <= 0) {
    return '';
  }
  if (previous.length <= size) {
    // The whole chunk is the overlap; it already starts at a boundary.
    return previous.trimEnd();
  }

  const start = previous.length - size;
  const tail = previous.slice(start);

  if (WHITESPACE.test(previous.charAt(start - 1))) {
    return tail.trim();
  }

  const boundary = tail.search(WHITESPACE);
  if (boundary === -1) {
    return tail;
  }
  return tail.slice(boundary + 1).trim();
}

/**
 * Split an oversized fenced code block into pieces of at most `limit` chars.
 *
 * Lines are packed greedily and joined with `\n`; a single line longer than
 * the limit is cut with {@link hardSplit}. Pieces are not re-wrapped in
 * fences, so only the first and last piece carry the original backticks.
 */
function splitCodeBlock(block: string, limit: number): string[] {
  const pieces: string[] = [];
  let buffer = '';

  for (const line of block.split('\n')) {
    const candidate = buffer ? `${buffer}\n${line}` : line;
    if (candidate.length <= limit) {
      buffer = candidate;
      continue;
    }

    if (buffer.length > 0) {
      pieces.push(buffer);
    }

    if (line.length <= limit) {
      buffer = line;
    } else {
      const windows = hardSplit(line, limit);
      buffer = windows.pop() ?? '';
      pieces.push(...windows);
    }
  }

  if (buffer.length > 0) {
    pieces.push(buffer);
  }
  return pieces;
}

/**
 * Chunk text into semantic pieces suitable for embedding
 *
 * Strategy:
 * 1. Split by paragraphs (if preserveParagraphs = true), otherwise sentences
 * 2. Break any unit that is too large on its own (sentences, then whitespace,
 *    then hard character windows)
 * 3. Group units until reaching maxTokens/maxChars
 * 4. Add overlap between chunks for context continuity
 *
 * Every returned chunk satisfies both `maxTokens` (as estimated) and
 * `maxChars`. The overlap is capped at half the chunk budget, is shortened
 * when the next unit leaves less room, and is separated from the following
 * unit by the same separator used between units (`\n\n` for paragraphs, a
 * space for sentences).
 *
 * Side effect: logs a one-line summary via `console.log` whenever the input
 * does not fit in a single chunk.
 *
 * @param content - Text to chunk
 * @param options - Chunking options
 * @returns Array of text chunks with metadata; empty for blank input
 *
 * @example
 * ```typescript
 * const chunks = chunkText(longDocument, { maxTokens: 500, overlapChars: 100 });
 * for (const chunk of chunks) {
 *   console.log(`Chunk ${chunk.index}: ${chunk.estimatedTokens} tokens`);
 *   await embedText(chunk.text);
 * }
 * ```
 */
export function chunkText(
  content: string,
  options: ChunkingOptions = {}
): TextChunk[] {
  if (!content) {
    return [];
  }

  // Clean up content
  const cleanedContent = content.trim();
  if (cleanedContent.length === 0) {
    return [];
  }

  const opts = resolveOptions(options);
  const budget = charBudget(opts);

  // If content is small enough (for BOTH limits), return as single chunk
  if (cleanedContent.length <= budget) {
    return [makeChunk(0, cleanedContent)];
  }

  const separator = opts.preserveParagraphs ? '\n\n' : ' ';
  // An overlap of the full budget would leave no room for new text.
  const overlap = Math.min(opts.overlapChars, Math.floor(budget / 2));
  // Pieces cut from oversized units are sized so the overlap still fits.
  const pieceLimit =
    overlap > 0 ? Math.max(1, budget - overlap - separator.length) : budget;

  const rawUnits = opts.preserveParagraphs
    ? splitIntoParagraphs(cleanedContent)
    : splitIntoSentences(cleanedContent);
  const units = rawUnits.flatMap((unit) =>
    splitOversizedUnit(unit, budget, pieceLimit)
  );

  const chunks: TextChunk[] = [];
  let currentChunk = '';

  for (const unit of units) {
    if (currentChunk.length === 0) {
      currentChunk = unit;
      continue;
    }

    const potentialChunk = `${currentChunk}${separator}${unit}`;
    if (potentialChunk.length <= budget) {
      // Add unit to current chunk
      currentChunk = potentialChunk;
      continue;
    }

    chunks.push(makeChunk(chunks.length, currentChunk));

    // Start the next chunk with as much overlap as the unit leaves room for.
    const room = budget - unit.length - separator.length;
    const carried = takeOverlap(currentChunk, Math.min(overlap, room));
    currentChunk =
      carried.length > 0 ? `${carried}${separator}${unit}` : unit;
  }

  // Add final chunk if it has content
  if (currentChunk.length > 0) {
    chunks.push(makeChunk(chunks.length, currentChunk));
  }

  console.log(
    `[Chunking] Split ${cleanedContent.length} chars into ${chunks.length} chunks`
  );

  return chunks;
}

/**
 * Chunk text with code-aware splitting
 *
 * Preserves fenced code blocks (triple backticks) and tries to keep them
 * intact: each block becomes its own chunk, and the prose between blocks is
 * chunked with {@link chunkText}. A block that exceeds the chunk limits is
 * split on line boundaries (and, for a single over-long line, on character
 * windows); the resulting pieces are not re-wrapped in fences.
 *
 * Useful for chunking AI chat transcripts that contain code snippets.
 * An unterminated fence is treated as ordinary text.
 *
 * @param content - Text to chunk
 * @param options - Chunking options
 * @returns Chunks in document order with contiguous 0-based indices
 */
export function chunkTextWithCodeAwareness(
  content: string,
  options: ChunkingOptions = {}
): TextChunk[] {
  if (!content) {
    return [];
  }

  const fences = [...content.matchAll(CODE_FENCE)];

  // If no code blocks, use standard chunking
  if (fences.length === 0) {
    return chunkText(content, options);
  }

  const opts = resolveOptions(options);
  const budget = charBudget(opts);

  const texts: string[] = [];
  let currentPosition = 0;

  for (const fence of fences) {
    const start = fence.index ?? currentPosition;
    const block = fence[0];

    // Prose before the code block (chunkText returns [] for blank text)
    for (const chunk of chunkText(content.slice(currentPosition, start), opts)) {
      texts.push(chunk.text);
    }

    // Code block as its own chunk, or split if too large
    if (block.length <= budget) {
      texts.push(block);
    } else {
      texts.push(...splitCodeBlock(block, budget));
    }

    currentPosition = start + block.length;
  }

  // Remaining prose after the last code block
  for (const chunk of chunkText(content.slice(currentPosition), opts)) {
    texts.push(chunk.text);
  }

  return texts.map((text, index) => makeChunk(index, text));
}