/**
 * Document Chunking Utility
 *
 * Splits large documents into manageable chunks for AI processing.
 * Chunk boundaries prefer paragraph breaks, then sentence ends, then line
 * breaks, then spaces. Consecutive chunks share a configurable overlap so
 * each chunk carries some of the preceding context.
 */

export interface ChunkMetadata {
  /** Zero-based position of the chunk within its document. */
  chunkIndex: number;
  /** Number of chunks produced for the document. */
  totalChunks: number;
  /** Offset into the original content where the chunk starts (inclusive). */
  startChar: number;
  /** Offset into the original content where the chunk ends (exclusive). */
  endChar: number;
  /** Estimated token count of the chunk's `content`. */
  tokenCount: number;
}

export interface DocumentChunk {
  /** Chunk text with surrounding whitespace trimmed. */
  content: string;
  metadata: ChunkMetadata;
}

export interface ChunkingOptions {
  maxChunkSize?: number; // Maximum characters per chunk (default: 2000)
  chunkOverlap?: number; // Overlap between chunks (default: 200)
  preserveParagraphs?: boolean; // Try to keep paragraphs intact (default: true)
  preserveCodeBlocks?: boolean; // Keep code blocks together (default: true)
}

const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxChunkSize: 2000,
  chunkOverlap: 200,
  preserveParagraphs: true,
  preserveCodeBlocks: true,
};

/** Location of one code-block placeholder inside placeholder-substituted text. */
interface PlaceholderSpan {
  /** Index of the placeholder's first character. */
  start: number;
  /** Index just past the placeholder's last character. */
  end: number;
  /** The fenced code block the placeholder stands for. */
  code: string;
}

/**
 * Estimate token count (rough approximation: 1 token ≈ 4 characters)
 */
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

/**
 * Normalise a numeric option: missing or non-finite values fall back to the
 * default, everything else is floored and clamped to `minimum`.
 */
function sanitizeSize(value: number | undefined, fallback: number, minimum: number): number {
  if (value === undefined || !Number.isFinite(value)) {
    return fallback;
  }
  return Math.max(minimum, Math.floor(value));
}

/**
 * Find a good split point at or before `idealSplit`.
 *
 * Candidates are tried in order: paragraph break (within 500 characters of
 * the ideal split), sentence end (within 300), newline (within 200), and
 * finally the last space. If none qualifies, `idealSplit` itself is returned.
 *
 * @param text - Text being split.
 * @param idealSplit - Largest acceptable split index.
 * @param minSplit - A separator must start after this index to be used, so
 *   the caller can rule out split points that would add no new content.
 * @param preferParagraphs - When false, the paragraph-break step is skipped.
 * @returns The index at which the current chunk should end (exclusive).
 */
function findSplitPoint(
  text: string,
  idealSplit: number,
  minSplit = 0,
  preferParagraphs = true
): number {
  // Search from `idealSplit - separator.length` so that the split, which sits
  // just after the separator, never lands beyond `idealSplit`.
  const lastBefore = (separator: string): number =>
    text.lastIndexOf(separator, idealSplit - separator.length);

  if (preferParagraphs) {
    const paragraphBreak = lastBefore('\n\n');
    if (paragraphBreak > idealSplit - 500 && paragraphBreak > minSplit) {
      return paragraphBreak + 2;
    }
  }

  const sentenceEnd = lastBefore('. ');
  if (sentenceEnd > idealSplit - 300 && sentenceEnd > minSplit) {
    return sentenceEnd + 2;
  }

  const newline = lastBefore('\n');
  if (newline > idealSplit - 200 && newline > minSplit) {
    return newline + 1;
  }

  // Last resort: split at a space, but only one that lies past `minSplit`.
  const space = lastBefore(' ');
  return space > minSplit ? space + 1 : idealSplit;
}

/**
 * Replace every fenced code block with a `__CODE_BLOCK_n__` placeholder.
 *
 * Placeholder names that already occur literally in `text` are skipped, so a
 * key of the returned map never matches text the author wrote.
 *
 * @returns The substituted text and a map from placeholder to code block.
 */
function extractCodeBlocks(text: string): { text: string; codeBlocks: Map<string, string> } {
  const codeBlocks = new Map<string, string>();

  // Collect placeholder-looking names already present in the document. The
  // trailing "__" is a lookahead so it can also begin the next candidate.
  const taken = new Set<string>();
  const literalPattern = /__CODE_BLOCK_\d+(?=__)/g;
  let literal: RegExpExecArray | null;
  while ((literal = literalPattern.exec(text)) !== null) {
    taken.add(`${literal[0]}__`);
  }

  let counter = 0;
  const processedText = text.replace(/```[\s\S]*?```/g, (match) => {
    let placeholder = `__CODE_BLOCK_${counter++}__`;
    while (taken.has(placeholder)) {
      placeholder = `__CODE_BLOCK_${counter++}__`;
    }
    codeBlocks.set(placeholder, match);
    return placeholder;
  });

  return { text: processedText, codeBlocks };
}

/**
 * Locate, in document order, every placeholder in `text` that is a key of
 * `codeBlocks`. Placeholder-shaped text that is not a key is ignored.
 */
function findPlaceholders(text: string, codeBlocks: Map<string, string>): PlaceholderSpan[] {
  const spans: PlaceholderSpan[] = [];
  if (codeBlocks.size === 0) {
    return spans;
  }

  const pattern = /__CODE_BLOCK_\d+(?=__)/g;
  let match: RegExpExecArray | null;
  while ((match = pattern.exec(text)) !== null) {
    const key = `${match[0]}__`;
    const code = codeBlocks.get(key);
    if (code !== undefined) {
      const end = match.index + key.length;
      spans.push({ start: match.index, end, code });
      // Resume after the whole placeholder, including its trailing "__".
      pattern.lastIndex = end;
    }
  }
  return spans;
}

/**
 * Restore code blocks.
 *
 * The code is spliced in by position rather than through `String.replace`
 * with a replacement string, so `$$`, `$&`, `$1` and similar sequences inside
 * a code block are kept verbatim, and restored code is never rescanned.
 */
function restoreCodeBlocks(text: string, codeBlocks: Map<string, string>): string {
  const spans = findPlaceholders(text, codeBlocks);
  if (spans.length === 0) {
    return text;
  }

  let result = '';
  let cursor = 0;
  for (const span of spans) {
    result += text.slice(cursor, span.start) + span.code;
    cursor = span.end;
  }
  return result + text.slice(cursor);
}

/**
 * Move `index` out of a placeholder if it falls strictly inside one.
 *
 * The index is moved to the placeholder's start when that lies after `floor`,
 * otherwise to its end, so the result is always greater than `floor` whenever
 * `index` was.
 */
function snapOutOfPlaceholder(spans: PlaceholderSpan[], index: number, floor: number): number {
  for (const span of spans) {
    if (span.start >= index) {
      break; // spans are ordered; no later span can contain `index`
    }
    if (index < span.end) {
      return span.start > floor ? span.start : span.end;
    }
  }
  return index;
}

/**
 * Translate an index in the placeholder-substituted text into the matching
 * index in the original content. `index` must not lie inside a placeholder.
 */
function toOriginalOffset(spans: PlaceholderSpan[], index: number): number {
  let offset = index;
  for (const span of spans) {
    if (span.end > index) {
      break;
    }
    offset += span.code.length - (span.end - span.start);
  }
  return offset;
}

/**
 * Split a document into semantic chunks.
 *
 * Every chunk satisfies
 * `chunk.content === content.substring(startChar, endChar).trim()`.
 * Because a preserved code block is never split, a chunk that contains one
 * can be longer than `maxChunkSize`.
 *
 * @param content - Document text; empty or whitespace-only input yields `[]`.
 * @param options - Chunking options. Missing or non-finite sizes fall back to
 *   the defaults; `maxChunkSize` is clamped to at least 1 and `chunkOverlap`
 *   to at least 0.
 * @returns The chunks in document order, with `totalChunks` filled in.
 */
export function chunkDocument(content: string, options: ChunkingOptions = {}): DocumentChunk[] {
  const chunks: DocumentChunk[] = [];

  // Handle empty content
  if (!content || content.trim().length === 0) {
    return chunks;
  }

  const merged = { ...DEFAULT_OPTIONS, ...options };
  const maxChunkSize = sanitizeSize(merged.maxChunkSize, DEFAULT_OPTIONS.maxChunkSize, 1);
  const chunkOverlap = sanitizeSize(merged.chunkOverlap, DEFAULT_OPTIONS.chunkOverlap, 0);
  const preserveParagraphs =
    merged.preserveParagraphs === undefined
      ? DEFAULT_OPTIONS.preserveParagraphs
      : merged.preserveParagraphs;
  const preserveCodeBlocks =
    merged.preserveCodeBlocks === undefined
      ? DEFAULT_OPTIONS.preserveCodeBlocks
      : merged.preserveCodeBlocks;

  // Swap code blocks for short placeholders so the splitter treats each one
  // as a single unbreakable token.
  let processedContent = content;
  let codeBlocks = new Map<string, string>();
  if (preserveCodeBlocks) {
    const extracted = extractCodeBlocks(content);
    processedContent = extracted.text;
    codeBlocks = extracted.codeBlocks;
  }
  const placeholders = findPlaceholders(processedContent, codeBlocks);

  const totalLength = processedContent.length;
  let position = 0;
  let previousEnd = 0;

  while (position < totalLength) {
    // If the remaining content fits in one chunk, take it all. Otherwise look
    // for a split point that lies beyond what the previous chunk covered.
    let end = totalLength;
    if (totalLength - position > maxChunkSize) {
      const candidate = findSplitPoint(
        processedContent,
        position + maxChunkSize,
        previousEnd,
        preserveParagraphs
      );
      end = snapOutOfPlaceholder(placeholders, candidate, position);
    }

    const text = restoreCodeBlocks(processedContent.substring(position, end), codeBlocks).trim();
    if (text.length > 0) {
      chunks.push({
        content: text,
        metadata: {
          chunkIndex: chunks.length,
          totalChunks: 0, // Will be updated after loop
          startChar: toOriginalOffset(placeholders, position),
          endChar: toOriginalOffset(placeholders, end),
          tokenCount: estimateTokens(text),
        },
      });
    }

    if (end >= totalLength) {
      break;
    }

    // Step back by the overlap, but never to or before the current start:
    // when the overlap would swallow the whole chunk it is dropped, which
    // guarantees the loop terminates.
    const overlapStart = snapOutOfPlaceholder(placeholders, end - chunkOverlap, position);
    position = overlapStart > position ? overlapStart : end;
    previousEnd = end;
  }

  // Update totalChunks in all metadata
  const totalChunks = chunks.length;
  for (const chunk of chunks) {
    chunk.metadata.totalChunks = totalChunks;
  }

  return chunks;
}

/**
 * A chunk annotated with the document it came from.
 */
export interface SourcedChunk extends DocumentChunk {
  sourceFilename: string;
  sourceMimeType?: string;
}

/**
 * Chunk multiple documents and return with source tracking.
 *
 * @param documents - Documents to chunk; each is chunked independently.
 * @param options - Chunking options applied to every document.
 * @returns All chunks, grouped by document in input order.
 */
export function chunkDocuments(
  documents: Array<{ filename: string; content: string; mimeType?: string }>,
  options: ChunkingOptions = {}
): SourcedChunk[] {
  const allChunks: SourcedChunk[] = [];

  for (const doc of documents) {
    for (const chunk of chunkDocument(doc.content, options)) {
      allChunks.push({
        ...chunk,
        sourceFilename: doc.filename,
        sourceMimeType: doc.mimeType,
      });
    }
  }

  return allChunks;
}