/**
 * Code-specific chunking for source code files
 * Intelligently splits code while preserving context
 */

/**
 * A slice of a source file together with positional metadata.
 */
export interface CodeChunk {
  /** Text of the chunk. */
  content: string;
  metadata: {
    /** Position of this chunk in the result (0 for the first chunk). */
    chunkIndex: number;
    /** Number of chunks produced for the file. */
    totalChunks: number;
    /** First line covered by the chunk (1 for the start of the file). */
    startLine: number;
    /** Last line covered by the chunk. */
    endLine: number;
    /** Approximate token count, as computed by `estimateTokens`. */
    tokenCount: number;
    /** Path passed in through `CodeChunkOptions.filePath`. */
    filePath: string;
    /** Language derived from the file extension; absent when unrecognised. */
    language?: string;
  };
}

/**
 * Options accepted by `chunkCode`.
 */
export interface CodeChunkOptions {
  /** Upper bound for a chunk, in characters (defaults to 3000 in `chunkCode`). */
  maxChunkSize?: number; // characters
  /** Overlap between chunks, in lines (defaults to 5 in `chunkCode`). */
  chunkOverlap?: number; // lines
  /** Defaults to `true` in `chunkCode`. */
  preserveFunctions?: boolean;
  /** Defaults to `true` in `chunkCode`. */
  preserveClasses?: boolean;
  /** Path of the file being chunked; also used to detect the language. */
  filePath: string;
}

/**
 * Estimate token count (rough approximation: 1 token ≈ 4 characters)
 *
 * @param text - Text to measure.
 * @returns `text.length / 4`, rounded up; 0 for an empty string.
 */
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

/**
 * Detect language from file path
 *
 * The extension is taken from the final path segment only and is matched
 * case-insensitively.
 *
 * @param filePath - File path or bare file name; `/` and `\` both separate segments.
 * @returns The language name, or `undefined` when the file has no extension
 *          or the extension is not in the table.
 */
function detectLanguage(filePath: string): string | undefined {
  const langMap: Record<string, string> = {
    ts: 'typescript',
    tsx: 'typescript',
    js: 'javascript',
    jsx: 'javascript',
    py: 'python',
    java: 'java',
    go: 'go',
    rs: 'rust',
    cpp: 'cpp',
    c: 'c',
    cs: 'csharp',
    rb: 'ruby',
    php: 'php',
    swift: 'swift',
    kt: 'kotlin',
    sql: 'sql',
    css: 'css',
    scss: 'scss',
    html: 'html',
    json: 'json',
    yaml: 'yaml',
    yml: 'yaml',
    md: 'markdown',
  };

  // Look at the last path segment only, so a dot in a directory name is
  // never mistaken for the start of an extension.
  const fileName = filePath.split(/[\\/]/).pop() ?? '';
  const dotIndex = fileName.lastIndexOf('.');
  if (dotIndex === -1) {
    // No extension at all: a file literally named "go" or "c" is not source
    // code of that language.
    return undefined;
  }

  const ext = fileName.slice(dotIndex + 1).toLowerCase();

  // Own-property check: a plain object lookup would otherwise resolve
  // inherited keys such as "constructor" or "__proto__" to non-string values.
  return Object.prototype.hasOwnProperty.call(langMap, ext)
    ? langMap[ext]
    : undefined;
}

/**
 * Chunk source code file intelligently
 */
export function chunkCode(
  content: string,
  options: CodeChunkOptions
): CodeChunk[] {
  const {
    maxChunkSize = 3000, // Larger chunks for code context
    chunkOverlap = 5,
    preserveFunctions = true,
    preserveClasses = true,
    filePath,
  } = options;

  const language = detectLanguage(filePath);
  const lines = content.split('\n');

  // For small files, return as single chunk
  if (content.length <= maxChunkSize) {
    return [
      {
        content,
        metadata: {
          chunkIndex: 0,
          totalChunks: 1,
          startLine: 1,
          endLine: lines.length,
          tokenCount: estimateTokens(content),
          filePath,
          language,
        },
      },
    ];
  }

  // For larger files, split by logical boundaries
  const chunks: CodeChunk[] = [];
  let currentChunk: string[] = [];
  let currentSize = 0;
  let chunkStartLine = 1;

  // Patterns for detecting logical
boundaries const functionPattern = /^\s*(function|def|fn|func|fun|public|private|protected|static|async|export)\s/; const classPattern = /^\s*(class|interface|struct|enum|type)\s/; const importPattern = /^\s*(import|from|require|using|include)\s/; const commentPattern = /^\s*(\/\/|\/\*|\*|#|--|