/**
 * Code-specific chunking for source code files.
 * Intelligently splits code while preserving context.
 */
/**
 * A single chunk of a source file, plus metadata locating it in the original.
 */
export interface CodeChunk {
  /** Chunk text; continuation chunks may include a prepended file header and marker. */
  content: string;
  metadata: {
    /** 0-based position of this chunk within the file's chunk list. */
    chunkIndex: number;
    /** Total number of chunks produced for the file. */
    totalChunks: number;
    /** 1-based first original line covered by this chunk. */
    startLine: number;
    /** 1-based last original line covered by this chunk. */
    endLine: number;
    /** Approximate token count (1 token ≈ 4 characters). */
    tokenCount: number;
    /** Path of the source file this chunk came from. */
    filePath: string;
    /** Language detected from the file extension, if recognized. */
    language?: string;
  };
}
/**
 * Options controlling how a source file is chunked.
 */
export interface CodeChunkOptions {
  /** Maximum chunk size in characters (default 3000). */
  maxChunkSize?: number; // characters
  /** Trailing lines repeated at the start of the next chunk for context (default 5). */
  chunkOverlap?: number; // lines
  /** Prefer splitting at function boundaries (default true) — TODO confirm chunkCode honors this. */
  preserveFunctions?: boolean;
  /** Prefer splitting at class/type boundaries (default true) — TODO confirm chunkCode honors this. */
  preserveClasses?: boolean;
  /** Path of the file being chunked; drives language detection and metadata. */
  filePath: string;
}
/**
|
|
* Estimate token count (rough approximation: 1 token ≈ 4 characters)
|
|
*/
|
|
function estimateTokens(text: string): number {
|
|
return Math.ceil(text.length / 4);
|
|
}
|
|
|
|
/**
|
|
* Detect language from file path
|
|
*/
|
|
function detectLanguage(filePath: string): string | undefined {
|
|
const ext = filePath.split('.').pop()?.toLowerCase();
|
|
const langMap: Record<string, string> = {
|
|
ts: 'typescript',
|
|
tsx: 'typescript',
|
|
js: 'javascript',
|
|
jsx: 'javascript',
|
|
py: 'python',
|
|
java: 'java',
|
|
go: 'go',
|
|
rs: 'rust',
|
|
cpp: 'cpp',
|
|
c: 'c',
|
|
cs: 'csharp',
|
|
rb: 'ruby',
|
|
php: 'php',
|
|
swift: 'swift',
|
|
kt: 'kotlin',
|
|
sql: 'sql',
|
|
css: 'css',
|
|
scss: 'scss',
|
|
html: 'html',
|
|
json: 'json',
|
|
yaml: 'yaml',
|
|
yml: 'yaml',
|
|
md: 'markdown',
|
|
};
|
|
return langMap[ext || ''];
|
|
}
|
|
|
|
/**
|
|
* Chunk source code file intelligently
|
|
*/
|
|
export function chunkCode(
|
|
content: string,
|
|
options: CodeChunkOptions
|
|
): CodeChunk[] {
|
|
const {
|
|
maxChunkSize = 3000, // Larger chunks for code context
|
|
chunkOverlap = 5,
|
|
preserveFunctions = true,
|
|
preserveClasses = true,
|
|
filePath,
|
|
} = options;
|
|
|
|
const language = detectLanguage(filePath);
|
|
const lines = content.split('\n');
|
|
|
|
// For small files, return as single chunk
|
|
if (content.length <= maxChunkSize) {
|
|
return [
|
|
{
|
|
content,
|
|
metadata: {
|
|
chunkIndex: 0,
|
|
totalChunks: 1,
|
|
startLine: 1,
|
|
endLine: lines.length,
|
|
tokenCount: estimateTokens(content),
|
|
filePath,
|
|
language,
|
|
},
|
|
},
|
|
];
|
|
}
|
|
|
|
// For larger files, split by logical boundaries
|
|
const chunks: CodeChunk[] = [];
|
|
let currentChunk: string[] = [];
|
|
let currentSize = 0;
|
|
let chunkStartLine = 1;
|
|
|
|
// Patterns for detecting logical boundaries
|
|
const functionPattern = /^\s*(function|def|fn|func|fun|public|private|protected|static|async|export)\s/;
|
|
const classPattern = /^\s*(class|interface|struct|enum|type)\s/;
|
|
const importPattern = /^\s*(import|from|require|using|include)\s/;
|
|
const commentPattern = /^\s*(\/\/|\/\*|\*|#|--|<!--)/;
|
|
|
|
// Always include file header (imports, comments at top)
|
|
let headerLines: string[] = [];
|
|
for (let i = 0; i < Math.min(20, lines.length); i++) {
|
|
const line = lines[i];
|
|
if (importPattern.test(line) || commentPattern.test(line) || line.trim() === '') {
|
|
headerLines.push(line);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (let i = 0; i < lines.length; i++) {
|
|
const line = lines[i];
|
|
const lineSize = line.length + 1; // +1 for newline
|
|
|
|
// Check if we should start a new chunk
|
|
const shouldSplit =
|
|
currentSize + lineSize > maxChunkSize &&
|
|
currentChunk.length > 0 &&
|
|
(functionPattern.test(line) ||
|
|
classPattern.test(line) ||
|
|
(line.trim() === '' && currentSize > maxChunkSize * 0.7));
|
|
|
|
if (shouldSplit) {
|
|
// Save current chunk
|
|
const chunkContent = currentChunk.join('\n');
|
|
chunks.push({
|
|
content: chunkContent,
|
|
metadata: {
|
|
chunkIndex: chunks.length,
|
|
totalChunks: 0, // Will update at end
|
|
startLine: chunkStartLine,
|
|
endLine: chunkStartLine + currentChunk.length - 1,
|
|
tokenCount: estimateTokens(chunkContent),
|
|
filePath,
|
|
language,
|
|
},
|
|
});
|
|
|
|
// Start new chunk with overlap and header
|
|
const overlapStart = Math.max(0, currentChunk.length - chunkOverlap);
|
|
currentChunk = [
|
|
...headerLines,
|
|
'',
|
|
`// ... continued from line ${chunkStartLine}`,
|
|
'',
|
|
...currentChunk.slice(overlapStart),
|
|
];
|
|
currentSize = currentChunk.reduce((sum, l) => sum + l.length + 1, 0);
|
|
chunkStartLine = chunkStartLine + overlapStart;
|
|
}
|
|
|
|
currentChunk.push(line);
|
|
currentSize += lineSize;
|
|
}
|
|
|
|
// Add final chunk
|
|
if (currentChunk.length > 0) {
|
|
const chunkContent = currentChunk.join('\n');
|
|
chunks.push({
|
|
content: chunkContent,
|
|
metadata: {
|
|
chunkIndex: chunks.length,
|
|
totalChunks: 0,
|
|
startLine: chunkStartLine,
|
|
endLine: lines.length,
|
|
tokenCount: estimateTokens(chunkContent),
|
|
filePath,
|
|
language,
|
|
},
|
|
});
|
|
}
|
|
|
|
// Update totalChunks for all chunks
|
|
chunks.forEach((chunk) => {
|
|
chunk.metadata.totalChunks = chunks.length;
|
|
});
|
|
|
|
return chunks;
|
|
}
|
|
|
|
/**
|
|
* Generate a summary header for a code file
|
|
*/
|
|
export function generateCodeSummary(
|
|
filePath: string,
|
|
content: string,
|
|
language?: string
|
|
): string {
|
|
const lines = content.split('\n');
|
|
const functions = lines.filter(line => /^\s*(function|def|fn|func|async function|export function)/.test(line));
|
|
const classes = lines.filter(line => /^\s*(class|interface|struct|enum|type)\s/.test(line));
|
|
|
|
let summary = `File: ${filePath}\n`;
|
|
if (language) {
|
|
summary += `Language: ${language}\n`;
|
|
}
|
|
summary += `Lines: ${lines.length}\n`;
|
|
|
|
if (functions.length > 0) {
|
|
summary += `Functions: ${functions.length}\n`;
|
|
}
|
|
if (classes.length > 0) {
|
|
summary += `Classes/Types: ${classes.length}\n`;
|
|
}
|
|
|
|
return summary;
|
|
}
|
|
|