VIBN Frontend for Coolify deployment
This commit is contained in:
223
lib/utils/code-chunker.ts
Normal file
223
lib/utils/code-chunker.ts
Normal file
@@ -0,0 +1,223 @@
|
||||
/**
|
||||
* Code-specific chunking for source code files
|
||||
* Intelligently splits code while preserving context
|
||||
*/
|
||||
|
||||
/**
 * One slice of a source file together with bookkeeping about where the slice
 * sits inside that file.
 */
export interface CodeChunk {
  /** Text of the chunk. */
  content: string;

  /** Positional and descriptive information about the chunk. */
  metadata: {
    /** Zero-based position of this chunk among the chunks of its file. */
    chunkIndex: number;

    /** Number of chunks the file was split into. */
    totalChunks: number;

    /** One-based number of the first source line covered by the chunk. */
    startLine: number;

    /** One-based number of the last source line covered by the chunk. */
    endLine: number;

    /** Estimated token count of `content`. */
    tokenCount: number;

    /** Path of the file the chunk was taken from. */
    filePath: string;

    /** Language detected for the file, when one could be determined. */
    language?: string;
  };
}
|
||||
|
||||
/**
 * Settings accepted by `chunkCode`.
 */
export interface CodeChunkOptions {
  /** Size budget for a single chunk, measured in characters. */
  maxChunkSize?: number;

  /** Number of trailing lines repeated at the start of the next chunk. */
  chunkOverlap?: number;

  /**
   * Presumably requests that function definitions stay intact.
   * NOTE(review): the visible `chunkCode` implementation does not consult
   * this flag when choosing split points - confirm the intended effect.
   */
  preserveFunctions?: boolean;

  /**
   * Presumably requests that class definitions stay intact.
   * NOTE(review): the visible `chunkCode` implementation does not consult
   * this flag when choosing split points - confirm the intended effect.
   */
  preserveClasses?: boolean;

  /** Path of the file being chunked; used for language detection and metadata. */
  filePath: string;
}
|
||||
|
||||
/**
 * Approximate how many tokens a piece of text occupies.
 *
 * Applies the rough heuristic of four characters per token and rounds up, so
 * any non-empty string counts as at least one token.
 *
 * @param text - Text to measure.
 * @returns The estimated token count (0 for an empty string).
 */
function estimateTokens(text: string): number {
  const charsPerToken = 4;
  const rawEstimate = text.length / charsPerToken;
  return Math.ceil(rawEstimate);
}
|
||||
|
||||
/**
 * Lower-case file extension (without the dot) mapped to a language identifier.
 *
 * A `Map` is used instead of a plain object so that extensions such as
 * `constructor` or `__proto__` cannot resolve to inherited prototype members.
 */
const LANGUAGE_BY_EXTENSION: ReadonlyMap<string, string> = new Map([
  ['ts', 'typescript'],
  ['tsx', 'typescript'],
  ['js', 'javascript'],
  ['jsx', 'javascript'],
  ['py', 'python'],
  ['java', 'java'],
  ['go', 'go'],
  ['rs', 'rust'],
  ['cpp', 'cpp'],
  ['c', 'c'],
  ['cs', 'csharp'],
  ['rb', 'ruby'],
  ['php', 'php'],
  ['swift', 'swift'],
  ['kt', 'kotlin'],
  ['sql', 'sql'],
  ['css', 'css'],
  ['scss', 'scss'],
  ['html', 'html'],
  ['json', 'json'],
  ['yaml', 'yaml'],
  ['yml', 'yaml'],
  ['md', 'markdown'],
]);

/**
 * Detect the language of a file from the extension of its path.
 *
 * Only the final path segment is inspected, so dots inside directory names are
 * ignored and a file without any dot in its name (for example `go` or
 * `Makefile`) is reported as unknown. The comparison is case-insensitive.
 *
 * @param filePath - Path of the file; both `/` and `\` are accepted as separators.
 * @returns The language identifier, or `undefined` when the extension is
 *   missing or not recognised.
 */
function detectLanguage(filePath: string): string | undefined {
  const baseName = filePath.split(/[\\/]/).pop() ?? '';
  const dotIndex = baseName.lastIndexOf('.');
  if (dotIndex === -1) {
    // No dot in the file name means there is no extension to look up.
    return undefined;
  }

  const extension = baseName.slice(dotIndex + 1).toLowerCase();
  return LANGUAGE_BY_EXTENSION.get(extension);
}
|
||||
|
||||
/**
 * Split a source file into chunks along logical boundaries.
 *
 * Content no longer than `maxChunkSize` characters is returned as one chunk.
 * Longer content is walked line by line; once adding the next line would
 * exceed the budget, a split happens at the next line that looks like the
 * start of a function or type declaration, or at a blank line when the chunk
 * is already more than 70% full. A chunk can therefore exceed `maxChunkSize`
 * when no such boundary is reached in time.
 *
 * Every chunk after the first is prefixed with the file header (leading
 * imports, comments and blank lines found within the first 20 lines), a
 * `// ... continued from line N` marker naming the start line of the previous
 * chunk, and the last `chunkOverlap` source lines of the previous chunk.
 *
 * `metadata.startLine` / `metadata.endLine` are one-based and describe only
 * the real source lines of a chunk (overlap included); the synthetic header
 * and marker lines are not counted.
 *
 * `preserveFunctions` and `preserveClasses` are accepted in `options` but are
 * not consulted: function and type starts are always treated as split points.
 *
 * @param content - Full text of the source file.
 * @param options - Chunking settings; `maxChunkSize` defaults to 3000
 *   characters and `chunkOverlap` to 5 lines.
 * @returns The chunks in file order, each with `totalChunks` filled in.
 */
export function chunkCode(
  content: string,
  options: CodeChunkOptions
): CodeChunk[] {
  const {
    maxChunkSize = 3000, // Larger chunks for code context
    chunkOverlap = 5,
    filePath,
  } = options;

  const language = detectLanguage(filePath);
  const lines = content.split('\n');

  // For small files, return as single chunk
  if (content.length <= maxChunkSize) {
    return [
      {
        content,
        metadata: {
          chunkIndex: 0,
          totalChunks: 1,
          startLine: 1,
          endLine: lines.length,
          tokenCount: estimateTokens(content),
          filePath,
          language,
        },
      },
    ];
  }

  // Patterns for detecting logical boundaries
  const functionPattern = /^\s*(function|def|fn|func|fun|public|private|protected|static|async|export)\s/;
  const classPattern = /^\s*(class|interface|struct|enum|type)\s/;
  const importPattern = /^\s*(import|from|require|using|include)\s/;
  const commentPattern = /^\s*(\/\/|\/\*|\*|#|--|<!--)/;

  // File header: the leading run of imports, comments and blank lines,
  // looking no further than the first 20 lines.
  const headerLines: string[] = [];
  for (const line of lines.slice(0, 20)) {
    const isHeaderLine =
      importPattern.test(line) || commentPattern.test(line) || line.trim() === '';
    if (!isHeaderLine) {
      break;
    }
    headerLines.push(line);
  }

  // Whole, non-negative number of lines to carry over; NaN falls back to 0.
  const overlapLines = Math.max(0, Math.floor(chunkOverlap) || 0);

  const chunks: CodeChunk[] = [];
  const pushChunk = (chunkLines: string[], startLine: number, endLine: number): void => {
    const chunkContent = chunkLines.join('\n');
    chunks.push({
      content: chunkContent,
      metadata: {
        chunkIndex: chunks.length,
        totalChunks: 0, // Patched below once the final count is known
        startLine,
        endLine,
        tokenCount: estimateTokens(chunkContent),
        filePath,
        language,
      },
    });
  };

  // Synthetic lines (repeated header + continuation marker) are kept apart
  // from real source lines so that line numbers are computed from source
  // lines only.
  let prefixLines: string[] = [];
  let sourceLines: string[] = [];
  let currentSize = 0;
  let chunkStartLine = 1;

  for (const [index, line] of lines.entries()) {
    const lineSize = line.length + 1; // +1 for newline

    const overBudget = currentSize + lineSize > maxChunkSize;
    const atBoundary =
      functionPattern.test(line) ||
      classPattern.test(line) ||
      (line.trim() === '' && currentSize > maxChunkSize * 0.7);

    if (overBudget && sourceLines.length > 0 && atBoundary) {
      // Lines 1..index have been consumed so far, so the chunk ends on
      // one-based line `index`.
      const endLine = index;
      pushChunk([...prefixLines, ...sourceLines], chunkStartLine, endLine);

      // Carry over at most `overlapLines` real source lines.
      const carriedCount = Math.min(overlapLines, sourceLines.length);
      const carriedLines = sourceLines.slice(sourceLines.length - carriedCount);

      prefixLines = [
        ...headerLines,
        '',
        `// ... continued from line ${chunkStartLine}`,
        '',
      ];
      sourceLines = carriedLines;
      chunkStartLine = endLine - carriedCount + 1;
      currentSize = [...prefixLines, ...sourceLines].reduce(
        (sum, l) => sum + l.length + 1,
        0
      );
    }

    sourceLines.push(line);
    currentSize += lineSize;
  }

  // The loop always leaves at least the last line pending.
  pushChunk([...prefixLines, ...sourceLines], chunkStartLine, lines.length);

  // Update totalChunks for all chunks
  for (const chunk of chunks) {
    chunk.metadata.totalChunks = chunks.length;
  }

  return chunks;
}
|
||||
|
||||
/**
 * Generate a short plain-text summary header for a code file.
 *
 * The summary always contains the file path and the line count. The language
 * line is included when `language` is a non-empty string, and the function
 * and class/type counts are included only when they are greater than zero.
 * Counts are line-based heuristics: a line is counted when it starts (after
 * optional indentation) with a declaration keyword.
 *
 * @param filePath - Path shown on the `File:` line.
 * @param content - Full text of the file.
 * @param language - Optional language name shown on the `Language:` line.
 * @returns The summary, one `Label: value` entry per line, each terminated by
 *   a newline.
 */
export function generateCodeSummary(
  filePath: string,
  content: string,
  language?: string
): string {
  // The trailing \b keeps words that merely begin with a keyword
  // (`default:`, `defer`, `functional`) from being counted, and the optional
  // prefixes cover `export async function` / `export default function`.
  const functionStart =
    /^\s*(?:export\s+(?:default\s+)?)?(?:async\s+)?(?:function|def|fn|func)\b/;
  const typeStart = /^\s*(class|interface|struct|enum|type)\s/;

  const lines = content.split('\n');
  const functionCount = lines.filter((line) => functionStart.test(line)).length;
  const typeCount = lines.filter((line) => typeStart.test(line)).length;

  const entries: string[] = [`File: ${filePath}`];
  if (language) {
    entries.push(`Language: ${language}`);
  }
  entries.push(`Lines: ${lines.length}`);
  if (functionCount > 0) {
    entries.push(`Functions: ${functionCount}`);
  }
  if (typeCount > 0) {
    entries.push(`Classes/Types: ${typeCount}`);
  }

  return entries.map((entry) => `${entry}\n`).join('');
}
|
||||
|
||||
Reference in New Issue
Block a user