VIBN Frontend for Coolify deployment
This commit is contained in:
206
lib/utils/document-chunker.ts
Normal file
206
lib/utils/document-chunker.ts
Normal file
@@ -0,0 +1,206 @@
|
||||
/**
|
||||
* Document Chunking Utility
|
||||
*
|
||||
* Splits large documents into manageable chunks for AI processing.
|
||||
* Uses semantic chunking with configurable overlap for better context.
|
||||
*/
|
||||
|
||||
/**
 * Positional and sizing information attached to each chunk.
 */
export interface ChunkMetadata {
  /** Zero-based index of this chunk within its document. */
  chunkIndex: number;
  /** Total number of chunks produced from the document (filled in by chunkDocument). */
  totalChunks: number;
  /** Character offset where this chunk starts in the processed text. */
  startChar: number;
  /** Character offset where this chunk ends (exclusive) in the processed text. */
  endChar: number;
  /** Estimated token count (~4 characters per token, see estimateTokens). */
  tokenCount: number;
}
|
||||
|
||||
/**
 * A single chunk of a document: its text plus positional metadata.
 */
export interface DocumentChunk {
  /** The chunk's text content (trimmed; code blocks restored when preserved). */
  content: string;
  /** Position and size information for this chunk. */
  metadata: ChunkMetadata;
}
|
||||
|
||||
/**
 * Tuning options for chunkDocument / chunkDocuments.
 * Any omitted field falls back to DEFAULT_OPTIONS.
 */
export interface ChunkingOptions {
  /** Maximum characters per chunk (default: 2000). */
  maxChunkSize?: number;
  /** Overlap between consecutive chunks, in characters (default: 200). */
  chunkOverlap?: number;
  /**
   * Try to keep paragraphs intact (default: true).
   * NOTE(review): this flag is never read by chunkDocument or findSplitPoint
   * in this file — paragraph-aware splitting always happens; confirm intent.
   */
  preserveParagraphs?: boolean;
  /** Keep fenced ``` code blocks together across splits (default: true). */
  preserveCodeBlocks?: boolean;
}
|
||||
|
||||
/** Defaults applied by chunkDocument when an option is omitted. */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxChunkSize: 2000,
  chunkOverlap: 200,
  preserveParagraphs: true,
  preserveCodeBlocks: true,
};
|
||||
|
||||
/**
|
||||
* Estimate token count (rough approximation: 1 token ≈ 4 characters)
|
||||
*/
|
||||
function estimateTokens(text: string): number {
|
||||
return Math.ceil(text.length / 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find good split points (paragraph breaks, sentence boundaries)
|
||||
*/
|
||||
function findSplitPoint(text: string, idealSplit: number): number {
|
||||
// Try to split at paragraph break first
|
||||
const paragraphBreak = text.lastIndexOf('\n\n', idealSplit);
|
||||
if (paragraphBreak > idealSplit - 500 && paragraphBreak > 0) {
|
||||
return paragraphBreak + 2;
|
||||
}
|
||||
|
||||
// Try sentence boundary
|
||||
const sentenceEnd = text.lastIndexOf('. ', idealSplit);
|
||||
if (sentenceEnd > idealSplit - 300 && sentenceEnd > 0) {
|
||||
return sentenceEnd + 2;
|
||||
}
|
||||
|
||||
// Try any newline
|
||||
const newline = text.lastIndexOf('\n', idealSplit);
|
||||
if (newline > idealSplit - 200 && newline > 0) {
|
||||
return newline + 1;
|
||||
}
|
||||
|
||||
// Last resort: split at space
|
||||
const space = text.lastIndexOf(' ', idealSplit);
|
||||
return space > 0 ? space + 1 : idealSplit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract code blocks to preserve them
|
||||
*/
|
||||
function extractCodeBlocks(text: string): { text: string; codeBlocks: Map<string, string> } {
|
||||
const codeBlocks = new Map<string, string>();
|
||||
let counter = 0;
|
||||
|
||||
const processedText = text.replace(/```[\s\S]*?```/g, (match) => {
|
||||
const placeholder = `__CODE_BLOCK_${counter}__`;
|
||||
codeBlocks.set(placeholder, match);
|
||||
counter++;
|
||||
return placeholder;
|
||||
});
|
||||
|
||||
return { text: processedText, codeBlocks };
|
||||
}
|
||||
|
||||
/**
|
||||
* Restore code blocks
|
||||
*/
|
||||
function restoreCodeBlocks(text: string, codeBlocks: Map<string, string>): string {
|
||||
let result = text;
|
||||
codeBlocks.forEach((code, placeholder) => {
|
||||
result = result.replace(placeholder, code);
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split a document into semantic chunks
|
||||
*/
|
||||
export function chunkDocument(content: string, options: ChunkingOptions = {}): DocumentChunk[] {
|
||||
const opts = { ...DEFAULT_OPTIONS, ...options };
|
||||
const chunks: DocumentChunk[] = [];
|
||||
|
||||
// Handle empty content
|
||||
if (!content || content.trim().length === 0) {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// Extract code blocks if preserving them
|
||||
let processedContent = content;
|
||||
let codeBlocks = new Map<string, string>();
|
||||
|
||||
if (opts.preserveCodeBlocks) {
|
||||
const extracted = extractCodeBlocks(content);
|
||||
processedContent = extracted.text;
|
||||
codeBlocks = extracted.codeBlocks;
|
||||
}
|
||||
|
||||
let position = 0;
|
||||
let chunkIndex = 0;
|
||||
|
||||
while (position < processedContent.length) {
|
||||
const remainingLength = processedContent.length - position;
|
||||
|
||||
// If remaining content fits in one chunk, take it all
|
||||
if (remainingLength <= opts.maxChunkSize) {
|
||||
const chunkContent = processedContent.substring(position);
|
||||
const finalContent = opts.preserveCodeBlocks
|
||||
? restoreCodeBlocks(chunkContent, codeBlocks)
|
||||
: chunkContent;
|
||||
|
||||
chunks.push({
|
||||
content: finalContent.trim(),
|
||||
metadata: {
|
||||
chunkIndex,
|
||||
totalChunks: 0, // Will be updated after loop
|
||||
startChar: position,
|
||||
endChar: processedContent.length,
|
||||
tokenCount: estimateTokens(finalContent),
|
||||
},
|
||||
});
|
||||
break;
|
||||
}
|
||||
|
||||
// Find a good split point
|
||||
const idealEnd = position + opts.maxChunkSize;
|
||||
const actualEnd = findSplitPoint(processedContent, idealEnd);
|
||||
|
||||
const chunkContent = processedContent.substring(position, actualEnd);
|
||||
const finalContent = opts.preserveCodeBlocks
|
||||
? restoreCodeBlocks(chunkContent, codeBlocks)
|
||||
: chunkContent;
|
||||
|
||||
chunks.push({
|
||||
content: finalContent.trim(),
|
||||
metadata: {
|
||||
chunkIndex,
|
||||
totalChunks: 0, // Will be updated after loop
|
||||
startChar: position,
|
||||
endChar: actualEnd,
|
||||
tokenCount: estimateTokens(finalContent),
|
||||
},
|
||||
});
|
||||
|
||||
// Move position forward with overlap
|
||||
position = actualEnd - opts.chunkOverlap;
|
||||
chunkIndex++;
|
||||
}
|
||||
|
||||
// Update totalChunks in all metadata
|
||||
const totalChunks = chunks.length;
|
||||
chunks.forEach((chunk) => {
|
||||
chunk.metadata.totalChunks = totalChunks;
|
||||
});
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
 * A document chunk annotated with the source file it was cut from,
 * as produced by chunkDocuments.
 */
export interface SourcedChunk extends DocumentChunk {
  /** Filename of the document this chunk came from. */
  sourceFilename: string;
  /** MIME type of the source document, when the caller supplied one. */
  sourceMimeType?: string;
}
|
||||
|
||||
export function chunkDocuments(
|
||||
documents: Array<{ filename: string; content: string; mimeType?: string }>,
|
||||
options: ChunkingOptions = {}
|
||||
): SourcedChunk[] {
|
||||
const allChunks: SourcedChunk[] = [];
|
||||
|
||||
documents.forEach((doc) => {
|
||||
const chunks = chunkDocument(doc.content, options);
|
||||
chunks.forEach((chunk) => {
|
||||
allChunks.push({
|
||||
...chunk,
|
||||
sourceFilename: doc.filename,
|
||||
sourceMimeType: doc.mimeType,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
return allChunks;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user