Files
vibn-frontend/lib/utils/document-chunker.ts

207 lines
5.5 KiB
TypeScript

/**
 * Document Chunking Utility
 *
 * Splits large documents into manageable chunks for AI processing.
 * Uses semantic chunking with configurable overlap for better context.
 */
export interface ChunkMetadata {
  // 0-based index of this chunk within the document's chunk sequence.
  chunkIndex: number;
  // Total number of chunks produced for the document (same value on every chunk).
  totalChunks: number;
  // Start offset (inclusive) of this chunk. NOTE(review): when code-block
  // preservation is on, offsets index the placeholder-substituted text, not
  // the original document — confirm before using them to slice the original.
  startChar: number;
  // End offset (exclusive) of this chunk, in the same coordinate space as startChar.
  endChar: number;
  // Rough token estimate for the chunk (~4 characters per token).
  tokenCount: number;
}
export interface DocumentChunk {
  // The chunk's text, trimmed of leading/trailing whitespace.
  content: string;
  metadata: ChunkMetadata;
}
export interface ChunkingOptions {
  maxChunkSize?: number; // Maximum characters per chunk (default: 2000)
  chunkOverlap?: number; // Overlap between chunks (default: 200)
  preserveParagraphs?: boolean; // Try to keep paragraphs intact (default: true)
  preserveCodeBlocks?: boolean; // Keep code blocks together (default: true)
}
// Defaults applied by chunkDocument; callers may override any subset.
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxChunkSize: 2000,
  chunkOverlap: 200,
  preserveParagraphs: true,
  preserveCodeBlocks: true,
};
/**
 * Estimate the token count of a piece of text.
 *
 * Applies the common rule of thumb that one token covers roughly four
 * characters of English text; partial tokens are rounded up.
 */
function estimateTokens(text: string): number {
  const APPROX_CHARS_PER_TOKEN = 4;
  return Math.ceil(text.length / APPROX_CHARS_PER_TOKEN);
}
/**
 * Find a good split point at or before `idealSplit` (paragraph break,
 * sentence boundary, newline, or space — in that order of preference).
 *
 * Each candidate is only accepted within a shrinking look-back window so the
 * split never drifts far from `idealSplit`. In particular the space fallback
 * is now windowed too: previously it accepted ANY earlier space, which for
 * text like `'a ' + 'x'.repeat(5000)` returned index 2 — far before the
 * current chunk — yielding empty chunks and a caller cursor that moved
 * backwards (infinite loop). With no nearby natural break we hard-split at
 * `idealSplit` instead.
 *
 * @param text       Full text being chunked (searched from the start).
 * @param idealSplit Preferred split offset; the result is in (0, idealSplit].
 * @returns Offset at which the current chunk should end (exclusive).
 */
function findSplitPoint(text: string, idealSplit: number): number {
  // Paragraph break is best: split just after the blank line.
  const paragraphBreak = text.lastIndexOf('\n\n', idealSplit);
  if (paragraphBreak > idealSplit - 500 && paragraphBreak > 0) {
    return paragraphBreak + 2;
  }
  // Sentence boundary: split just after the '. ' pair.
  const sentenceEnd = text.lastIndexOf('. ', idealSplit);
  if (sentenceEnd > idealSplit - 300 && sentenceEnd > 0) {
    return sentenceEnd + 2;
  }
  // Any newline close by.
  const newline = text.lastIndexOf('\n', idealSplit);
  if (newline > idealSplit - 200 && newline > 0) {
    return newline + 1;
  }
  // Space, but only if it is near the ideal point (bug fix: unbounded before).
  const space = text.lastIndexOf(' ', idealSplit);
  if (space > idealSplit - 100 && space > 0) {
    return space + 1;
  }
  // No nearby natural break: hard split mid-word rather than drifting backwards.
  return idealSplit;
}
/**
 * Replace fenced code blocks (``` ... ```) with unique placeholders so the
 * chunker does not split them, returning the substituted text plus a
 * placeholder -> original-block map for later restoration.
 */
function extractCodeBlocks(text: string): { text: string; codeBlocks: Map<string, string> } {
  const codeBlocks = new Map<string, string>();
  const withPlaceholders = text.replace(/```[\s\S]*?```/g, (block) => {
    // Map size doubles as a running counter, keeping placeholders unique.
    const key = `__CODE_BLOCK_${codeBlocks.size}__`;
    codeBlocks.set(key, block);
    return key;
  });
  return { text: withPlaceholders, codeBlocks };
}
/**
 * Restore previously extracted code blocks by swapping each placeholder back
 * for its original fenced block.
 *
 * Bug fix: the replacement is now a function. With a plain string,
 * String.prototype.replace interprets `$`-patterns ($$, $&, $', $`) inside
 * the code block's content, corrupting any restored code that contains them
 * (e.g. shell `$$` or regex examples with `$&`).
 *
 * @param text       Chunk text possibly containing `__CODE_BLOCK_n__` placeholders.
 * @param codeBlocks Placeholder -> original code block map from extractCodeBlocks.
 * @returns Text with every present placeholder replaced verbatim.
 */
function restoreCodeBlocks(text: string, codeBlocks: Map<string, string>): string {
  let result = text;
  codeBlocks.forEach((code, placeholder) => {
    // Function form returns `code` literally — no $-pattern substitution.
    result = result.replace(placeholder, () => code);
  });
  return result;
}
/**
 * Split a document into semantic chunks of at most `maxChunkSize` characters,
 * with `chunkOverlap` characters of overlap between consecutive chunks.
 *
 * Bug fix: the loop now guarantees forward progress. Previously
 * `findSplitPoint` could return an offset at or before `position` (its space
 * fallback accepted any earlier space), so `position = actualEnd - overlap`
 * moved the cursor backwards and the loop never terminated (reproducible with
 * `'a ' + 'x'.repeat(5000)` under default options). `actualEnd` is now
 * clamped past `position`, and the cursor always advances by at least one
 * character — also covering the case `chunkOverlap >= maxChunkSize`.
 *
 * @param content Document text; empty/whitespace-only input yields [].
 * @param options Optional overrides merged over DEFAULT_OPTIONS.
 * @returns Chunks with per-chunk metadata; `totalChunks` is filled in at the end.
 */
export function chunkDocument(content: string, options: ChunkingOptions = {}): DocumentChunk[] {
  const opts = { ...DEFAULT_OPTIONS, ...options };
  const chunks: DocumentChunk[] = [];
  // Handle empty content
  if (!content || content.trim().length === 0) {
    return chunks;
  }
  // Extract code blocks if preserving them, so fences are never split.
  let processedContent = content;
  let codeBlocks = new Map<string, string>();
  if (opts.preserveCodeBlocks) {
    const extracted = extractCodeBlocks(content);
    processedContent = extracted.text;
    codeBlocks = extracted.codeBlocks;
  }
  let position = 0;
  let chunkIndex = 0;
  while (position < processedContent.length) {
    const remainingLength = processedContent.length - position;
    // If remaining content fits in one chunk, take it all and stop.
    if (remainingLength <= opts.maxChunkSize) {
      const chunkContent = processedContent.substring(position);
      const finalContent = opts.preserveCodeBlocks
        ? restoreCodeBlocks(chunkContent, codeBlocks)
        : chunkContent;
      chunks.push({
        content: finalContent.trim(),
        metadata: {
          chunkIndex,
          totalChunks: 0, // Will be updated after loop
          startChar: position,
          endChar: processedContent.length,
          tokenCount: estimateTokens(finalContent),
        },
      });
      break;
    }
    // Find a good split point; clamp it strictly past `position` so a
    // degenerate split can never produce an empty chunk or move backwards.
    const idealEnd = position + opts.maxChunkSize;
    const actualEnd = Math.max(findSplitPoint(processedContent, idealEnd), position + 1);
    const chunkContent = processedContent.substring(position, actualEnd);
    const finalContent = opts.preserveCodeBlocks
      ? restoreCodeBlocks(chunkContent, codeBlocks)
      : chunkContent;
    chunks.push({
      content: finalContent.trim(),
      metadata: {
        chunkIndex,
        totalChunks: 0, // Will be updated after loop
        startChar: position,
        endChar: actualEnd,
        tokenCount: estimateTokens(finalContent),
      },
    });
    // Step back by the overlap, but always advance by at least one character
    // (guards against overlap >= chunk length, which previously looped forever).
    position = Math.max(actualEnd - opts.chunkOverlap, position + 1);
    chunkIndex++;
  }
  // Update totalChunks in all metadata now that the count is known.
  const totalChunks = chunks.length;
  chunks.forEach((chunk) => {
    chunk.metadata.totalChunks = totalChunks;
  });
  return chunks;
}
/**
 * Chunk multiple documents and return with source tracking
 */
export interface SourcedChunk extends DocumentChunk {
  sourceFilename: string;
  sourceMimeType?: string;
}
/**
 * Chunk each document in turn, tagging every resulting chunk with the
 * filename (and optional MIME type) of the document it came from.
 */
export function chunkDocuments(
  documents: Array<{ filename: string; content: string; mimeType?: string }>,
  options: ChunkingOptions = {}
): SourcedChunk[] {
  return documents.flatMap((doc) =>
    chunkDocument(doc.content, options).map((chunk) => ({
      ...chunk,
      sourceFilename: doc.filename,
      sourceMimeType: doc.mimeType,
    }))
  );
}