207 lines
5.5 KiB
TypeScript
207 lines
5.5 KiB
TypeScript
/**
 * Document Chunking Utility
 *
 * Splits large documents into manageable chunks for AI processing.
 * Uses semantic chunking with configurable overlap for better context.
 */
|
|
|
|
/** Positional and sizing metadata attached to each produced chunk. */
export interface ChunkMetadata {
  /** Zero-based index of this chunk within its document. */
  chunkIndex: number;
  /** Total number of chunks produced for the document (filled in after chunking). */
  totalChunks: number;
  /** Offset of the chunk's first character in the processed text. */
  startChar: number;
  /** Offset one past the chunk's last character in the processed text. */
  endChar: number;
  /** Rough token estimate for the chunk (~4 characters per token). */
  tokenCount: number;
}
|
|
|
|
/** A single chunk of a document: its text plus positional metadata. */
export interface DocumentChunk {
  /** The chunk's text content (trimmed, with code blocks restored). */
  content: string;
  /** Position and size information for this chunk. */
  metadata: ChunkMetadata;
}
|
|
|
|
/** Tunable parameters for document chunking; see DEFAULT_OPTIONS for fallbacks. */
export interface ChunkingOptions {
  maxChunkSize?: number; // Maximum characters per chunk (default: 2000)
  chunkOverlap?: number; // Overlap between chunks (default: 200)
  preserveParagraphs?: boolean; // Try to keep paragraphs intact (default: true)
  preserveCodeBlocks?: boolean; // Keep code blocks together (default: true)
}
|
|
|
|
/**
 * Fallback values applied when a caller omits a ChunkingOptions field.
 * NOTE(review): `preserveParagraphs` is never read by the visible chunking
 * code (findSplitPoint always tries paragraph breaks first) — confirm whether
 * it is consumed elsewhere or is dead configuration.
 */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxChunkSize: 2000,
  chunkOverlap: 200,
  preserveParagraphs: true,
  preserveCodeBlocks: true,
};
|
|
|
|
/**
|
|
* Estimate token count (rough approximation: 1 token ≈ 4 characters)
|
|
*/
|
|
function estimateTokens(text: string): number {
|
|
return Math.ceil(text.length / 4);
|
|
}
|
|
|
|
/**
|
|
* Find good split points (paragraph breaks, sentence boundaries)
|
|
*/
|
|
function findSplitPoint(text: string, idealSplit: number): number {
|
|
// Try to split at paragraph break first
|
|
const paragraphBreak = text.lastIndexOf('\n\n', idealSplit);
|
|
if (paragraphBreak > idealSplit - 500 && paragraphBreak > 0) {
|
|
return paragraphBreak + 2;
|
|
}
|
|
|
|
// Try sentence boundary
|
|
const sentenceEnd = text.lastIndexOf('. ', idealSplit);
|
|
if (sentenceEnd > idealSplit - 300 && sentenceEnd > 0) {
|
|
return sentenceEnd + 2;
|
|
}
|
|
|
|
// Try any newline
|
|
const newline = text.lastIndexOf('\n', idealSplit);
|
|
if (newline > idealSplit - 200 && newline > 0) {
|
|
return newline + 1;
|
|
}
|
|
|
|
// Last resort: split at space
|
|
const space = text.lastIndexOf(' ', idealSplit);
|
|
return space > 0 ? space + 1 : idealSplit;
|
|
}
|
|
|
|
/**
|
|
* Extract code blocks to preserve them
|
|
*/
|
|
function extractCodeBlocks(text: string): { text: string; codeBlocks: Map<string, string> } {
|
|
const codeBlocks = new Map<string, string>();
|
|
let counter = 0;
|
|
|
|
const processedText = text.replace(/```[\s\S]*?```/g, (match) => {
|
|
const placeholder = `__CODE_BLOCK_${counter}__`;
|
|
codeBlocks.set(placeholder, match);
|
|
counter++;
|
|
return placeholder;
|
|
});
|
|
|
|
return { text: processedText, codeBlocks };
|
|
}
|
|
|
|
/**
|
|
* Restore code blocks
|
|
*/
|
|
function restoreCodeBlocks(text: string, codeBlocks: Map<string, string>): string {
|
|
let result = text;
|
|
codeBlocks.forEach((code, placeholder) => {
|
|
result = result.replace(placeholder, code);
|
|
});
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Split a document into semantic chunks
|
|
*/
|
|
export function chunkDocument(content: string, options: ChunkingOptions = {}): DocumentChunk[] {
|
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
const chunks: DocumentChunk[] = [];
|
|
|
|
// Handle empty content
|
|
if (!content || content.trim().length === 0) {
|
|
return chunks;
|
|
}
|
|
|
|
// Extract code blocks if preserving them
|
|
let processedContent = content;
|
|
let codeBlocks = new Map<string, string>();
|
|
|
|
if (opts.preserveCodeBlocks) {
|
|
const extracted = extractCodeBlocks(content);
|
|
processedContent = extracted.text;
|
|
codeBlocks = extracted.codeBlocks;
|
|
}
|
|
|
|
let position = 0;
|
|
let chunkIndex = 0;
|
|
|
|
while (position < processedContent.length) {
|
|
const remainingLength = processedContent.length - position;
|
|
|
|
// If remaining content fits in one chunk, take it all
|
|
if (remainingLength <= opts.maxChunkSize) {
|
|
const chunkContent = processedContent.substring(position);
|
|
const finalContent = opts.preserveCodeBlocks
|
|
? restoreCodeBlocks(chunkContent, codeBlocks)
|
|
: chunkContent;
|
|
|
|
chunks.push({
|
|
content: finalContent.trim(),
|
|
metadata: {
|
|
chunkIndex,
|
|
totalChunks: 0, // Will be updated after loop
|
|
startChar: position,
|
|
endChar: processedContent.length,
|
|
tokenCount: estimateTokens(finalContent),
|
|
},
|
|
});
|
|
break;
|
|
}
|
|
|
|
// Find a good split point
|
|
const idealEnd = position + opts.maxChunkSize;
|
|
const actualEnd = findSplitPoint(processedContent, idealEnd);
|
|
|
|
const chunkContent = processedContent.substring(position, actualEnd);
|
|
const finalContent = opts.preserveCodeBlocks
|
|
? restoreCodeBlocks(chunkContent, codeBlocks)
|
|
: chunkContent;
|
|
|
|
chunks.push({
|
|
content: finalContent.trim(),
|
|
metadata: {
|
|
chunkIndex,
|
|
totalChunks: 0, // Will be updated after loop
|
|
startChar: position,
|
|
endChar: actualEnd,
|
|
tokenCount: estimateTokens(finalContent),
|
|
},
|
|
});
|
|
|
|
// Move position forward with overlap
|
|
position = actualEnd - opts.chunkOverlap;
|
|
chunkIndex++;
|
|
}
|
|
|
|
// Update totalChunks in all metadata
|
|
const totalChunks = chunks.length;
|
|
chunks.forEach((chunk) => {
|
|
chunk.metadata.totalChunks = totalChunks;
|
|
});
|
|
|
|
return chunks;
|
|
}
|
|
|
|
/**
 * A DocumentChunk tagged with the document it came from, for use when
 * chunking multiple documents together (see `chunkDocuments`).
 */
export interface SourcedChunk extends DocumentChunk {
  /** Filename of the source document this chunk was cut from. */
  sourceFilename: string;
  /** MIME type of the source document, when the caller provided one. */
  sourceMimeType?: string;
}
|
|
|
|
export function chunkDocuments(
|
|
documents: Array<{ filename: string; content: string; mimeType?: string }>,
|
|
options: ChunkingOptions = {}
|
|
): SourcedChunk[] {
|
|
const allChunks: SourcedChunk[] = [];
|
|
|
|
documents.forEach((doc) => {
|
|
const chunks = chunkDocument(doc.content, options);
|
|
chunks.forEach((chunk) => {
|
|
allChunks.push({
|
|
...chunk,
|
|
sourceFilename: doc.filename,
|
|
sourceMimeType: doc.mimeType,
|
|
});
|
|
});
|
|
});
|
|
|
|
return allChunks;
|
|
}
|
|
|