/**
 * Text chunking for semantic search
 *
 * Splits large documents into smaller, semantically coherent chunks
 * suitable for vector embedding and retrieval.
 */

export interface TextChunk {
|
|
/** Index of this chunk (0-based) */
|
|
index: number;
|
|
|
|
/** The chunked text content */
|
|
text: string;
|
|
|
|
/** Approximate token count (for reference) */
|
|
estimatedTokens: number;
|
|
}
|
|
|
|
export interface ChunkingOptions {
|
|
/** Target maximum tokens per chunk (approximate) */
|
|
maxTokens?: number;
|
|
|
|
/** Target maximum characters per chunk (fallback if no tokenizer) */
|
|
maxChars?: number;
|
|
|
|
/** Overlap between chunks (in characters) */
|
|
overlapChars?: number;
|
|
|
|
/** Whether to try preserving paragraph boundaries */
|
|
preserveParagraphs?: boolean;
|
|
}
|
|
|
|
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
|
|
maxTokens: 800,
|
|
maxChars: 3000, // Rough approximation: ~4 chars per token
|
|
overlapChars: 200,
|
|
preserveParagraphs: true,
|
|
};
|
|
|
|
/**
|
|
* Estimate token count from character count
|
|
*
|
|
* Uses a rough heuristic: 1 token ≈ 4 characters for English text.
|
|
* For more accuracy, integrate a real tokenizer (e.g., tiktoken).
|
|
*/
|
|
function estimateTokens(text: string): number {
|
|
return Math.ceil(text.length / 4);
|
|
}
|
|
|
|
/**
|
|
* Split text into paragraphs, preserving empty lines as separators
|
|
*/
|
|
function splitIntoParagraphs(text: string): string[] {
|
|
return text.split(/\n\n+/).filter((p) => p.trim().length > 0);
|
|
}
|
|
|
|
/**
|
|
* Split text into sentences (simple heuristic)
|
|
*/
|
|
function splitIntoSentences(text: string): string[] {
|
|
// Simple sentence boundary detection
|
|
return text
|
|
.split(/[.!?]+\s+/)
|
|
.map((s) => s.trim())
|
|
.filter((s) => s.length > 0);
|
|
}
|
|
|
|
/**
|
|
* Chunk text into semantic pieces suitable for embedding
|
|
*
|
|
* Strategy:
|
|
* 1. Split by paragraphs (if preserveParagraphs = true)
|
|
* 2. Group paragraphs/sentences until reaching maxTokens/maxChars
|
|
* 3. Add overlap between chunks for context continuity
|
|
*
|
|
* @param content - Text to chunk
|
|
* @param options - Chunking options
|
|
* @returns Array of text chunks with metadata
|
|
*
|
|
* @example
|
|
* ```typescript
|
|
* const chunks = chunkText(longDocument, { maxTokens: 500, overlapChars: 100 });
|
|
* for (const chunk of chunks) {
|
|
* console.log(`Chunk ${chunk.index}: ${chunk.estimatedTokens} tokens`);
|
|
* await embedText(chunk.text);
|
|
* }
|
|
* ```
|
|
*/
|
|
export function chunkText(
|
|
content: string,
|
|
options: ChunkingOptions = {}
|
|
): TextChunk[] {
|
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
const chunks: TextChunk[] = [];
|
|
|
|
if (!content || content.trim().length === 0) {
|
|
return chunks;
|
|
}
|
|
|
|
// Clean up content
|
|
const cleanedContent = content.trim();
|
|
|
|
// If content is small enough, return as single chunk
|
|
if (estimateTokens(cleanedContent) <= opts.maxTokens) {
|
|
return [
|
|
{
|
|
index: 0,
|
|
text: cleanedContent,
|
|
estimatedTokens: estimateTokens(cleanedContent),
|
|
},
|
|
];
|
|
}
|
|
|
|
// Split into paragraphs or sentences
|
|
const units = opts.preserveParagraphs
|
|
? splitIntoParagraphs(cleanedContent)
|
|
: splitIntoSentences(cleanedContent);
|
|
|
|
if (units.length === 0) {
|
|
return [
|
|
{
|
|
index: 0,
|
|
text: cleanedContent,
|
|
estimatedTokens: estimateTokens(cleanedContent),
|
|
},
|
|
];
|
|
}
|
|
|
|
let currentChunk = '';
|
|
let chunkIndex = 0;
|
|
let previousOverlap = '';
|
|
|
|
for (let i = 0; i < units.length; i++) {
|
|
const unit = units[i];
|
|
const potentialChunk = currentChunk
|
|
? `${currentChunk}\n\n${unit}`
|
|
: `${previousOverlap}${unit}`;
|
|
|
|
const potentialTokens = estimateTokens(potentialChunk);
|
|
const potentialChars = potentialChunk.length;
|
|
|
|
// Check if adding this unit would exceed limits
|
|
if (
|
|
potentialTokens > opts.maxTokens ||
|
|
potentialChars > opts.maxChars
|
|
) {
|
|
// Save current chunk if it has content
|
|
if (currentChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentChunk,
|
|
estimatedTokens: estimateTokens(currentChunk),
|
|
});
|
|
|
|
// Prepare overlap for next chunk
|
|
const overlapStart = Math.max(
|
|
0,
|
|
currentChunk.length - opts.overlapChars
|
|
);
|
|
previousOverlap = currentChunk.substring(overlapStart);
|
|
if (previousOverlap.length > 0 && !previousOverlap.endsWith(' ')) {
|
|
// Try to start overlap at a word boundary
|
|
const spaceIndex = previousOverlap.indexOf(' ');
|
|
if (spaceIndex > 0) {
|
|
previousOverlap = previousOverlap.substring(spaceIndex + 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Start new chunk with current unit
|
|
currentChunk = `${previousOverlap}${unit}`;
|
|
} else {
|
|
// Add unit to current chunk
|
|
currentChunk = potentialChunk;
|
|
}
|
|
}
|
|
|
|
// Add final chunk if it has content
|
|
if (currentChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentChunk,
|
|
estimatedTokens: estimateTokens(currentChunk),
|
|
});
|
|
}
|
|
|
|
console.log(
|
|
`[Chunking] Split ${cleanedContent.length} chars into ${chunks.length} chunks`
|
|
);
|
|
|
|
return chunks;
|
|
}
|
|
|
|
/**
|
|
* Chunk text with code-aware splitting
|
|
*
|
|
* Preserves code blocks and tries to keep them intact.
|
|
* Useful for chunking AI chat transcripts that contain code snippets.
|
|
*/
|
|
export function chunkTextWithCodeAwareness(
|
|
content: string,
|
|
options: ChunkingOptions = {}
|
|
): TextChunk[] {
|
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
|
|
// Detect code blocks (triple backticks)
|
|
const codeBlockRegex = /```[\s\S]*?```/g;
|
|
const codeBlocks: { start: number; end: number; content: string }[] = [];
|
|
let match;
|
|
|
|
while ((match = codeBlockRegex.exec(content)) !== null) {
|
|
codeBlocks.push({
|
|
start: match.index,
|
|
end: match.index + match[0].length,
|
|
content: match[0],
|
|
});
|
|
}
|
|
|
|
// If no code blocks, use standard chunking
|
|
if (codeBlocks.length === 0) {
|
|
return chunkText(content, options);
|
|
}
|
|
|
|
// Split content around code blocks
|
|
const chunks: TextChunk[] = [];
|
|
let chunkIndex = 0;
|
|
let currentPosition = 0;
|
|
|
|
for (const codeBlock of codeBlocks) {
|
|
// Chunk text before code block
|
|
const textBefore = content.substring(currentPosition, codeBlock.start);
|
|
if (textBefore.trim().length > 0) {
|
|
const textChunks = chunkText(textBefore, opts);
|
|
for (const chunk of textChunks) {
|
|
chunks.push({
|
|
...chunk,
|
|
index: chunkIndex++,
|
|
});
|
|
}
|
|
}
|
|
|
|
// Add code block as its own chunk (or split if too large)
|
|
const codeTokens = estimateTokens(codeBlock.content);
|
|
if (codeTokens <= opts.maxTokens) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: codeBlock.content,
|
|
estimatedTokens: codeTokens,
|
|
});
|
|
} else {
|
|
// Code block is too large, split by lines
|
|
const codeLines = codeBlock.content.split('\n');
|
|
let currentCodeChunk = '';
|
|
for (const line of codeLines) {
|
|
const potentialChunk = currentCodeChunk
|
|
? `${currentCodeChunk}\n${line}`
|
|
: line;
|
|
if (estimateTokens(potentialChunk) > opts.maxTokens) {
|
|
if (currentCodeChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentCodeChunk,
|
|
estimatedTokens: estimateTokens(currentCodeChunk),
|
|
});
|
|
}
|
|
currentCodeChunk = line;
|
|
} else {
|
|
currentCodeChunk = potentialChunk;
|
|
}
|
|
}
|
|
if (currentCodeChunk.length > 0) {
|
|
chunks.push({
|
|
index: chunkIndex++,
|
|
text: currentCodeChunk,
|
|
estimatedTokens: estimateTokens(currentCodeChunk),
|
|
});
|
|
}
|
|
}
|
|
|
|
currentPosition = codeBlock.end;
|
|
}
|
|
|
|
// Chunk remaining text after last code block
|
|
const textAfter = content.substring(currentPosition);
|
|
if (textAfter.trim().length > 0) {
|
|
const textChunks = chunkText(textAfter, opts);
|
|
for (const chunk of textChunks) {
|
|
chunks.push({
|
|
...chunk,
|
|
index: chunkIndex++,
|
|
});
|
|
}
|
|
}
|
|
|
|
return chunks;
|
|
}
|
|
|