VIBN Frontend for Coolify deployment
This commit is contained in:
297
lib/ai/chunking.ts
Normal file
297
lib/ai/chunking.ts
Normal file
@@ -0,0 +1,297 @@
/**
 * Text chunking for semantic search
 *
 * Splits large documents into smaller, semantically coherent chunks
 * suitable for vector embedding and retrieval.
 */

/** A single chunk of text produced by the chunking functions. */
export interface TextChunk {
  /** Index of this chunk (0-based) */
  index: number;

  /** The chunked text content */
  text: string;

  /** Approximate token count (for reference; see the estimateTokens heuristic) */
  estimatedTokens: number;
}
|
||||
|
||||
/** Tuning options for the chunking functions; unset fields fall back to DEFAULT_OPTIONS. */
export interface ChunkingOptions {
  /** Target maximum tokens per chunk (approximate; default 800) */
  maxTokens?: number;

  /** Target maximum characters per chunk (fallback if no tokenizer; default 3000) */
  maxChars?: number;

  /** Overlap between chunks (in characters; default 200) */
  overlapChars?: number;

  /** Whether to try preserving paragraph boundaries (default true) */
  preserveParagraphs?: boolean;
}
|
||||
|
||||
/** Defaults applied when a caller omits a ChunkingOptions field. */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxTokens: 800,
  maxChars: 3000, // Rough approximation: ~4 chars per token
  overlapChars: 200,
  preserveParagraphs: true,
};
|
||||
|
||||
/**
|
||||
* Estimate token count from character count
|
||||
*
|
||||
* Uses a rough heuristic: 1 token ≈ 4 characters for English text.
|
||||
* For more accuracy, integrate a real tokenizer (e.g., tiktoken).
|
||||
*/
|
||||
function estimateTokens(text: string): number {
|
||||
return Math.ceil(text.length / 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Split text into paragraphs, preserving empty lines as separators
|
||||
*/
|
||||
function splitIntoParagraphs(text: string): string[] {
|
||||
return text.split(/\n\n+/).filter((p) => p.trim().length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Split text into sentences (simple heuristic)
|
||||
*/
|
||||
function splitIntoSentences(text: string): string[] {
|
||||
// Simple sentence boundary detection
|
||||
return text
|
||||
.split(/[.!?]+\s+/)
|
||||
.map((s) => s.trim())
|
||||
.filter((s) => s.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk text into semantic pieces suitable for embedding
|
||||
*
|
||||
* Strategy:
|
||||
* 1. Split by paragraphs (if preserveParagraphs = true)
|
||||
* 2. Group paragraphs/sentences until reaching maxTokens/maxChars
|
||||
* 3. Add overlap between chunks for context continuity
|
||||
*
|
||||
* @param content - Text to chunk
|
||||
* @param options - Chunking options
|
||||
* @returns Array of text chunks with metadata
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const chunks = chunkText(longDocument, { maxTokens: 500, overlapChars: 100 });
|
||||
* for (const chunk of chunks) {
|
||||
* console.log(`Chunk ${chunk.index}: ${chunk.estimatedTokens} tokens`);
|
||||
* await embedText(chunk.text);
|
||||
* }
|
||||
* ```
|
||||
*/
|
||||
export function chunkText(
|
||||
content: string,
|
||||
options: ChunkingOptions = {}
|
||||
): TextChunk[] {
|
||||
const opts = { ...DEFAULT_OPTIONS, ...options };
|
||||
const chunks: TextChunk[] = [];
|
||||
|
||||
if (!content || content.trim().length === 0) {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// Clean up content
|
||||
const cleanedContent = content.trim();
|
||||
|
||||
// If content is small enough, return as single chunk
|
||||
if (estimateTokens(cleanedContent) <= opts.maxTokens) {
|
||||
return [
|
||||
{
|
||||
index: 0,
|
||||
text: cleanedContent,
|
||||
estimatedTokens: estimateTokens(cleanedContent),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
// Split into paragraphs or sentences
|
||||
const units = opts.preserveParagraphs
|
||||
? splitIntoParagraphs(cleanedContent)
|
||||
: splitIntoSentences(cleanedContent);
|
||||
|
||||
if (units.length === 0) {
|
||||
return [
|
||||
{
|
||||
index: 0,
|
||||
text: cleanedContent,
|
||||
estimatedTokens: estimateTokens(cleanedContent),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
let currentChunk = '';
|
||||
let chunkIndex = 0;
|
||||
let previousOverlap = '';
|
||||
|
||||
for (let i = 0; i < units.length; i++) {
|
||||
const unit = units[i];
|
||||
const potentialChunk = currentChunk
|
||||
? `${currentChunk}\n\n${unit}`
|
||||
: `${previousOverlap}${unit}`;
|
||||
|
||||
const potentialTokens = estimateTokens(potentialChunk);
|
||||
const potentialChars = potentialChunk.length;
|
||||
|
||||
// Check if adding this unit would exceed limits
|
||||
if (
|
||||
potentialTokens > opts.maxTokens ||
|
||||
potentialChars > opts.maxChars
|
||||
) {
|
||||
// Save current chunk if it has content
|
||||
if (currentChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentChunk,
|
||||
estimatedTokens: estimateTokens(currentChunk),
|
||||
});
|
||||
|
||||
// Prepare overlap for next chunk
|
||||
const overlapStart = Math.max(
|
||||
0,
|
||||
currentChunk.length - opts.overlapChars
|
||||
);
|
||||
previousOverlap = currentChunk.substring(overlapStart);
|
||||
if (previousOverlap.length > 0 && !previousOverlap.endsWith(' ')) {
|
||||
// Try to start overlap at a word boundary
|
||||
const spaceIndex = previousOverlap.indexOf(' ');
|
||||
if (spaceIndex > 0) {
|
||||
previousOverlap = previousOverlap.substring(spaceIndex + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start new chunk with current unit
|
||||
currentChunk = `${previousOverlap}${unit}`;
|
||||
} else {
|
||||
// Add unit to current chunk
|
||||
currentChunk = potentialChunk;
|
||||
}
|
||||
}
|
||||
|
||||
// Add final chunk if it has content
|
||||
if (currentChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentChunk,
|
||||
estimatedTokens: estimateTokens(currentChunk),
|
||||
});
|
||||
}
|
||||
|
||||
console.log(
|
||||
`[Chunking] Split ${cleanedContent.length} chars into ${chunks.length} chunks`
|
||||
);
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk text with code-aware splitting
|
||||
*
|
||||
* Preserves code blocks and tries to keep them intact.
|
||||
* Useful for chunking AI chat transcripts that contain code snippets.
|
||||
*/
|
||||
export function chunkTextWithCodeAwareness(
|
||||
content: string,
|
||||
options: ChunkingOptions = {}
|
||||
): TextChunk[] {
|
||||
const opts = { ...DEFAULT_OPTIONS, ...options };
|
||||
|
||||
// Detect code blocks (triple backticks)
|
||||
const codeBlockRegex = /```[\s\S]*?```/g;
|
||||
const codeBlocks: { start: number; end: number; content: string }[] = [];
|
||||
let match;
|
||||
|
||||
while ((match = codeBlockRegex.exec(content)) !== null) {
|
||||
codeBlocks.push({
|
||||
start: match.index,
|
||||
end: match.index + match[0].length,
|
||||
content: match[0],
|
||||
});
|
||||
}
|
||||
|
||||
// If no code blocks, use standard chunking
|
||||
if (codeBlocks.length === 0) {
|
||||
return chunkText(content, options);
|
||||
}
|
||||
|
||||
// Split content around code blocks
|
||||
const chunks: TextChunk[] = [];
|
||||
let chunkIndex = 0;
|
||||
let currentPosition = 0;
|
||||
|
||||
for (const codeBlock of codeBlocks) {
|
||||
// Chunk text before code block
|
||||
const textBefore = content.substring(currentPosition, codeBlock.start);
|
||||
if (textBefore.trim().length > 0) {
|
||||
const textChunks = chunkText(textBefore, opts);
|
||||
for (const chunk of textChunks) {
|
||||
chunks.push({
|
||||
...chunk,
|
||||
index: chunkIndex++,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add code block as its own chunk (or split if too large)
|
||||
const codeTokens = estimateTokens(codeBlock.content);
|
||||
if (codeTokens <= opts.maxTokens) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: codeBlock.content,
|
||||
estimatedTokens: codeTokens,
|
||||
});
|
||||
} else {
|
||||
// Code block is too large, split by lines
|
||||
const codeLines = codeBlock.content.split('\n');
|
||||
let currentCodeChunk = '';
|
||||
for (const line of codeLines) {
|
||||
const potentialChunk = currentCodeChunk
|
||||
? `${currentCodeChunk}\n${line}`
|
||||
: line;
|
||||
if (estimateTokens(potentialChunk) > opts.maxTokens) {
|
||||
if (currentCodeChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentCodeChunk,
|
||||
estimatedTokens: estimateTokens(currentCodeChunk),
|
||||
});
|
||||
}
|
||||
currentCodeChunk = line;
|
||||
} else {
|
||||
currentCodeChunk = potentialChunk;
|
||||
}
|
||||
}
|
||||
if (currentCodeChunk.length > 0) {
|
||||
chunks.push({
|
||||
index: chunkIndex++,
|
||||
text: currentCodeChunk,
|
||||
estimatedTokens: estimateTokens(currentCodeChunk),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
currentPosition = codeBlock.end;
|
||||
}
|
||||
|
||||
// Chunk remaining text after last code block
|
||||
const textAfter = content.substring(currentPosition);
|
||||
if (textAfter.trim().length > 0) {
|
||||
const textChunks = chunkText(textAfter, opts);
|
||||
for (const chunk of textChunks) {
|
||||
chunks.push({
|
||||
...chunk,
|
||||
index: chunkIndex++,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user