VIBN Frontend for Coolify deployment

This commit is contained in:
2026-02-15 19:25:52 -08:00
commit 40bf8428cd
398 changed files with 76513 additions and 0 deletions

297
lib/ai/chunking.ts Normal file
View File

@@ -0,0 +1,297 @@
/**
 * Text chunking for semantic search
 *
 * Splits large documents into smaller, semantically coherent chunks
 * suitable for vector embedding and retrieval.
 */
export interface TextChunk {
  /** Index of this chunk (0-based, sequential within one chunking call) */
  index: number;
  /** The chunked text content */
  text: string;
  /** Approximate token count (char-count heuristic, not a real tokenizer) */
  estimatedTokens: number;
}
/** Tuning knobs for chunkText / chunkTextWithCodeAwareness; all optional, see DEFAULT_OPTIONS. */
export interface ChunkingOptions {
  /** Target maximum tokens per chunk (approximate; see estimateTokens) */
  maxTokens?: number;
  /** Target maximum characters per chunk (fallback if no tokenizer) */
  maxChars?: number;
  /** Overlap between chunks (in characters) */
  overlapChars?: number;
  /** Whether to try preserving paragraph boundaries */
  preserveParagraphs?: boolean;
}
/** Defaults merged under caller-supplied options in both chunking entry points. */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxTokens: 800,
  maxChars: 3000, // Rough approximation: ~4 chars per token
  overlapChars: 200,
  preserveParagraphs: true,
};
/**
 * Estimate token count from character count.
 *
 * Heuristic only: assumes roughly 4 characters per token, a reasonable
 * average for English text. Swap in a real tokenizer (e.g. tiktoken)
 * when exact counts matter.
 */
function estimateTokens(text: string): number {
  const CHARS_PER_TOKEN = 4;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
/**
 * Break text on blank lines (two or more consecutive newlines).
 *
 * Whitespace-only fragments are dropped; surviving paragraphs are
 * returned as-is (not trimmed).
 */
function splitIntoParagraphs(text: string): string[] {
  const paragraphs = text.split(/\n{2,}/);
  return paragraphs.filter((para) => para.trim() !== '');
}
/**
 * Split text into sentences using a simple heuristic.
 *
 * Splits on runs of `.`, `!`, `?` followed by whitespace, so interior
 * sentence-ending punctuation is consumed; punctuation at the very end
 * of the text (no trailing whitespace) is kept on the last sentence.
 */
function splitIntoSentences(text: string): string[] {
  const sentences: string[] = [];
  for (const part of text.split(/[.!?]+\s+/)) {
    const trimmed = part.trim();
    if (trimmed) {
      sentences.push(trimmed);
    }
  }
  return sentences;
}
/**
 * Chunk text into semantic pieces suitable for embedding
 *
 * Strategy:
 * 1. Split by paragraphs (if preserveParagraphs = true), else sentences
 * 2. Group units until reaching maxTokens/maxChars
 * 3. Add overlap between chunks for context continuity
 *
 * Note: a single unit larger than the limits is emitted as its own
 * oversized chunk — units are never split further here.
 *
 * @param content - Text to chunk
 * @param options - Chunking options (merged over DEFAULT_OPTIONS)
 * @returns Array of text chunks with metadata; empty array for blank input
 *
 * @example
 * ```typescript
 * const chunks = chunkText(longDocument, { maxTokens: 500, overlapChars: 100 });
 * for (const chunk of chunks) {
 *   console.log(`Chunk ${chunk.index}: ${chunk.estimatedTokens} tokens`);
 *   await embedText(chunk.text);
 * }
 * ```
 */
export function chunkText(
  content: string,
  options: ChunkingOptions = {}
): TextChunk[] {
  const opts = { ...DEFAULT_OPTIONS, ...options };
  const chunks: TextChunk[] = [];

  if (!content || content.trim().length === 0) {
    return chunks;
  }

  const cleanedContent = content.trim();

  // Small enough to embed whole — no splitting needed.
  if (estimateTokens(cleanedContent) <= opts.maxTokens) {
    return [
      {
        index: 0,
        text: cleanedContent,
        estimatedTokens: estimateTokens(cleanedContent),
      },
    ];
  }

  // Split into paragraphs or sentences
  const units = opts.preserveParagraphs
    ? splitIntoParagraphs(cleanedContent)
    : splitIntoSentences(cleanedContent);

  if (units.length === 0) {
    return [
      {
        index: 0,
        text: cleanedContent,
        estimatedTokens: estimateTokens(cleanedContent),
      },
    ];
  }

  let currentChunk = '';
  let chunkIndex = 0;
  let previousOverlap = '';

  for (const unit of units) {
    // FIX: the overlap used to be glued directly onto the unit with no
    // separator, fusing the overlap's last word to the unit's first word.
    // Join with a paragraph break instead.
    const potentialChunk = currentChunk
      ? `${currentChunk}\n\n${unit}`
      : previousOverlap
        ? `${previousOverlap}\n\n${unit}`
        : unit;

    // Check if adding this unit would exceed limits
    if (
      estimateTokens(potentialChunk) > opts.maxTokens ||
      potentialChunk.length > opts.maxChars
    ) {
      // Save current chunk if it has content
      if (currentChunk.length > 0) {
        chunks.push({
          index: chunkIndex++,
          text: currentChunk,
          estimatedTokens: estimateTokens(currentChunk),
        });

        // Carry the tail of this chunk into the next for continuity.
        const overlapStart = Math.max(
          0,
          currentChunk.length - opts.overlapChars
        );
        let overlap = currentChunk.substring(overlapStart);
        // FIX: start the overlap on a word boundary. The old check looked
        // at the END of the overlap (endsWith(' ')), which says nothing
        // about whether the cut landed mid-word at the START; inspect the
        // character immediately before the cut point instead.
        if (
          overlapStart > 0 &&
          !/\s/.test(currentChunk.charAt(overlapStart - 1))
        ) {
          const firstSpace = overlap.indexOf(' ');
          if (firstSpace > 0) {
            overlap = overlap.substring(firstSpace + 1);
          }
        }
        previousOverlap = overlap;
      }

      // Start new chunk seeded with the overlap plus the current unit
      currentChunk = previousOverlap
        ? `${previousOverlap}\n\n${unit}`
        : unit;
    } else {
      // Add unit to current chunk
      currentChunk = potentialChunk;
    }
  }

  // Add final chunk if it has content
  // (FIX: removed stray console.log — library code should not log on every call)
  if (currentChunk.length > 0) {
    chunks.push({
      index: chunkIndex++,
      text: currentChunk,
      estimatedTokens: estimateTokens(currentChunk),
    });
  }

  return chunks;
}
/**
 * Chunk text with code-aware splitting
 *
 * Detects fenced code blocks (triple backticks) and keeps each one as a
 * single chunk where possible; oversized fences are split on line
 * boundaries. Prose between fences goes through the standard chunker.
 * Useful for chunking AI chat transcripts that contain code snippets.
 */
export function chunkTextWithCodeAwareness(
  content: string,
  options: ChunkingOptions = {}
): TextChunk[] {
  const opts = { ...DEFAULT_OPTIONS, ...options };

  // Locate every ```...``` fence with its character span.
  const fencePattern = /```[\s\S]*?```/g;
  const fences: { start: number; end: number; content: string }[] = [];
  for (let m = fencePattern.exec(content); m !== null; m = fencePattern.exec(content)) {
    fences.push({
      start: m.index,
      end: m.index + m[0].length,
      content: m[0],
    });
  }

  // No fences at all — fall back to the standard chunker.
  if (fences.length === 0) {
    return chunkText(content, options);
  }

  const result: TextChunk[] = [];
  let nextIndex = 0;

  // Append one chunk with the next sequential index.
  const pushChunk = (text: string): void => {
    result.push({
      index: nextIndex++,
      text,
      estimatedTokens: estimateTokens(text),
    });
  };

  // Run prose through the standard chunker, re-numbering its output.
  const pushProse = (text: string): void => {
    if (text.trim().length === 0) {
      return;
    }
    for (const piece of chunkText(text, opts)) {
      result.push({ ...piece, index: nextIndex++ });
    }
  };

  // Emit a fence whole if it fits, otherwise split it line by line.
  const pushCode = (code: string): void => {
    if (estimateTokens(code) <= opts.maxTokens) {
      pushChunk(code);
      return;
    }
    let buffer = '';
    for (const line of code.split('\n')) {
      const candidate = buffer ? `${buffer}\n${line}` : line;
      if (estimateTokens(candidate) > opts.maxTokens) {
        if (buffer.length > 0) {
          pushChunk(buffer);
        }
        buffer = line;
      } else {
        buffer = candidate;
      }
    }
    if (buffer.length > 0) {
      pushChunk(buffer);
    }
  };

  // Walk the document: prose segment, fence, prose segment, fence, ...
  let cursor = 0;
  for (const fence of fences) {
    pushProse(content.substring(cursor, fence.start));
    pushCode(fence.content);
    cursor = fence.end;
  }
  pushProse(content.substring(cursor));

  return result;
}