/**
|
|
* Server-side helpers for AlloyDB vector memory operations
|
|
*
|
|
* Handles CRUD operations on knowledge_chunks and semantic search.
|
|
*/
|
|
|
|
import { getAlloyDbClient, executeQuery, getPooledClient } from '@/lib/db/alloydb';
|
|
import type {
|
|
KnowledgeChunk,
|
|
KnowledgeChunkRow,
|
|
KnowledgeChunkSearchResult,
|
|
VectorSearchOptions,
|
|
CreateKnowledgeChunkInput,
|
|
BatchCreateKnowledgeChunksInput,
|
|
} from '@/lib/types/vector-memory';
|
|
|
|
/**
|
|
* Convert database row (snake_case) to TypeScript object (camelCase)
|
|
*/
|
|
function rowToKnowledgeChunk(row: KnowledgeChunkRow): KnowledgeChunk {
|
|
return {
|
|
id: row.id,
|
|
projectId: row.project_id,
|
|
knowledgeItemId: row.knowledge_item_id,
|
|
chunkIndex: row.chunk_index,
|
|
content: row.content,
|
|
sourceType: row.source_type,
|
|
importance: row.importance,
|
|
createdAt: row.created_at,
|
|
updatedAt: row.updated_at,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Retrieve relevant knowledge chunks using vector similarity search
|
|
*
|
|
* @param projectId - Firestore project ID to filter by
|
|
* @param queryEmbedding - Vector embedding of the query (e.g., user's question)
|
|
* @param options - Search options (limit, filters, etc.)
|
|
* @returns Array of chunks ordered by similarity (most relevant first)
|
|
*
|
|
* @example
|
|
* ```typescript
|
|
* const embedding = await embedText("What's the MVP scope?");
|
|
* const chunks = await retrieveRelevantChunks('proj123', embedding, { limit: 10, minSimilarity: 0.7 });
|
|
* ```
|
|
*/
|
|
export async function retrieveRelevantChunks(
|
|
projectId: string,
|
|
queryEmbedding: number[],
|
|
options: VectorSearchOptions = {}
|
|
): Promise<KnowledgeChunkSearchResult[]> {
|
|
const {
|
|
limit = 10,
|
|
minSimilarity,
|
|
sourceTypes,
|
|
importanceLevels,
|
|
} = options;
|
|
|
|
try {
|
|
// Build the query with optional filters
|
|
let queryText = `
|
|
SELECT
|
|
id,
|
|
project_id,
|
|
knowledge_item_id,
|
|
chunk_index,
|
|
content,
|
|
source_type,
|
|
importance,
|
|
created_at,
|
|
updated_at,
|
|
1 - (embedding <=> $1::vector) AS similarity
|
|
FROM knowledge_chunks
|
|
WHERE project_id = $2
|
|
`;
|
|
|
|
const params: any[] = [JSON.stringify(queryEmbedding), projectId];
|
|
let paramIndex = 3;
|
|
|
|
// Filter by source types
|
|
if (sourceTypes && sourceTypes.length > 0) {
|
|
queryText += ` AND source_type = ANY($${paramIndex})`;
|
|
params.push(sourceTypes);
|
|
paramIndex++;
|
|
}
|
|
|
|
// Filter by importance levels
|
|
if (importanceLevels && importanceLevels.length > 0) {
|
|
queryText += ` AND importance = ANY($${paramIndex})`;
|
|
params.push(importanceLevels);
|
|
paramIndex++;
|
|
}
|
|
|
|
// Filter by minimum similarity
|
|
if (minSimilarity !== undefined) {
|
|
queryText += ` AND (1 - (embedding <=> $1::vector)) >= $${paramIndex}`;
|
|
params.push(minSimilarity);
|
|
paramIndex++;
|
|
}
|
|
|
|
// Order by similarity and limit
|
|
queryText += ` ORDER BY embedding <=> $1::vector LIMIT $${paramIndex}`;
|
|
params.push(limit);
|
|
|
|
const result = await executeQuery<KnowledgeChunkRow & { similarity: number }>(
|
|
queryText,
|
|
params
|
|
);
|
|
|
|
return result.rows.map((row) => ({
|
|
...rowToKnowledgeChunk(row),
|
|
similarity: row.similarity,
|
|
}));
|
|
} catch (error) {
|
|
console.error('[Vector Memory] Failed to retrieve relevant chunks:', error);
|
|
throw new Error(
|
|
`Failed to retrieve chunks: ${error instanceof Error ? error.message : String(error)}`
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Create a single knowledge chunk
|
|
*/
|
|
export async function createKnowledgeChunk(
|
|
input: CreateKnowledgeChunkInput
|
|
): Promise<KnowledgeChunk> {
|
|
const {
|
|
projectId,
|
|
knowledgeItemId,
|
|
chunkIndex,
|
|
content,
|
|
embedding,
|
|
sourceType = null,
|
|
importance = null,
|
|
} = input;
|
|
|
|
try {
|
|
const queryText = `
|
|
INSERT INTO knowledge_chunks (
|
|
project_id,
|
|
knowledge_item_id,
|
|
chunk_index,
|
|
content,
|
|
embedding,
|
|
source_type,
|
|
importance
|
|
)
|
|
VALUES ($1, $2, $3, $4, $5::vector, $6, $7)
|
|
RETURNING
|
|
id,
|
|
project_id,
|
|
knowledge_item_id,
|
|
chunk_index,
|
|
content,
|
|
source_type,
|
|
importance,
|
|
created_at,
|
|
updated_at
|
|
`;
|
|
|
|
const result = await executeQuery<KnowledgeChunkRow>(queryText, [
|
|
projectId,
|
|
knowledgeItemId,
|
|
chunkIndex,
|
|
content,
|
|
JSON.stringify(embedding),
|
|
sourceType,
|
|
importance,
|
|
]);
|
|
|
|
if (result.rows.length === 0) {
|
|
throw new Error('Failed to insert knowledge chunk');
|
|
}
|
|
|
|
return rowToKnowledgeChunk(result.rows[0]);
|
|
} catch (error) {
|
|
console.error('[Vector Memory] Failed to create knowledge chunk:', error);
|
|
throw new Error(
|
|
`Failed to create chunk: ${error instanceof Error ? error.message : String(error)}`
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Batch create multiple knowledge chunks efficiently
|
|
*
|
|
* Uses a transaction to ensure atomicity.
|
|
*/
|
|
export async function batchCreateKnowledgeChunks(
|
|
input: BatchCreateKnowledgeChunksInput
|
|
): Promise<KnowledgeChunk[]> {
|
|
const { projectId, knowledgeItemId, chunks } = input;
|
|
|
|
if (chunks.length === 0) {
|
|
return [];
|
|
}
|
|
|
|
const client = await getPooledClient();
|
|
|
|
try {
|
|
await client.query('BEGIN');
|
|
|
|
const createdChunks: KnowledgeChunk[] = [];
|
|
|
|
for (const chunk of chunks) {
|
|
const queryText = `
|
|
INSERT INTO knowledge_chunks (
|
|
project_id,
|
|
knowledge_item_id,
|
|
chunk_index,
|
|
content,
|
|
embedding,
|
|
source_type,
|
|
importance
|
|
)
|
|
VALUES ($1, $2, $3, $4, $5::vector, $6, $7)
|
|
RETURNING
|
|
id,
|
|
project_id,
|
|
knowledge_item_id,
|
|
chunk_index,
|
|
content,
|
|
source_type,
|
|
importance,
|
|
created_at,
|
|
updated_at
|
|
`;
|
|
|
|
const result = await client.query<KnowledgeChunkRow>(queryText, [
|
|
projectId,
|
|
knowledgeItemId,
|
|
chunk.chunkIndex,
|
|
chunk.content,
|
|
JSON.stringify(chunk.embedding),
|
|
chunk.sourceType ?? null,
|
|
chunk.importance ?? null,
|
|
]);
|
|
|
|
if (result.rows.length > 0) {
|
|
createdChunks.push(rowToKnowledgeChunk(result.rows[0]));
|
|
}
|
|
}
|
|
|
|
await client.query('COMMIT');
|
|
|
|
console.log(
|
|
`[Vector Memory] Batch created ${createdChunks.length} chunks for knowledge_item ${knowledgeItemId}`
|
|
);
|
|
|
|
return createdChunks;
|
|
} catch (error) {
|
|
await client.query('ROLLBACK');
|
|
console.error('[Vector Memory] Failed to batch create chunks:', error);
|
|
throw new Error(
|
|
`Failed to batch create chunks: ${error instanceof Error ? error.message : String(error)}`
|
|
);
|
|
} finally {
|
|
client.release();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Delete all chunks for a specific knowledge_item
|
|
*
|
|
* Used when regenerating chunks or removing a knowledge_item.
|
|
*/
|
|
export async function deleteChunksForKnowledgeItem(
|
|
knowledgeItemId: string
|
|
): Promise<number> {
|
|
try {
|
|
const queryText = `
|
|
DELETE FROM knowledge_chunks
|
|
WHERE knowledge_item_id = $1
|
|
RETURNING id
|
|
`;
|
|
|
|
const result = await executeQuery(queryText, [knowledgeItemId]);
|
|
|
|
console.log(
|
|
`[Vector Memory] Deleted ${result.rowCount ?? 0} chunks for knowledge_item ${knowledgeItemId}`
|
|
);
|
|
|
|
return result.rowCount ?? 0;
|
|
} catch (error) {
|
|
console.error('[Vector Memory] Failed to delete chunks:', error);
|
|
throw new Error(
|
|
`Failed to delete chunks: ${error instanceof Error ? error.message : String(error)}`
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Delete all chunks for a specific project
|
|
*
|
|
* Used when cleaning up or resetting a project.
|
|
*/
|
|
export async function deleteChunksForProject(projectId: string): Promise<number> {
|
|
try {
|
|
const queryText = `
|
|
DELETE FROM knowledge_chunks
|
|
WHERE project_id = $1
|
|
RETURNING id
|
|
`;
|
|
|
|
const result = await executeQuery(queryText, [projectId]);
|
|
|
|
console.log(
|
|
`[Vector Memory] Deleted ${result.rowCount ?? 0} chunks for project ${projectId}`
|
|
);
|
|
|
|
return result.rowCount ?? 0;
|
|
} catch (error) {
|
|
console.error('[Vector Memory] Failed to delete project chunks:', error);
|
|
throw new Error(
|
|
`Failed to delete project chunks: ${error instanceof Error ? error.message : String(error)}`
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get chunk count for a knowledge_item
|
|
*/
|
|
export async function getChunkCountForKnowledgeItem(
|
|
knowledgeItemId: string
|
|
): Promise<number> {
|
|
try {
|
|
const result = await executeQuery<{ count: string }>(
|
|
'SELECT COUNT(*) as count FROM knowledge_chunks WHERE knowledge_item_id = $1',
|
|
[knowledgeItemId]
|
|
);
|
|
|
|
return parseInt(result.rows[0]?.count ?? '0', 10);
|
|
} catch (error) {
|
|
console.error('[Vector Memory] Failed to get chunk count:', error);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get chunk count for a project
|
|
*/
|
|
export async function getChunkCountForProject(projectId: string): Promise<number> {
|
|
try {
|
|
const result = await executeQuery<{ count: string }>(
|
|
'SELECT COUNT(*) as count FROM knowledge_chunks WHERE project_id = $1',
|
|
[projectId]
|
|
);
|
|
|
|
return parseInt(result.rows[0]?.count ?? '0', 10);
|
|
} catch (error) {
|
|
console.error('[Vector Memory] Failed to get project chunk count:', error);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Regenerate knowledge_chunks for a single knowledge_item
|
|
*
|
|
* This is the main pipeline that:
|
|
* 1. Chunks the knowledge_item.content
|
|
* 2. Generates embeddings for each chunk
|
|
* 3. Deletes existing chunks for this item
|
|
* 4. Inserts new chunks into AlloyDB
|
|
*
|
|
* @param knowledgeItem - The knowledge item to process
|
|
*
|
|
* @example
|
|
* ```typescript
|
|
* const knowledgeItem = await getKnowledgeItem(projectId, itemId);
|
|
* await writeKnowledgeChunksForItem(knowledgeItem);
|
|
* ```
|
|
*/
|
|
export async function writeKnowledgeChunksForItem(
|
|
knowledgeItem: {
|
|
id: string;
|
|
projectId: string;
|
|
content: string;
|
|
sourceMeta?: { sourceType?: string; importance?: 'primary' | 'supporting' | 'irrelevant' };
|
|
}
|
|
): Promise<void> {
|
|
const { chunkText } = await import('@/lib/ai/chunking');
|
|
const { embedTextBatch } = await import('@/lib/ai/embeddings');
|
|
|
|
try {
|
|
console.log(
|
|
`[Vector Memory] Starting chunking pipeline for knowledge_item ${knowledgeItem.id}`
|
|
);
|
|
|
|
// Step 1: Chunk the content
|
|
const textChunks = chunkText(knowledgeItem.content, {
|
|
maxTokens: 800,
|
|
overlapChars: 200,
|
|
preserveParagraphs: true,
|
|
});
|
|
|
|
if (textChunks.length === 0) {
|
|
console.warn(
|
|
`[Vector Memory] No chunks generated for knowledge_item ${knowledgeItem.id} - content may be empty`
|
|
);
|
|
return;
|
|
}
|
|
|
|
console.log(
|
|
`[Vector Memory] Generated ${textChunks.length} chunks for knowledge_item ${knowledgeItem.id}`
|
|
);
|
|
|
|
// Step 2: Generate embeddings for all chunks
|
|
const chunkTexts = textChunks.map((chunk) => chunk.text);
|
|
const embeddings = await embedTextBatch(chunkTexts, {
|
|
delayMs: 50, // Small delay to avoid rate limiting
|
|
skipEmpty: true,
|
|
});
|
|
|
|
if (embeddings.length !== textChunks.length) {
|
|
throw new Error(
|
|
`Embedding count mismatch: got ${embeddings.length}, expected ${textChunks.length}`
|
|
);
|
|
}
|
|
|
|
// Step 3: Delete existing chunks for this knowledge_item
|
|
await deleteChunksForKnowledgeItem(knowledgeItem.id);
|
|
|
|
// Step 4: Insert new chunks
|
|
const chunksToInsert = textChunks.map((chunk, index) => ({
|
|
chunkIndex: chunk.index,
|
|
content: chunk.text,
|
|
embedding: embeddings[index],
|
|
sourceType: knowledgeItem.sourceMeta?.sourceType ?? null,
|
|
importance: knowledgeItem.sourceMeta?.importance ?? null,
|
|
}));
|
|
|
|
await batchCreateKnowledgeChunks({
|
|
projectId: knowledgeItem.projectId,
|
|
knowledgeItemId: knowledgeItem.id,
|
|
chunks: chunksToInsert,
|
|
});
|
|
|
|
console.log(
|
|
`[Vector Memory] Successfully processed ${chunksToInsert.length} chunks for knowledge_item ${knowledgeItem.id}`
|
|
);
|
|
} catch (error) {
|
|
console.error(
|
|
`[Vector Memory] Failed to write chunks for knowledge_item ${knowledgeItem.id}:`,
|
|
error
|
|
);
|
|
throw new Error(
|
|
`Failed to write chunks: ${error instanceof Error ? error.message : String(error)}`
|
|
);
|
|
}
|
|
}
|
|
|