diff --git a/lib/db/agent-telemetry-schema.sql b/lib/db/agent-telemetry-schema.sql
new file mode 100644
index 00000000..d8254cda
--- /dev/null
+++ b/lib/db/agent-telemetry-schema.sql
@@ -0,0 +1,32 @@
+-- =====================================================================
+-- agent_telemetry table: stores full AI turn telemetry for diagnostics
+-- and as training data for fine-tuning.
+-- =====================================================================
+--
+-- Each row captures one model turn: the exact context the model saw
+-- (system prompt + chat history) and the exact output it produced
+-- (text, thoughts, tool calls), plus token/latency metrics.
+--
+-- JSONB columns let you export clean fine-tuning datasets later, e.g.
+--   SELECT system_prompt, chat_history, response_text, tool_calls FROM agent_telemetry;
+
+CREATE TABLE IF NOT EXISTS agent_telemetry (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    project_id VARCHAR(255),
+    model_used VARCHAR(255) NOT NULL,
+    system_prompt TEXT NOT NULL,
+    chat_history JSONB NOT NULL,
+    response_text TEXT,
+    response_thoughts TEXT,
+    tool_calls JSONB,
+    prompt_tokens INTEGER,
+    completion_tokens INTEGER,
+    total_tokens INTEGER,
+    duration_ms INTEGER NOT NULL
+);
+
+-- Index for fast querying by project
+CREATE INDEX IF NOT EXISTS idx_agent_telemetry_project ON agent_telemetry(project_id);
+-- Index for chronological sorting
+CREATE INDEX IF NOT EXISTS idx_agent_telemetry_created_at ON agent_telemetry(created_at DESC);
diff --git a/lib/db/knowledge-chunks-schema.sql b/lib/db/knowledge-chunks-schema.sql
index 5cf552bb..12e00fb2 100644
--- a/lib/db/knowledge-chunks-schema.sql
+++ b/lib/db/knowledge-chunks-schema.sql
@@ -1,21 +1,128 @@
--- Add the telemetry table script alongside your existing db scripts
-CREATE TABLE IF NOT EXISTS agent_telemetry (
+-- =====================================================================
+-- knowledge_chunks table: Stores chunked content with vector embeddings
+-- =====================================================================
+--
+-- This table stores semantic chunks of knowledge_items for vector search.
+-- Each chunk is embedded using an LLM embedding model (e.g., Gemini embeddings)
+-- and stored with pgvector for efficient similarity search.
+--
+-- Prerequisites:
+--   1. Enable pgvector extension: CREATE EXTENSION IF NOT EXISTS vector;
+--   2. UUID generation: gen_random_uuid() is built in on PostgreSQL 13+;
+--      older servers get it from pgcrypto (uuid-ossp does NOT provide it).
+--
+
+-- Enable required extensions
+CREATE EXTENSION IF NOT EXISTS vector;
+-- Supplies gen_random_uuid() on PostgreSQL < 13; harmless on newer servers.
+CREATE EXTENSION IF NOT EXISTS pgcrypto;
+-- Not needed by this table (it never calls uuid_generate_v*); kept so that
+-- anything already relying on this script to install it keeps working.
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+
+-- Create the knowledge_chunks table
+CREATE TABLE IF NOT EXISTS knowledge_chunks (
+    -- Primary key (UUID auto-generated)
     id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-    project_id VARCHAR(255),
-    model_used VARCHAR(255) NOT NULL,
-    system_prompt TEXT NOT NULL,
-    chat_history JSONB NOT NULL,
-    response_text TEXT,
-    response_thoughts TEXT,
-    tool_calls JSONB,
-    prompt_tokens INTEGER,
-    completion_tokens INTEGER,
-    total_tokens INTEGER,
-    duration_ms INTEGER NOT NULL
+
+    -- References to parent entities (Firestore IDs stored as TEXT)
+    project_id TEXT NOT NULL,
+    knowledge_item_id TEXT NOT NULL,
+
+    -- Chunk metadata
+    chunk_index INT NOT NULL,
+    content TEXT NOT NULL,
+
+    -- Vector embedding (768 dimensions for Gemini text-embedding-004)
+    -- NOTE: OpenAI embeddings use 1536 dims, but Gemini uses 768
+    embedding VECTOR(768) NOT NULL,
+
+    -- Source and importance metadata (optional, from knowledge_items)
+    source_type TEXT,
+    importance TEXT CHECK (importance IN ('primary', 'supporting', 'irrelevant') OR importance IS NULL),
+
+    -- Timestamps
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
 );
 
--- Index for fast querying by project
-CREATE INDEX IF NOT EXISTS idx_agent_telemetry_project ON agent_telemetry(project_id);
--- Index for chronological sorting
-CREATE INDEX IF NOT EXISTS idx_agent_telemetry_created_at ON agent_telemetry(created_at DESC);
+-- =====================================================================
+-- Indexes for efficient querying
+-- =====================================================================
+
+-- Standard indexes for filtering by project and knowledge_item
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_id
+    ON knowledge_chunks (project_id);
+
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_knowledge_item_id
+    ON knowledge_chunks (knowledge_item_id);
+
+-- Composite index for project + knowledge_item queries
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_knowledge
+    ON knowledge_chunks (project_id, knowledge_item_id);
+
+-- Index for chunk ordering within a knowledge_item
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_item_index
+    ON knowledge_chunks (knowledge_item_id, chunk_index);
+
+-- Vector similarity index using IVFFlat (pgvector)
+-- This enables fast approximate nearest neighbor search
+-- The 'lists' parameter controls the number of clusters (tune based on data size)
+-- For < 100k rows, lists=100 is reasonable. Scale up for larger datasets.
+-- Using cosine distance (vector_cosine_ops) for semantic similarity
+-- NOTE: IVFFlat picks its cluster centroids when the index is built. On a
+-- fresh database this statement runs against an empty table, so run
+-- REINDEX INDEX idx_knowledge_chunks_embedding; after the initial bulk load.
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding
+    ON knowledge_chunks
+    USING ivfflat (embedding vector_cosine_ops)
+    WITH (lists = 100);
+
+-- Alternative: Use HNSW index for better recall at higher cost
+-- Uncomment if you prefer HNSW over IVFFlat:
+-- CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding_hnsw
+--   ON knowledge_chunks
+--   USING hnsw (embedding vector_cosine_ops)
+--   WITH (m = 16, ef_construction = 64);
+
+-- =====================================================================
+-- Optional: Trigger to auto-update updated_at timestamp
+-- =====================================================================
+CREATE OR REPLACE FUNCTION update_updated_at_column()
+RETURNS TRIGGER AS $$
+BEGIN
+    NEW.updated_at = NOW();
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- CREATE TRIGGER has no IF NOT EXISTS form, so drop any earlier copy first;
+-- otherwise re-running this script fails with "trigger already exists".
+DROP TRIGGER IF EXISTS update_knowledge_chunks_updated_at ON knowledge_chunks;
+CREATE TRIGGER update_knowledge_chunks_updated_at
+    BEFORE UPDATE ON knowledge_chunks
+    FOR EACH ROW
+    EXECUTE FUNCTION update_updated_at_column();
+
+-- =====================================================================
+-- Helpful queries for monitoring and debugging
+-- =====================================================================
+
+-- Count chunks per project
+-- SELECT project_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY project_id;
+
+-- Count chunks per knowledge_item
+-- SELECT knowledge_item_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY knowledge_item_id;
+
+-- Find chunks similar to a query vector (example)
+-- SELECT id, content, 1 - (embedding <=> '[0.1, 0.2, ...]') AS similarity
+-- FROM knowledge_chunks
+-- WHERE project_id = 'your-project-id'
+-- ORDER BY embedding <=> '[0.1, 0.2, ...]'
+-- LIMIT 10;
+
+-- Check index usage
+-- SELECT schemaname, relname, indexrelname, idx_scan, idx_tup_read, idx_tup_fetch
+-- FROM pg_stat_user_indexes
+-- WHERE relname = 'knowledge_chunks';
+