fix(telemetry): restore knowledge_chunks schema; move agent_telemetry DDL to its own file

2026-06-10 16:23:10 -07:00
parent 4f76b0f3b7
commit caab38f950
2 changed files with 146 additions and 18 deletions
--- a/vibn-frontend/lib/db/agent-telemetry-schema.sql
+++ b/vibn-frontend/lib/db/agent-telemetry-schema.sql
@@ -0,0 +1,32 @@
+-- =====================================================================
+-- agent_telemetry table: stores full AI turn telemetry for diagnostics
+-- and as training data for fine-tuning.
+-- =====================================================================
+--
+-- Each row captures one model turn: the exact context the model saw
+-- (system prompt + chat history) and the exact output it produced
+-- (text, thoughts, tool calls), plus token/latency metrics.
+--
+-- JSONB columns let you export clean fine-tuning datasets later, e.g.
+--   SELECT system_prompt, chat_history, response_text, tool_calls FROM agent_telemetry;
+
+CREATE TABLE IF NOT EXISTS agent_telemetry (
+  id                UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+  created_at        TIMESTAMPTZ DEFAULT NOW(),        -- NOTE(review): nullable as written; consider NOT NULL
+  project_id        CHARACTER VARYING(255),           -- nullable: a turn need not belong to a project
+  model_used        CHARACTER VARYING(255) NOT NULL,
+  system_prompt     TEXT NOT NULL,                    -- input side: prompt the model saw
+  chat_history      JSONB NOT NULL,                   -- input side: chat history the model saw
+  response_text     TEXT,                             -- output side: text
+  response_thoughts TEXT,                             -- output side: thoughts
+  tool_calls        JSONB,                            -- output side: tool calls
+  prompt_tokens     INT,
+  completion_tokens INT,
+  total_tokens      INT,
+  duration_ms       INT NOT NULL                      -- latency of the turn, in milliseconds
+);
+
+-- Lookup of telemetry rows belonging to one project
+CREATE INDEX IF NOT EXISTS idx_agent_telemetry_project ON agent_telemetry USING btree (project_id);
+-- Newest-first ordering by creation time
+CREATE INDEX IF NOT EXISTS idx_agent_telemetry_created_at ON agent_telemetry USING btree (created_at DESC);
--- a/vibn-frontend/lib/db/knowledge-chunks-schema.sql
+++ b/vibn-frontend/lib/db/knowledge-chunks-schema.sql
@@ -1,21 +1,117 @@
-- Add the telemetry table script alongside your existing db scripts
-CREATE TABLE IF NOT EXISTS agent_telemetry (
+-- =====================================================================
+-- knowledge_chunks table: Stores chunked content with vector embeddings
+-- =====================================================================
+-- 
+-- This table stores semantic chunks of knowledge_items for vector search.
+-- Each chunk is embedded using an LLM embedding model (e.g., Gemini embeddings)
+-- and stored with pgvector for efficient similarity search.
+--
+-- Prerequisites (both are enabled by the statements below):
+-- 1. pgvector, for the VECTOR column type and the ivfflat index: extension "vector"
+-- 2. "uuid-ossp". NOTE(review): the id default calls gen_random_uuid(), which comes from
+--    core PostgreSQL 13+ (pgcrypto before that), not uuid-ossp — confirm this extension is still needed.
+
+-- Enable required extensions
+CREATE EXTENSION IF NOT EXISTS vector;
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+
+-- knowledge_chunks: one row per embedded chunk of a knowledge_item
+CREATE TABLE IF NOT EXISTS knowledge_chunks (
+  -- Primary key; gen_random_uuid() supplies a value when none is given
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-  created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
-  project_id VARCHAR(255),
-  model_used VARCHAR(255) NOT NULL,
-  system_prompt TEXT NOT NULL,
-  chat_history JSONB NOT NULL,
-  response_text TEXT,
-  response_thoughts TEXT,
-  tool_calls JSONB,
-  prompt_tokens INTEGER,
-  completion_tokens INTEGER,
-  total_tokens INTEGER,
-  duration_ms INTEGER NOT NULL
+  
+  -- References to parent entities (Firestore IDs stored as TEXT)
+  project_id TEXT NOT NULL,
+  knowledge_item_id TEXT NOT NULL,
+  
+  -- Position of the chunk within its knowledge_item, and the chunk text
+  chunk_index INT NOT NULL,
+  content TEXT NOT NULL,
+  
+  -- Vector embedding (768 dimensions for Gemini text-embedding-004)
+  -- NOTE: OpenAI embeddings use 1536 dims, but Gemini uses 768
+  embedding VECTOR(768) NOT NULL,
+  
+  -- Source and importance metadata (optional, from knowledge_items); a non-NULL importance must be one of the listed labels
+  source_type TEXT,
+  importance TEXT CHECK (importance IN ('primary', 'supporting', 'irrelevant') OR importance IS NULL),
+  
+  -- Timestamps: both default to NOW() on insert; the BEFORE UPDATE trigger later in this file refreshes updated_at
+  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
 );

-- Index for fast querying by project
-CREATE INDEX IF NOT EXISTS idx_agent_telemetry_project ON agent_telemetry(project_id);
-- Index for chronological sorting
-CREATE INDEX IF NOT EXISTS idx_agent_telemetry_created_at ON agent_telemetry(created_at DESC);
+-- =====================================================================
+-- Indexes for efficient querying
+-- =====================================================================
+
+-- Single-column B-tree indexes: filter by project, or by parent knowledge_item
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_id
+  ON knowledge_chunks USING btree (project_id);
+
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_knowledge_item_id
+  ON knowledge_chunks USING btree (knowledge_item_id);
+
+-- Two-column index for filters on project_id together with knowledge_item_id
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_knowledge
+  ON knowledge_chunks USING btree (project_id, knowledge_item_id);
+
+-- Two-column index for reading one knowledge_item's chunks in chunk_index order
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_item_index
+  ON knowledge_chunks USING btree (knowledge_item_id, chunk_index);
+
+-- Vector similarity index using IVFFlat (pgvector) for fast approximate nearest
+-- neighbor search, using cosine distance (vector_cosine_ops) for semantic similarity.
+-- 'lists' is the number of clusters: 100 is reasonable below ~100k rows; scale up for larger datasets.
+-- NOTE(review): pgvector's README advises creating IVFFlat indexes after the table holds
+-- some data — confirm this is run (or the index rebuilt) after the initial load.
+CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding 
+  ON knowledge_chunks 
+  USING ivfflat (embedding vector_cosine_ops) 
+  WITH (lists = 100);
+
+-- Alternative: Use HNSW index for better recall at higher cost
+-- Uncomment if you prefer HNSW over IVFFlat:
+-- CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding_hnsw 
+--   ON knowledge_chunks 
+--   USING hnsw (embedding vector_cosine_ops) 
+--   WITH (m = 16, ef_construction = 64);
+
+-- =====================================================================
+-- Optional: keep updated_at current. This trigger function stamps NEW.updated_at and returns NEW.
+-- =====================================================================
+CREATE OR REPLACE FUNCTION update_updated_at_column()
+RETURNS trigger LANGUAGE plpgsql AS $body$
+BEGIN
+  NEW.updated_at := CURRENT_TIMESTAMP;  -- overwrite whatever value the statement supplied
+  RETURN NEW;                           -- hand the modified row back to the caller
+END;
+$body$;
+
+-- Drop-then-create keeps this script re-runnable: a bare CREATE TRIGGER fails if the trigger already exists.
+DROP TRIGGER IF EXISTS update_knowledge_chunks_updated_at ON knowledge_chunks;
+CREATE TRIGGER update_knowledge_chunks_updated_at
+  BEFORE UPDATE ON knowledge_chunks FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+-- =====================================================================
+-- Helpful queries for monitoring and debugging
+-- =====================================================================
+
+-- Count chunks per project
+-- SELECT project_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY project_id;
+
+-- Count chunks per knowledge_item
+-- SELECT knowledge_item_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY knowledge_item_id;
+
+-- Find chunks similar to a query vector (example)
+-- SELECT id, content, 1 - (embedding <=> '[0.1, 0.2, ...]') AS similarity
+-- FROM knowledge_chunks
+-- WHERE project_id = 'your-project-id'
+-- ORDER BY embedding <=> '[0.1, 0.2, ...]'
+-- LIMIT 10;
+
+-- Check index usage
+-- SELECT schemaname, tablename, indexname, idx_scan, idx_tup_read, idx_tup_fetch
+-- FROM pg_stat_user_indexes
+-- WHERE tablename = 'knowledge_chunks';
+