-- ===================================================================== -- knowledge_chunks table: Stores chunked content with vector embeddings -- ===================================================================== -- -- This table stores semantic chunks of knowledge_items for vector search. -- Each chunk is embedded using an LLM embedding model (e.g., Gemini embeddings) -- and stored with pgvector for efficient similarity search. -- -- Prerequisites: -- 1. Enable pgvector extension: CREATE EXTENSION IF NOT EXISTS vector; -- 2. Enable uuid generation: CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -- -- Enable required extensions CREATE EXTENSION IF NOT EXISTS vector; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -- Create the knowledge_chunks table CREATE TABLE IF NOT EXISTS knowledge_chunks ( -- Primary key (UUID auto-generated) id UUID PRIMARY KEY DEFAULT gen_random_uuid(), -- References to parent entities (Firestore IDs stored as TEXT) project_id TEXT NOT NULL, knowledge_item_id TEXT NOT NULL, -- Chunk metadata chunk_index INT NOT NULL, content TEXT NOT NULL, -- Vector embedding (768 dimensions for Gemini text-embedding-004) -- NOTE: OpenAI embeddings use 1536 dims, but Gemini uses 768 embedding VECTOR(768) NOT NULL, -- Source and importance metadata (optional, from knowledge_items) source_type TEXT, importance TEXT CHECK (importance IN ('primary', 'supporting', 'irrelevant') OR importance IS NULL), -- Timestamps created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); -- ===================================================================== -- Indexes for efficient querying -- ===================================================================== -- Standard indexes for filtering by project and knowledge_item CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_id ON knowledge_chunks (project_id); CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_knowledge_item_id ON knowledge_chunks (knowledge_item_id); -- Composite index for project + knowledge_item queries CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_knowledge ON knowledge_chunks (project_id, knowledge_item_id); -- Index for chunk ordering within a knowledge_item CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_item_index ON knowledge_chunks (knowledge_item_id, chunk_index); -- Vector similarity index using IVFFlat (pgvector) -- This enables fast approximate nearest neighbor search -- The 'lists' parameter controls the number of clusters (tune based on data size) -- For < 100k rows, lists=100 is reasonable. Scale up for larger datasets. -- Using cosine distance (vector_cosine_ops) for semantic similarity CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding ON knowledge_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); -- Alternative: Use HNSW index for better recall at higher cost -- Uncomment if you prefer HNSW over IVFFlat: -- CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding_hnsw -- ON knowledge_chunks -- USING hnsw (embedding vector_cosine_ops) -- WITH (m = 16, ef_construction = 64); -- ===================================================================== -- Optional: Trigger to auto-update updated_at timestamp -- ===================================================================== CREATE OR REPLACE FUNCTION update_updated_at_column() RETURNS TRIGGER AS $$ BEGIN NEW.updated_at = NOW(); RETURN NEW; END; $$ LANGUAGE plpgsql; CREATE TRIGGER update_knowledge_chunks_updated_at BEFORE UPDATE ON knowledge_chunks FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); -- ===================================================================== -- Helpful queries for monitoring and debugging -- ===================================================================== -- Count chunks per project -- SELECT project_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY project_id; -- Count chunks per knowledge_item -- SELECT knowledge_item_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY knowledge_item_id; -- Find chunks similar to a query vector (example) -- SELECT id, content, 1 - (embedding <=> '[0.1, 0.2, ...]') AS similarity -- FROM knowledge_chunks -- WHERE project_id = 'your-project-id' -- ORDER BY embedding <=> '[0.1, 0.2, ...]' -- LIMIT 10; -- Check index usage -- SELECT schemaname, tablename, indexname, idx_scan, idx_tup_read, idx_tup_fetch -- FROM pg_stat_user_indexes -- WHERE tablename = 'knowledge_chunks';