118 lines
4.4 KiB
PL/PgSQL
118 lines
4.4 KiB
PL/PgSQL
-- =====================================================================
|
|
-- knowledge_chunks table: Stores chunked content with vector embeddings
|
|
-- =====================================================================
|
|
--
|
|
-- This table stores semantic chunks of knowledge_items for vector search.
|
|
-- Each chunk is embedded using an LLM embedding model (e.g., Gemini embeddings)
|
|
-- and stored with pgvector for efficient similarity search.
|
|
--
|
|
-- Prerequisites:
|
|
-- 1. Enable pgvector extension: CREATE EXTENSION IF NOT EXISTS vector;
|
|
-- 2. Enable uuid generation: CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
|
--
|
|
|
|
-- Enable required extensions
|
|
CREATE EXTENSION IF NOT EXISTS vector;
|
|
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
|
|
|
-- Create the knowledge_chunks table
|
|
CREATE TABLE IF NOT EXISTS knowledge_chunks (
|
|
-- Primary key (UUID auto-generated)
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
|
|
-- References to parent entities (Firestore IDs stored as TEXT)
|
|
project_id TEXT NOT NULL,
|
|
knowledge_item_id TEXT NOT NULL,
|
|
|
|
-- Chunk metadata
|
|
chunk_index INT NOT NULL,
|
|
content TEXT NOT NULL,
|
|
|
|
-- Vector embedding (768 dimensions for Gemini text-embedding-004)
|
|
-- NOTE: OpenAI embeddings use 1536 dims, but Gemini uses 768
|
|
embedding VECTOR(768) NOT NULL,
|
|
|
|
-- Source and importance metadata (optional, from knowledge_items)
|
|
source_type TEXT,
|
|
importance TEXT CHECK (importance IN ('primary', 'supporting', 'irrelevant') OR importance IS NULL),
|
|
|
|
-- Timestamps
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
|
);
|
|
|
|
-- =====================================================================
|
|
-- Indexes for efficient querying
|
|
-- =====================================================================
|
|
|
|
-- Standard indexes for filtering by project and knowledge_item
|
|
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_id
|
|
ON knowledge_chunks (project_id);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_knowledge_item_id
|
|
ON knowledge_chunks (knowledge_item_id);
|
|
|
|
-- Composite index for project + knowledge_item queries
|
|
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_knowledge
|
|
ON knowledge_chunks (project_id, knowledge_item_id);
|
|
|
|
-- Index for chunk ordering within a knowledge_item
|
|
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_item_index
|
|
ON knowledge_chunks (knowledge_item_id, chunk_index);
|
|
|
|
-- Vector similarity index using IVFFlat (pgvector)
|
|
-- This enables fast approximate nearest neighbor search
|
|
-- The 'lists' parameter controls the number of clusters (tune based on data size)
|
|
-- For < 100k rows, lists=100 is reasonable. Scale up for larger datasets.
|
|
-- Using cosine distance (vector_cosine_ops) for semantic similarity
|
|
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding
|
|
ON knowledge_chunks
|
|
USING ivfflat (embedding vector_cosine_ops)
|
|
WITH (lists = 100);
|
|
|
|
-- Alternative: Use HNSW index for better recall at higher cost
|
|
-- Uncomment if you prefer HNSW over IVFFlat:
|
|
-- CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding_hnsw
|
|
-- ON knowledge_chunks
|
|
-- USING hnsw (embedding vector_cosine_ops)
|
|
-- WITH (m = 16, ef_construction = 64);
|
|
|
|
-- =====================================================================
|
|
-- Optional: Trigger to auto-update updated_at timestamp
|
|
-- =====================================================================
|
|
CREATE OR REPLACE FUNCTION update_updated_at_column()
|
|
RETURNS TRIGGER AS $$
|
|
BEGIN
|
|
NEW.updated_at = NOW();
|
|
RETURN NEW;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
CREATE TRIGGER update_knowledge_chunks_updated_at
|
|
BEFORE UPDATE ON knowledge_chunks
|
|
FOR EACH ROW
|
|
EXECUTE FUNCTION update_updated_at_column();
|
|
|
|
-- =====================================================================
|
|
-- Helpful queries for monitoring and debugging
|
|
-- =====================================================================
|
|
|
|
-- Count chunks per project
|
|
-- SELECT project_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY project_id;
|
|
|
|
-- Count chunks per knowledge_item
|
|
-- SELECT knowledge_item_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY knowledge_item_id;
|
|
|
|
-- Find chunks similar to a query vector (example)
|
|
-- SELECT id, content, 1 - (embedding <=> '[0.1, 0.2, ...]') AS similarity
|
|
-- FROM knowledge_chunks
|
|
-- WHERE project_id = 'your-project-id'
|
|
-- ORDER BY embedding <=> '[0.1, 0.2, ...]'
|
|
-- LIMIT 10;
|
|
|
|
-- Check index usage
|
|
-- SELECT schemaname, tablename, indexname, idx_scan, idx_tup_read, idx_tup_fetch
|
|
-- FROM pg_stat_user_indexes
|
|
-- WHERE tablename = 'knowledge_chunks';
|
|
|