fix(telemetry): restore knowledge_chunks schema; move agent_telemetry DDL to its own file

This commit is contained in:
2026-06-10 16:23:10 -07:00
parent 4f76b0f3b7
commit caab38f950
2 changed files with 146 additions and 18 deletions

View File

@@ -0,0 +1,32 @@
-- =====================================================================
-- agent_telemetry table: stores full AI turn telemetry for diagnostics
-- and as training data for fine-tuning.
-- =====================================================================
--
-- Each row captures one model turn: the exact context the model saw
-- (system prompt + chat history) and the exact output it produced
-- (text, thoughts, tool calls), plus token/latency metrics.
--
-- JSONB columns let you export clean fine-tuning datasets later, e.g.
-- SELECT system_prompt, chat_history, response_text, tool_calls FROM agent_telemetry;
CREATE TABLE IF NOT EXISTS agent_telemetry (
    -- Row identity and insert time; both are filled in by the database
    -- when the writer omits them.
    id                UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    created_at        TIMESTAMPTZ  DEFAULT CURRENT_TIMESTAMP,

    -- Optional project tag; nullable.
    project_id        VARCHAR(255),

    -- What the model saw for this turn (all required).
    model_used        VARCHAR(255) NOT NULL,
    system_prompt     TEXT         NOT NULL,
    chat_history      JSONB        NOT NULL,

    -- What the model produced; each part is optional.
    response_text     TEXT,
    response_thoughts TEXT,
    tool_calls        JSONB,

    -- Token counts are optional; the turn duration (milliseconds, per the
    -- column name) is required.
    prompt_tokens     INT,
    completion_tokens INT,
    total_tokens      INT,
    duration_ms       INT          NOT NULL
);
-- Supports lookups that filter telemetry rows by project_id.
CREATE INDEX IF NOT EXISTS idx_agent_telemetry_project
    ON agent_telemetry USING btree (project_id);
-- Supports newest-first ordering on created_at.
CREATE INDEX IF NOT EXISTS idx_agent_telemetry_created_at
    ON agent_telemetry USING btree (created_at DESC);

View File

@@ -1,21 +1,117 @@
-- Add the telemetry table script alongside your existing db scripts
CREATE TABLE IF NOT EXISTS agent_telemetry (
-- =====================================================================
-- knowledge_chunks table: Stores chunked content with vector embeddings
-- =====================================================================
--
-- This table stores semantic chunks of knowledge_items for vector search.
-- Each chunk is embedded using an LLM embedding model (e.g., Gemini embeddings)
-- and stored with pgvector for efficient similarity search.
--
-- Prerequisites:
-- 1. Enable pgvector extension: CREATE EXTENSION IF NOT EXISTS vector;
-- 2. Enable uuid generation: CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
--
-- Extensions this script installs; each statement is a no-op when the
-- extension is already present.
-- NOTE(review): the id default in the table below calls gen_random_uuid(),
-- which uuid-ossp does not appear to provide (it ships the uuid_generate_*()
-- family) -- confirm whether this extension is still needed here.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- pgvector: needed for the VECTOR column type and the ivfflat index method.
CREATE EXTENSION IF NOT EXISTS vector;
-- Create the knowledge_chunks table
-- One row per chunk of a knowledge item, stored together with its embedding.
CREATE TABLE IF NOT EXISTS knowledge_chunks (
-- Primary key (UUID auto-generated)
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- NOTE(review): the columns from created_at through duration_ms below repeat
-- the agent_telemetry column list verbatim and look like the removed side of
-- this diff hunk rather than knowledge_chunks columns -- confirm against the
-- committed file. Read literally as one statement, project_id and created_at
-- are each declared twice and duration_ms is missing its trailing comma.
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
project_id VARCHAR(255),
model_used VARCHAR(255) NOT NULL,
system_prompt TEXT NOT NULL,
chat_history JSONB NOT NULL,
response_text TEXT,
response_thoughts TEXT,
tool_calls JSONB,
prompt_tokens INTEGER,
completion_tokens INTEGER,
total_tokens INTEGER,
duration_ms INTEGER NOT NULL
-- References to parent entities (Firestore IDs stored as TEXT)
project_id TEXT NOT NULL,
knowledge_item_id TEXT NOT NULL,
-- Chunk metadata
-- chunk_index is indexed together with knowledge_item_id for chunk ordering
-- (see idx_knowledge_chunks_item_index).
chunk_index INT NOT NULL,
content TEXT NOT NULL,
-- Vector embedding (768 dimensions for Gemini text-embedding-004)
-- NOTE: OpenAI embeddings use 1536 dims, but Gemini uses 768
embedding VECTOR(768) NOT NULL,
-- Source and importance metadata (optional, from knowledge_items)
source_type TEXT,
-- importance may be NULL; otherwise only the three listed labels pass the check.
importance TEXT CHECK (importance IN ('primary', 'supporting', 'irrelevant') OR importance IS NULL),
-- Timestamps
-- Both default to the insert time.
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- NOTE(review): these two statements index agent_telemetry, not
-- knowledge_chunks; they repeat the index definitions that accompany the
-- agent_telemetry table DDL and look like the removed side of this diff hunk
-- -- confirm against the committed file.
-- Index for fast querying by project
CREATE INDEX IF NOT EXISTS idx_agent_telemetry_project ON agent_telemetry(project_id);
-- Index for chronological sorting
CREATE INDEX IF NOT EXISTS idx_agent_telemetry_created_at ON agent_telemetry(created_at DESC);
-- =====================================================================
-- Indexes for efficient querying
-- =====================================================================
-- B-tree indexes for the plain (non-vector) access paths.

-- Filter by project.
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_id
    ON knowledge_chunks USING btree (project_id);

-- Filter by parent knowledge item.
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_knowledge_item_id
    ON knowledge_chunks USING btree (knowledge_item_id);

-- Filter by project and knowledge item together.
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_project_knowledge
    ON knowledge_chunks USING btree (project_id, knowledge_item_id);

-- Read one knowledge item's chunks in chunk_index order.
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_item_index
    ON knowledge_chunks USING btree (knowledge_item_id, chunk_index);
-- Approximate nearest-neighbour index over the embedding column (pgvector
-- IVFFlat), built with the cosine-distance operator class so it serves
-- cosine-similarity searches.
-- lists sets how many clusters the index partitions vectors into; 100 is a
-- reasonable setting below roughly 100k rows and should be raised as the
-- table grows.
-- NOTE(review): pgvector's documentation advises creating IVFFlat indexes
-- after the table already holds data -- confirm whether this script runs
-- against an empty table and, if so, whether a later REINDEX is planned.
CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding
    ON knowledge_chunks
    USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
-- Alternative: Use HNSW index for better recall at higher cost
-- Uncomment if you prefer HNSW over IVFFlat:
-- CREATE INDEX IF NOT EXISTS idx_knowledge_chunks_embedding_hnsw
-- ON knowledge_chunks
-- USING hnsw (embedding vector_cosine_ops)
-- WITH (m = 16, ef_construction = 64);
-- =====================================================================
-- Optional: Trigger to auto-update updated_at timestamp
-- =====================================================================
-- Trigger function: overwrites updated_at on the row being written with the
-- current time, then returns that row so the write continues with the new
-- value. Takes no arguments.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $fn$
BEGIN
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$fn$;
-- Keeps updated_at current: before every row UPDATE on knowledge_chunks,
-- run update_updated_at_column().
-- CREATE TRIGGER has no IF NOT EXISTS form, so any earlier copy is dropped
-- first. Without this, re-running the script fails on the existing trigger
-- even though every other statement in it is re-runnable.
DROP TRIGGER IF EXISTS update_knowledge_chunks_updated_at ON knowledge_chunks;
CREATE TRIGGER update_knowledge_chunks_updated_at
    BEFORE UPDATE ON knowledge_chunks
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();
-- =====================================================================
-- Helpful queries for monitoring and debugging
-- =====================================================================
-- Count chunks per project
-- SELECT project_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY project_id;
-- Count chunks per knowledge_item
-- SELECT knowledge_item_id, COUNT(*) as chunk_count FROM knowledge_chunks GROUP BY knowledge_item_id;
-- Find chunks similar to a query vector (example)
-- SELECT id, content, 1 - (embedding <=> '[0.1, 0.2, ...]') AS similarity
-- FROM knowledge_chunks
-- WHERE project_id = 'your-project-id'
-- ORDER BY embedding <=> '[0.1, 0.2, ...]'
-- LIMIT 10;
-- Check index usage
-- SELECT schemaname, tablename, indexname, idx_scan, idx_tup_read, idx_tup_fetch
-- FROM pg_stat_user_indexes
-- WHERE tablename = 'knowledge_chunks';