// vibn-frontend/lib/server/backend-extractor.ts
/**
* Backend Extraction Module
*
* Runs extraction as a pure backend job, not in chat.
* Called when Collector phase completes.
*/
import { getAdminDb } from '@/lib/firebase/admin';
import { GeminiLlmClient } from '@/lib/ai/gemini-client';
import { BACKEND_EXTRACTOR_SYSTEM_PROMPT } from '@/lib/ai/prompts/extractor';
import { writeKnowledgeChunksForItem } from '@/lib/server/vector-memory';
import type { ExtractionOutput, ExtractedInsight } from '@/lib/types/extraction-output';
import type { PhaseHandoff } from '@/lib/types/phase-handoff';
import { z } from 'zod';
// Confidence values are normalized scores in [0, 1].
const ConfidenceScoreSchema = z.number().min(0).max(1);
// A single insight extracted from one source document.
const ExtractedInsightSchema = z.object({
  id: z.string(),
  type: z.enum(["problem", "user", "feature", "constraint", "opportunity", "other"]),
  title: z.string(),
  description: z.string(),
  sourceText: z.string(),
  sourceKnowledgeItemId: z.string(),
  importance: z.enum(["primary", "supporting"]),
  confidence: ConfidenceScoreSchema,
});
// Runtime validator for the LLM's structured extraction response;
// mirrors the ExtractionOutput type from '@/lib/types/extraction-output'.
const ExtractionOutputSchema = z.object({
  insights: z.array(ExtractedInsightSchema),
  problems: z.array(z.string()),
  targetUsers: z.array(z.string()),
  features: z.array(z.string()),
  constraints: z.array(z.string()),
  opportunities: z.array(z.string()),
  uncertainties: z.array(z.string()),
  missingInformation: z.array(z.string()),
  overallConfidence: ConfidenceScoreSchema,
});
export async function runBackendExtractionForProject(projectId: string): Promise<void> {
console.log(`[Backend Extractor] Starting extraction for project ${projectId}`);
const adminDb = getAdminDb();
try {
// 1. Load project
const projectDoc = await adminDb.collection('projects').doc(projectId).get();
if (!projectDoc.exists) {
throw new Error(`Project ${projectId} not found`);
}
const projectData = projectDoc.data();
// 2. Load knowledge items
const knowledgeSnapshot = await adminDb
.collection('knowledge_items')
.where('projectId', '==', projectId)
.where('sourceType', '==', 'imported_document')
.get();
if (knowledgeSnapshot.empty) {
console.log(`[Backend Extractor] No documents to extract for project ${projectId} - creating empty handoff`);
// Create a minimal extraction handoff even with no documents
const emptyHandoff: PhaseHandoff = {
phase: 'extraction',
readyForNextPhase: false, // Not ready - no materials to extract from
confidence: 0,
confirmed: {
problems: [],
targetUsers: [],
features: [],
constraints: [],
opportunities: [],
},
uncertain: {},
missing: ['No documents uploaded - need product requirements, specs, or notes'],
questionsForUser: [
'You haven\'t uploaded any documents yet. Do you have any product specs, requirements, or notes to share?',
],
sourceEvidence: [],
version: 'extraction_v1',
timestamp: new Date().toISOString(),
};
await adminDb.collection('projects').doc(projectId).update({
'phaseData.phaseHandoffs.extraction': emptyHandoff,
currentPhase: 'extraction_review',
phaseStatus: 'in_progress',
'phaseData.extractionCompletedAt': new Date().toISOString(),
updatedAt: new Date().toISOString(),
});
console.log(`[Backend Extractor] Set phase to extraction_review with empty handoff`);
return;
}
console.log(`[Backend Extractor] Found ${knowledgeSnapshot.size} documents to process`);
const llm = new GeminiLlmClient();
const allExtractionOutputs: ExtractionOutput[] = [];
const processedKnowledgeItemIds: string[] = [];
// 3. Process each document
for (const knowledgeDoc of knowledgeSnapshot.docs) {
const knowledgeData = knowledgeDoc.data();
const knowledgeItemId = knowledgeDoc.id;
try {
console.log(`[Backend Extractor] Processing document: ${knowledgeData.title || knowledgeItemId}`);
// Call LLM with structured extraction + thinking mode
const extraction = await llm.structuredCall<ExtractionOutput>({
model: 'gemini',
systemPrompt: BACKEND_EXTRACTOR_SYSTEM_PROMPT,
messages: [{
role: 'user',
content: `Document Title: ${knowledgeData.title || 'Untitled'}\nSource Type: ${knowledgeData.sourceType}\n\nContent:\n${knowledgeData.content}`,
}],
schema: ExtractionOutputSchema as any,
temperature: 1.0, // Gemini 3 default (changed from 0.3)
thinking_config: {
thinking_level: 'high', // Enable deep reasoning for document analysis
include_thoughts: false, // Don't include thought tokens in output (saves cost)
},
});
// Add knowledgeItemId to each insight
extraction.insights.forEach(insight => {
insight.sourceKnowledgeItemId = knowledgeItemId;
});
allExtractionOutputs.push(extraction);
processedKnowledgeItemIds.push(knowledgeItemId);
// 4. Persist extraction to chat_extractions
await adminDb.collection('chat_extractions').add({
projectId,
knowledgeItemId,
data: extraction,
overallConfidence: extraction.overallConfidence,
overallCompletion: extraction.overallConfidence > 0.7 ? 0.9 : 0.6,
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
});
console.log(`[Backend Extractor] Extracted ${extraction.insights.length} insights from ${knowledgeData.title || knowledgeItemId}`);
// 5. Write vector chunks for primary insights
const primaryInsights = extraction.insights.filter(i => i.importance === 'primary');
for (const insight of primaryInsights) {
try {
// Create a knowledge chunk for this insight
await writeKnowledgeChunksForItem({
id: knowledgeItemId,
projectId,
content: `${insight.title}\n\n${insight.description}\n\nSource: ${insight.sourceText}`,
sourceMeta: {
sourceType: 'extracted_insight',
importance: 'primary',
},
});
} catch (chunkError) {
console.error(`[Backend Extractor] Failed to write chunk for insight ${insight.id}:`, chunkError);
// Continue processing other insights
}
}
} catch (docError) {
console.error(`[Backend Extractor] Failed to process document ${knowledgeItemId}:`, docError);
// Continue with next document
}
}
// 6. Build extraction PhaseHandoff
// Flatten all extracted items (they're already strings, not objects)
const allProblems = [...new Set(allExtractionOutputs.flatMap(e => e.problems))];
const allUsers = [...new Set(allExtractionOutputs.flatMap(e => e.targetUsers))];
const allFeatures = [...new Set(allExtractionOutputs.flatMap(e => e.features))];
const allConstraints = [...new Set(allExtractionOutputs.flatMap(e => e.constraints))];
const allOpportunities = [...new Set(allExtractionOutputs.flatMap(e => e.opportunities))];
const allUncertainties = [...new Set(allExtractionOutputs.flatMap(e => e.uncertainties))];
const allMissing = [...new Set(allExtractionOutputs.flatMap(e => e.missingInformation))];
const avgConfidence = allExtractionOutputs.length > 0
? allExtractionOutputs.reduce((sum, e) => sum + e.overallConfidence, 0) / allExtractionOutputs.length
: 0;
const readyForNextPhase = allProblems.length > 0 && allFeatures.length > 0 && avgConfidence > 0.5;
const extractionHandoff: PhaseHandoff = {
phase: 'extraction',
readyForNextPhase,
confidence: avgConfidence,
confirmed: {
problems: allProblems,
targetUsers: allUsers,
features: allFeatures,
constraints: allConstraints,
opportunities: allOpportunities,
},
uncertain: {},
missing: allMissing,
questionsForUser: allUncertainties,
sourceEvidence: processedKnowledgeItemIds,
version: 'extraction_v1',
timestamp: new Date().toISOString(),
};
// 7. Persist handoff and update phase
await adminDb.collection('projects').doc(projectId).update({
'phaseData.phaseHandoffs.extraction': extractionHandoff,
currentPhase: 'extraction_review',
phaseStatus: 'in_progress',
'phaseData.extractionCompletedAt': new Date().toISOString(),
updatedAt: new Date().toISOString(),
});
console.log(`[Backend Extractor] ✅ Extraction complete for project ${projectId}`);
console.log(`[Backend Extractor] - Problems: ${allProblems.length}`);
console.log(`[Backend Extractor] - Users: ${allUsers.length}`);
console.log(`[Backend Extractor] - Features: ${allFeatures.length}`);
console.log(`[Backend Extractor] - Confidence: ${(avgConfidence * 100).toFixed(1)}%`);
console.log(`[Backend Extractor] - Ready for next phase: ${readyForNextPhase}`);
} catch (error) {
console.error(`[Backend Extractor] Fatal error during extraction:`, error);
throw error;
}
}