/**
 * Backend Extraction Module
 *
 * Runs extraction as a pure backend job, not in chat.
 * Called when Collector phase completes.
 */

import { getAdminDb } from '@/lib/firebase/admin';
import { GeminiLlmClient } from '@/lib/ai/gemini-client';
import { BACKEND_EXTRACTOR_SYSTEM_PROMPT } from '@/lib/ai/prompts/extractor';
import { writeKnowledgeChunksForItem } from '@/lib/server/vector-memory';
import type { ExtractionOutput, ExtractedInsight } from '@/lib/types/extraction-output';
import type { PhaseHandoff } from '@/lib/types/phase-handoff';
import { z } from 'zod';

/**
 * Shape the LLM is asked to return for a single document: a list of
 * insights plus flat string summaries and an overall confidence in [0, 1].
 */
const ExtractionOutputSchema = z.object({
  insights: z.array(
    z.object({
      id: z.string(),
      type: z.enum(["problem", "user", "feature", "constraint", "opportunity", "other"]),
      title: z.string(),
      description: z.string(),
      sourceText: z.string(),
      sourceKnowledgeItemId: z.string(),
      importance: z.enum(["primary", "supporting"]),
      confidence: z.number().min(0).max(1),
    }),
  ),
  problems: z.array(z.string()),
  targetUsers: z.array(z.string()),
  features: z.array(z.string()),
  constraints: z.array(z.string()),
  opportunities: z.array(z.string()),
  uncertainties: z.array(z.string()),
  missingInformation: z.array(z.string()),
  overallConfidence: z.number().min(0).max(1),
});

/**
 * Stores the extraction handoff on the project document and moves the
 * project into the `extraction_review` phase.
 *
 * A single timestamp is used for both `extractionCompletedAt` and
 * `updatedAt` so the two fields always agree.
 */
async function persistExtractionHandoff(
  adminDb: ReturnType<typeof getAdminDb>,
  projectId: string,
  handoff: PhaseHandoff,
): Promise<void> {
  const now = new Date().toISOString();
  await adminDb.collection('projects').doc(projectId).update({
    'phaseData.phaseHandoffs.extraction': handoff,
    currentPhase: 'extraction_review',
    phaseStatus: 'in_progress',
    'phaseData.extractionCompletedAt': now,
    updatedAt: now,
  });
}

/**
 * Runs LLM extraction over every imported document of a project and
 * persists the aggregated result as the project's extraction handoff.
 *
 * Steps:
 *  1. Verify the project exists.
 *  2. Load `knowledge_items` with `sourceType == 'imported_document'`.
 *     If there are none, an empty (not-ready) handoff is persisted instead.
 *  3. For each document: call the LLM, store the raw extraction in
 *     `chat_extractions`, and write vector chunks for primary insights.
 *     A failure on one document is logged and does not stop the others.
 *  4. Merge all per-document outputs (de-duplicated) into a `PhaseHandoff`
 *     and persist it, switching the project to `extraction_review`.
 *
 * @param projectId - Firestore id of the project to extract.
 * @returns Resolves once the handoff has been persisted.
 * @throws Error if the project does not exist; any error raised outside the
 *   per-document processing is logged and rethrown.
 */
export async function runBackendExtractionForProject(projectId: string): Promise<void> {
  console.log(`[Backend Extractor] Starting extraction for project ${projectId}`);

  const adminDb = getAdminDb();

  try {
    // 1. Load project (only existence is checked; its data is not needed here)
    const projectDoc = await adminDb.collection('projects').doc(projectId).get();
    if (!projectDoc.exists) {
      throw new Error(`Project ${projectId} not found`);
    }

    // 2. Load knowledge items
    const knowledgeSnapshot = await adminDb
      .collection('knowledge_items')
      .where('projectId', '==', projectId)
      .where('sourceType', '==', 'imported_document')
      .get();

    if (knowledgeSnapshot.empty) {
      console.log(`[Backend Extractor] No documents to extract for project ${projectId} - creating empty handoff`);

      // Create a minimal extraction handoff even with no documents
      const emptyHandoff: PhaseHandoff = {
        phase: 'extraction',
        readyForNextPhase: false, // Not ready - no materials to extract from
        confidence: 0,
        confirmed: {
          problems: [],
          targetUsers: [],
          features: [],
          constraints: [],
          opportunities: [],
        },
        uncertain: {},
        missing: ['No documents uploaded - need product requirements, specs, or notes'],
        questionsForUser: [
          'You haven\'t uploaded any documents yet. Do you have any product specs, requirements, or notes to share?',
        ],
        sourceEvidence: [],
        version: 'extraction_v1',
        timestamp: new Date().toISOString(),
      };

      await persistExtractionHandoff(adminDb, projectId, emptyHandoff);

      console.log(`[Backend Extractor] Set phase to extraction_review with empty handoff`);
      return;
    }

    console.log(`[Backend Extractor] Found ${knowledgeSnapshot.size} documents to process`);

    const llm = new GeminiLlmClient();
    const allExtractionOutputs: ExtractionOutput[] = [];
    const processedKnowledgeItemIds: string[] = [];

    // 3. Process each document (sequentially; one failure does not abort the rest)
    for (const knowledgeDoc of knowledgeSnapshot.docs) {
      const knowledgeData = knowledgeDoc.data();
      const knowledgeItemId = knowledgeDoc.id;
      const documentLabel = knowledgeData.title || knowledgeItemId;

      try {
        console.log(`[Backend Extractor] Processing document: ${documentLabel}`);

        // Call LLM with structured extraction + thinking mode
        const extraction = await llm.structuredCall({
          model: 'gemini',
          systemPrompt: BACKEND_EXTRACTOR_SYSTEM_PROMPT,
          messages: [{
            role: 'user',
            content: `Document Title: ${knowledgeData.title || 'Untitled'}\nSource Type: ${knowledgeData.sourceType}\n\nContent:\n${knowledgeData.content}`,
          }],
          schema: ExtractionOutputSchema as any,
          temperature: 1.0, // Gemini 3 default (changed from 0.3)
          thinking_config: {
            thinking_level: 'high', // Enable deep reasoning for document analysis
            include_thoughts: false, // Don't include thought tokens in output (saves cost)
          },
        });

        // Overwrite whatever id the model produced with the real source document id
        extraction.insights.forEach(insight => {
          insight.sourceKnowledgeItemId = knowledgeItemId;
        });

        allExtractionOutputs.push(extraction);
        processedKnowledgeItemIds.push(knowledgeItemId);

        // 4. Persist extraction to chat_extractions
        const persistedAt = new Date().toISOString();
        await adminDb.collection('chat_extractions').add({
          projectId,
          knowledgeItemId,
          data: extraction,
          overallConfidence: extraction.overallConfidence,
          overallCompletion: extraction.overallConfidence > 0.7 ? 0.9 : 0.6,
          createdAt: persistedAt,
          updatedAt: persistedAt,
        });

        console.log(`[Backend Extractor] Extracted ${extraction.insights.length} insights from ${documentLabel}`);

        // 5. Write vector chunks for primary insights
        // NOTE(review): every insight is written under the same `id`
        // (the knowledge item id). Whether later writes replace earlier
        // chunks depends on writeKnowledgeChunksForItem - confirm.
        const primaryInsights = extraction.insights.filter(i => i.importance === 'primary');
        for (const insight of primaryInsights) {
          try {
            // Create a knowledge chunk for this insight
            await writeKnowledgeChunksForItem({
              id: knowledgeItemId,
              projectId,
              content: `${insight.title}\n\n${insight.description}\n\nSource: ${insight.sourceText}`,
              sourceMeta: {
                sourceType: 'extracted_insight',
                importance: 'primary',
              },
            });
          } catch (chunkError) {
            console.error(`[Backend Extractor] Failed to write chunk for insight ${insight.id}:`, chunkError);
            // Continue processing other insights
          }
        }
      } catch (docError) {
        console.error(`[Backend Extractor] Failed to process document ${knowledgeItemId}:`, docError);
        // Continue with next document
      }
    }

    if (allExtractionOutputs.length === 0) {
      // Every document failed; the handoff below will be empty and not ready.
      console.warn(`[Backend Extractor] All ${knowledgeSnapshot.size} documents failed extraction for project ${projectId}`);
    }

    // 6. Build extraction PhaseHandoff
    // Flatten one field across all outputs and drop duplicates
    // (the items are already strings, not objects).
    const collectUnique = <T>(pick: (output: ExtractionOutput) => readonly T[]): T[] => [
      ...new Set(allExtractionOutputs.flatMap(pick)),
    ];

    const allProblems = collectUnique(e => e.problems);
    const allUsers = collectUnique(e => e.targetUsers);
    const allFeatures = collectUnique(e => e.features);
    const allConstraints = collectUnique(e => e.constraints);
    const allOpportunities = collectUnique(e => e.opportunities);
    const allUncertainties = collectUnique(e => e.uncertainties);
    const allMissing = collectUnique(e => e.missingInformation);

    // Mean of per-document confidence; 0 when nothing was extracted
    const avgConfidence = allExtractionOutputs.length > 0
      ? allExtractionOutputs.reduce((sum, e) => sum + e.overallConfidence, 0) / allExtractionOutputs.length
      : 0;

    const readyForNextPhase = allProblems.length > 0 && allFeatures.length > 0 && avgConfidence > 0.5;

    const extractionHandoff: PhaseHandoff = {
      phase: 'extraction',
      readyForNextPhase,
      confidence: avgConfidence,
      confirmed: {
        problems: allProblems,
        targetUsers: allUsers,
        features: allFeatures,
        constraints: allConstraints,
        opportunities: allOpportunities,
      },
      uncertain: {},
      missing: allMissing,
      questionsForUser: allUncertainties,
      sourceEvidence: processedKnowledgeItemIds,
      version: 'extraction_v1',
      timestamp: new Date().toISOString(),
    };

    // 7. Persist handoff and update phase
    await persistExtractionHandoff(adminDb, projectId, extractionHandoff);

    console.log(`[Backend Extractor] ✅ Extraction complete for project ${projectId}`);
    console.log(`[Backend Extractor] - Problems: ${allProblems.length}`);
    console.log(`[Backend Extractor] - Users: ${allUsers.length}`);
    console.log(`[Backend Extractor] - Features: ${allFeatures.length}`);
    console.log(`[Backend Extractor] - Confidence: ${(avgConfidence * 100).toFixed(1)}%`);
    console.log(`[Backend Extractor] - Ready for next phase: ${readyForNextPhase}`);
  } catch (error) {
    console.error(`[Backend Extractor] Fatal error during extraction:`, error);
    throw error;
  }
}