/**
 * Backend Extraction Module
 *
 * Runs extraction as a pure backend job, not in chat.
 * Called when Collector phase completes.
 */
|
|
|
|
import { getAdminDb } from '@/lib/firebase/admin';
|
|
import { GeminiLlmClient } from '@/lib/ai/gemini-client';
|
|
import { BACKEND_EXTRACTOR_SYSTEM_PROMPT } from '@/lib/ai/prompts/extractor';
|
|
import { writeKnowledgeChunksForItem } from '@/lib/server/vector-memory';
|
|
import type { ExtractionOutput, ExtractedInsight } from '@/lib/types/extraction-output';
|
|
import type { PhaseHandoff } from '@/lib/types/phase-handoff';
|
|
import { z } from 'zod';
|
|
|
|
const ExtractionOutputSchema = z.object({
|
|
insights: z.array(z.object({
|
|
id: z.string(),
|
|
type: z.enum(["problem", "user", "feature", "constraint", "opportunity", "other"]),
|
|
title: z.string(),
|
|
description: z.string(),
|
|
sourceText: z.string(),
|
|
sourceKnowledgeItemId: z.string(),
|
|
importance: z.enum(["primary", "supporting"]),
|
|
confidence: z.number().min(0).max(1),
|
|
})),
|
|
problems: z.array(z.string()),
|
|
targetUsers: z.array(z.string()),
|
|
features: z.array(z.string()),
|
|
constraints: z.array(z.string()),
|
|
opportunities: z.array(z.string()),
|
|
uncertainties: z.array(z.string()),
|
|
missingInformation: z.array(z.string()),
|
|
overallConfidence: z.number().min(0).max(1),
|
|
});
|
|
|
|
export async function runBackendExtractionForProject(projectId: string): Promise<void> {
|
|
console.log(`[Backend Extractor] Starting extraction for project ${projectId}`);
|
|
|
|
const adminDb = getAdminDb();
|
|
|
|
try {
|
|
// 1. Load project
|
|
const projectDoc = await adminDb.collection('projects').doc(projectId).get();
|
|
if (!projectDoc.exists) {
|
|
throw new Error(`Project ${projectId} not found`);
|
|
}
|
|
|
|
const projectData = projectDoc.data();
|
|
|
|
// 2. Load knowledge items
|
|
const knowledgeSnapshot = await adminDb
|
|
.collection('knowledge_items')
|
|
.where('projectId', '==', projectId)
|
|
.where('sourceType', '==', 'imported_document')
|
|
.get();
|
|
|
|
if (knowledgeSnapshot.empty) {
|
|
console.log(`[Backend Extractor] No documents to extract for project ${projectId} - creating empty handoff`);
|
|
|
|
// Create a minimal extraction handoff even with no documents
|
|
const emptyHandoff: PhaseHandoff = {
|
|
phase: 'extraction',
|
|
readyForNextPhase: false, // Not ready - no materials to extract from
|
|
confidence: 0,
|
|
confirmed: {
|
|
problems: [],
|
|
targetUsers: [],
|
|
features: [],
|
|
constraints: [],
|
|
opportunities: [],
|
|
},
|
|
uncertain: {},
|
|
missing: ['No documents uploaded - need product requirements, specs, or notes'],
|
|
questionsForUser: [
|
|
'You haven\'t uploaded any documents yet. Do you have any product specs, requirements, or notes to share?',
|
|
],
|
|
sourceEvidence: [],
|
|
version: 'extraction_v1',
|
|
timestamp: new Date().toISOString(),
|
|
};
|
|
|
|
await adminDb.collection('projects').doc(projectId).update({
|
|
'phaseData.phaseHandoffs.extraction': emptyHandoff,
|
|
currentPhase: 'extraction_review',
|
|
phaseStatus: 'in_progress',
|
|
'phaseData.extractionCompletedAt': new Date().toISOString(),
|
|
updatedAt: new Date().toISOString(),
|
|
});
|
|
|
|
console.log(`[Backend Extractor] Set phase to extraction_review with empty handoff`);
|
|
return;
|
|
}
|
|
|
|
console.log(`[Backend Extractor] Found ${knowledgeSnapshot.size} documents to process`);
|
|
|
|
const llm = new GeminiLlmClient();
|
|
const allExtractionOutputs: ExtractionOutput[] = [];
|
|
const processedKnowledgeItemIds: string[] = [];
|
|
|
|
// 3. Process each document
|
|
for (const knowledgeDoc of knowledgeSnapshot.docs) {
|
|
const knowledgeData = knowledgeDoc.data();
|
|
const knowledgeItemId = knowledgeDoc.id;
|
|
|
|
try {
|
|
console.log(`[Backend Extractor] Processing document: ${knowledgeData.title || knowledgeItemId}`);
|
|
|
|
// Call LLM with structured extraction + thinking mode
|
|
const extraction = await llm.structuredCall<ExtractionOutput>({
|
|
model: 'gemini',
|
|
systemPrompt: BACKEND_EXTRACTOR_SYSTEM_PROMPT,
|
|
messages: [{
|
|
role: 'user',
|
|
content: `Document Title: ${knowledgeData.title || 'Untitled'}\nSource Type: ${knowledgeData.sourceType}\n\nContent:\n${knowledgeData.content}`,
|
|
}],
|
|
schema: ExtractionOutputSchema as any,
|
|
temperature: 1.0, // Gemini 3 default (changed from 0.3)
|
|
thinking_config: {
|
|
thinking_level: 'high', // Enable deep reasoning for document analysis
|
|
include_thoughts: false, // Don't include thought tokens in output (saves cost)
|
|
},
|
|
});
|
|
|
|
// Add knowledgeItemId to each insight
|
|
extraction.insights.forEach(insight => {
|
|
insight.sourceKnowledgeItemId = knowledgeItemId;
|
|
});
|
|
|
|
allExtractionOutputs.push(extraction);
|
|
processedKnowledgeItemIds.push(knowledgeItemId);
|
|
|
|
// 4. Persist extraction to chat_extractions
|
|
await adminDb.collection('chat_extractions').add({
|
|
projectId,
|
|
knowledgeItemId,
|
|
data: extraction,
|
|
overallConfidence: extraction.overallConfidence,
|
|
overallCompletion: extraction.overallConfidence > 0.7 ? 0.9 : 0.6,
|
|
createdAt: new Date().toISOString(),
|
|
updatedAt: new Date().toISOString(),
|
|
});
|
|
|
|
console.log(`[Backend Extractor] Extracted ${extraction.insights.length} insights from ${knowledgeData.title || knowledgeItemId}`);
|
|
|
|
// 5. Write vector chunks for primary insights
|
|
const primaryInsights = extraction.insights.filter(i => i.importance === 'primary');
|
|
for (const insight of primaryInsights) {
|
|
try {
|
|
// Create a knowledge chunk for this insight
|
|
await writeKnowledgeChunksForItem({
|
|
id: knowledgeItemId,
|
|
projectId,
|
|
content: `${insight.title}\n\n${insight.description}\n\nSource: ${insight.sourceText}`,
|
|
sourceMeta: {
|
|
sourceType: 'extracted_insight',
|
|
importance: 'primary',
|
|
},
|
|
});
|
|
} catch (chunkError) {
|
|
console.error(`[Backend Extractor] Failed to write chunk for insight ${insight.id}:`, chunkError);
|
|
// Continue processing other insights
|
|
}
|
|
}
|
|
|
|
} catch (docError) {
|
|
console.error(`[Backend Extractor] Failed to process document ${knowledgeItemId}:`, docError);
|
|
// Continue with next document
|
|
}
|
|
}
|
|
|
|
// 6. Build extraction PhaseHandoff
|
|
// Flatten all extracted items (they're already strings, not objects)
|
|
const allProblems = [...new Set(allExtractionOutputs.flatMap(e => e.problems))];
|
|
const allUsers = [...new Set(allExtractionOutputs.flatMap(e => e.targetUsers))];
|
|
const allFeatures = [...new Set(allExtractionOutputs.flatMap(e => e.features))];
|
|
const allConstraints = [...new Set(allExtractionOutputs.flatMap(e => e.constraints))];
|
|
const allOpportunities = [...new Set(allExtractionOutputs.flatMap(e => e.opportunities))];
|
|
const allUncertainties = [...new Set(allExtractionOutputs.flatMap(e => e.uncertainties))];
|
|
const allMissing = [...new Set(allExtractionOutputs.flatMap(e => e.missingInformation))];
|
|
|
|
const avgConfidence = allExtractionOutputs.length > 0
|
|
? allExtractionOutputs.reduce((sum, e) => sum + e.overallConfidence, 0) / allExtractionOutputs.length
|
|
: 0;
|
|
|
|
const readyForNextPhase = allProblems.length > 0 && allFeatures.length > 0 && avgConfidence > 0.5;
|
|
|
|
const extractionHandoff: PhaseHandoff = {
|
|
phase: 'extraction',
|
|
readyForNextPhase,
|
|
confidence: avgConfidence,
|
|
confirmed: {
|
|
problems: allProblems,
|
|
targetUsers: allUsers,
|
|
features: allFeatures,
|
|
constraints: allConstraints,
|
|
opportunities: allOpportunities,
|
|
},
|
|
uncertain: {},
|
|
missing: allMissing,
|
|
questionsForUser: allUncertainties,
|
|
sourceEvidence: processedKnowledgeItemIds,
|
|
version: 'extraction_v1',
|
|
timestamp: new Date().toISOString(),
|
|
};
|
|
|
|
// 7. Persist handoff and update phase
|
|
await adminDb.collection('projects').doc(projectId).update({
|
|
'phaseData.phaseHandoffs.extraction': extractionHandoff,
|
|
currentPhase: 'extraction_review',
|
|
phaseStatus: 'in_progress',
|
|
'phaseData.extractionCompletedAt': new Date().toISOString(),
|
|
updatedAt: new Date().toISOString(),
|
|
});
|
|
|
|
console.log(`[Backend Extractor] ✅ Extraction complete for project ${projectId}`);
|
|
console.log(`[Backend Extractor] - Problems: ${allProblems.length}`);
|
|
console.log(`[Backend Extractor] - Users: ${allUsers.length}`);
|
|
console.log(`[Backend Extractor] - Features: ${allFeatures.length}`);
|
|
console.log(`[Backend Extractor] - Confidence: ${(avgConfidence * 100).toFixed(1)}%`);
|
|
console.log(`[Backend Extractor] - Ready for next phase: ${readyForNextPhase}`);
|
|
|
|
} catch (error) {
|
|
console.error(`[Backend Extractor] Fatal error during extraction:`, error);
|
|
throw error;
|
|
}
|
|
}
|
|
|