/**
 * Project brief extraction.
 * Closes BETA_LAUNCH_PLAN P3.7.
 *
 * When a user uploads a PDF / .md / .docx / .txt brief file, we extract
 * the text here and store it on `fs_projects.data.plan.brief`. The
 * `buildSystemPrompt` function in `app/api/chat/route.ts` then surfaces
 * it in the [PROJECT BRIEF] block.
 *
 * Supports:
 *   - .txt / .md    — read as-is
 *   - .pdf          — extract text via pdf-parse (pdf.js-based; no binary install required)
 *   - .docx         — unzip with unzipper, parse word/document.xml with @xmldom/xmldom
 *   - .html / .htm  — strip tags
 *
 * 5 MB max, 50 000 chars after extraction (truncated with a note).
 */
import { query } from "@/lib/db-postgres";
import { log } from "@/lib/server/logger";

/** Maximum number of UTF-16 code units of extracted text kept before the truncation note. */
export const BRIEF_MAX_CHARS = 50_000;
/** Maximum accepted upload size in bytes (5 MB). */
export const BRIEF_MAX_BYTES = 5 * 1024 * 1024;

/**
 * Outcome of {@link extractBriefText}. On success, `chars` is the length of
 * the returned `text` (which includes the truncation note when `truncated`).
 */
export type BriefExtractionResult =
  | { ok: true; text: string; truncated: boolean; chars: number }
  | { ok: false; error: string };

const PDF_MIME = "application/pdf";
const DOCX_MIME =
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
const HTML_MIME = "text/html";

/** DOM node-type codes used while walking the parsed docx XML. */
const TEXT_NODE = 3;
const CDATA_SECTION_NODE = 4;

/**
 * Minimal structural view of the DOM nodes this module touches. Declared
 * locally so the docx walker relies only on sibling/child pointers and not
 * on any particular NodeList implementation.
 */
interface XmlNode {
  nodeType: number;
  nodeName: string;
  nodeValue: string | null;
  firstChild: XmlNode | null;
  nextSibling: XmlNode | null;
}

/**
 * Extract plain text from an uploaded brief file.
 * Call from `POST /api/projects/[projectId]/documents/upload`.
 *
 * The format is chosen from the filename extension or the MIME type
 * (parameters such as `; charset=utf-8` and letter case are ignored);
 * anything that is not PDF, DOCX or HTML is decoded as UTF-8 text.
 *
 * @param buffer   Raw file bytes.
 * @param mimeType MIME type reported for the upload.
 * @param filename Original filename; only its extension is inspected.
 * @returns `{ ok: true, ... }` with the trimmed (and possibly truncated) text,
 *          or `{ ok: false, error }` when the file is too large or extraction
 *          throws. This function does not throw for extraction failures.
 */
export async function extractBriefText(
  buffer: Buffer,
  mimeType: string,
  filename: string,
): Promise<BriefExtractionResult> {
  if (buffer.byteLength > BRIEF_MAX_BYTES) {
    return {
      ok: false,
      error: `File is too large (max ${BRIEF_MAX_BYTES / (1024 * 1024)} MB)`,
    };
  }

  try {
    const lower = filename.toLowerCase();
    const mime = normalizeMimeType(mimeType);
    let text: string;

    if (lower.endsWith(".pdf") || mime === PDF_MIME) {
      text = await extractPdf(buffer);
    } else if (lower.endsWith(".docx") || mime === DOCX_MIME) {
      text = await extractDocx(buffer);
    } else if (
      lower.endsWith(".html") ||
      lower.endsWith(".htm") ||
      mime === HTML_MIME
    ) {
      text = stripHtml(buffer.toString("utf8"));
    } else {
      // .txt, .md, plain text
      text = buffer.toString("utf8");
    }

    // PostgreSQL text/jsonb values cannot contain U+0000, and this text is
    // destined for `fs_projects.data`; drop NULs so the later persist succeeds.
    text = text.split("\u0000").join("").trim();

    const truncated = text.length > BRIEF_MAX_CHARS;
    if (truncated) {
      text =
        sliceWithoutSplittingSurrogate(text, BRIEF_MAX_CHARS) +
        `\n\n[Brief truncated at ${BRIEF_MAX_CHARS} chars — upload a shorter document for full coverage]`;
    }

    return { ok: true, text, truncated, chars: text.length };
  } catch (err) {
    return {
      ok: false,
      error: `Extraction failed: ${err instanceof Error ? err.message : String(err)}`,
    };
  }
}

/** Lower-cases a MIME type and drops any `;`-separated parameters. */
function normalizeMimeType(mimeType: string): string {
  // Tolerate a missing value at runtime: callers may pass through an absent header.
  if (typeof mimeType !== "string") return "";
  const [essence = ""] = mimeType.split(";", 1);
  return essence.trim().toLowerCase();
}

/**
 * Reduce an HTML document to whitespace-normalised text. Comments and the
 * bodies of `<script>` / `<style>` elements are removed first so that code
 * and CSS do not end up in the brief; remaining tags are replaced by spaces.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ")
    .replace(/<[^>]+>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Return the first `maxUnits` UTF-16 code units of `text`, backing off by one
 * when the cut would separate a high surrogate from its low surrogate.
 */
function sliceWithoutSplittingSurrogate(text: string, maxUnits: number): string {
  const lastKept = text.charCodeAt(maxUnits - 1);
  const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
  return text.slice(0, endsOnHighSurrogate ? maxUnits - 1 : maxUnits);
}

/** True when `e` looks like a module-resolution failure for an optional dependency. */
function isMissingModuleError(e: unknown): boolean {
  if (!(e instanceof Error)) return false;
  const code: unknown = Reflect.get(e, "code");
  return (
    code === "MODULE_NOT_FOUND" ||
    code === "ERR_MODULE_NOT_FOUND" ||
    e.message.includes("Cannot find module")
  );
}

/**
 * Extract the text of a PDF using the optional `pdf-parse` package.
 *
 * @throws Error with an install hint when `pdf-parse` cannot be resolved;
 *         any other error from loading or parsing is rethrown unchanged.
 */
async function extractPdf(buffer: Buffer): Promise<string> {
  // Loaded lazily with `require` — pdf-parse is a large optional dep, and a
  // static import would make the whole module fail to load when it is absent.
  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const pdfParse = require("pdf-parse") as (
      b: Buffer,
    ) => Promise<{ text: string }>;
    const result = await pdfParse(buffer);
    return result.text;
  } catch (e: unknown) {
    if (isMissingModuleError(e)) {
      throw new Error(
        "pdf-parse package not installed. Run `npm install pdf-parse` or upload a .txt/.md file instead.",
      );
    }
    throw e;
  }
}

/**
 * Extract the body text of a .docx file.
 *
 * Opens the archive with `unzipper`, parses `word/document.xml` with
 * `@xmldom/xmldom`, and collects the text of `w:t` elements. Paragraphs
 * (`w:p`) end with a newline; `w:tab` inside a run becomes a tab and
 * `w:br` / `w:cr` become newlines. Text in consecutive runs is concatenated
 * verbatim so words split across runs are not broken apart.
 *
 * @throws Error with an install hint when either package cannot be resolved,
 *         or when the archive has no `word/document.xml`.
 */
async function extractDocx(buffer: Buffer): Promise<string> {
  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { DOMParser } = require("@xmldom/xmldom") as {
      DOMParser: new () => {
        parseFromString(xml: string, type: string): XmlNode | undefined;
      };
    };
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const unzipper = require("unzipper") as {
      Open: {
        buffer(b: Buffer): Promise<{
          files: Array<{ path: string; buffer(): Promise<Buffer> }>;
        }>;
      };
    };

    const directory = await unzipper.Open.buffer(buffer);
    const wordDoc = directory.files.find((f) => f.path === "word/document.xml");
    if (!wordDoc) throw new Error("word/document.xml not found in docx");

    // NOTE(review): the entry is inflated fully into memory; the 5 MB cap in
    // extractBriefText applies to the compressed upload only — confirm whether
    // an uncompressed-size guard is wanted here.
    const xml = (await wordDoc.buffer()).toString("utf8");
    const doc = new DOMParser().parseFromString(xml, "text/xml");
    if (!doc) throw new Error("word/document.xml could not be parsed");

    const pieces: string[] = [];
    collectDocxText(doc, pieces);

    // Empty paragraphs would otherwise pile up as long runs of blank lines.
    return pieces.join("").replace(/\n{3,}/g, "\n\n");
  } catch (e: unknown) {
    if (isMissingModuleError(e)) {
      throw new Error(
        "unzipper or @xmldom/xmldom not installed. Upload a .txt or .md file instead.",
      );
    }
    throw e;
  }
}

/**
 * Depth-first walk over a WordprocessingML tree, appending text fragments to
 * `out`. Children are visited through `firstChild` / `nextSibling`, so the
 * walk does not depend on `NodeList.forEach` being available.
 * Element names are matched with the conventional `w:` prefix.
 */
function collectDocxText(node: XmlNode, out: string[]): void {
  for (let child = node.firstChild; child; child = child.nextSibling) {
    switch (child.nodeName) {
      case "w:t":
        out.push(directTextOf(child));
        break;
      case "w:tab":
        // `w:tab` also names tab-stop definitions under paragraph properties;
        // only a tab that sits directly in a run is an actual tab character.
        if (node.nodeName === "w:r") out.push("\t");
        break;
      case "w:br":
      case "w:cr":
        out.push("\n");
        break;
      case "w:p":
        collectDocxText(child, out);
        out.push("\n");
        break;
      default:
        collectDocxText(child, out);
    }
  }
}

/** Concatenates the text and CDATA children of `node`, without trimming. */
function directTextOf(node: XmlNode): string {
  let text = "";
  for (let child = node.firstChild; child; child = child.nextSibling) {
    if (child.nodeType === TEXT_NODE || child.nodeType === CDATA_SECTION_NODE) {
      text += child.nodeValue ?? "";
    }
  }
  return text;
}

/**
 * Persist the extracted brief text to `fs_projects.data.plan.brief`.
 * Called by the upload route after extraction succeeds.
 *
 * Merges `brief` and `briefMeta` (the given `meta` plus an `uploadedAt` ISO
 * timestamp) into the existing `plan` object, creating `plan` when absent.
 *
 * @param projectId Value matched against `fs_projects.id`.
 * @param text      Extracted brief text.
 * @param meta      Upload metadata stored alongside the text.
 * @throws Rethrows any error from `query` after logging it.
 */
export async function persistProjectBrief(
  projectId: string,
  text: string,
  meta: { filename: string; chars: number; truncated: boolean },
): Promise<void> {
  try {
    // NOTE(review): an UPDATE that matches no row is not detected here, so the
    // success log below is emitted regardless — confirm whether `query`
    // exposes a row count that should be checked.
    await query(
      `UPDATE fs_projects
       SET data = jsonb_set(
         data,
         '{plan}',
         COALESCE(data->'plan', '{}'::jsonb) || jsonb_build_object(
           'brief', $1::text,
           'briefMeta', $2::jsonb
         ),
         true
       )
       WHERE id = $3`,
      [
        text,
        JSON.stringify({
          ...meta,
          uploadedAt: new Date().toISOString(),
        }),
        projectId,
      ],
    );
    log.info("project brief persisted", { projectId, chars: meta.chars });
  } catch (err) {
    log.error("brief persist failed", {
      projectId,
      err: err instanceof Error ? err.message : String(err),
    });
    throw err;
  }
}