70 lines
1.9 KiB
TypeScript
70 lines
1.9 KiB
TypeScript
/**
 * Upper bound on the length of the text extract kept by `normalizeSeedDocument`.
 * Applied with `String.prototype.slice`, so it counts UTF-16 code units.
 */
const MAX_TEXT_STORE = 120_000;
|
|
/**
 * Largest decoded PDF payload, in bytes (5 * 1024 * 1024), that
 * `normalizeSeedDocument` will hand to the PDF parser; larger payloads are
 * rejected with an error.
 */
const MAX_PDF_BYTES = 5 * 1024 * 1024;
|
|
|
|
/**
 * Request body the Build wizard client (`SeedDocumentUpload`) sends when a
 * seed document is uploaded.
 *
 * Which content field is read depends on `kind`: `normalizeSeedDocument`
 * reads `text` for markdown and `base64` for PDFs.
 */
export type SeedDocumentRequestBody = {
  /** Name of the uploaded file as reported by the client. */
  fileName: string;

  /** Document format; selects which of `text` / `base64` carries the content. */
  kind: "markdown" | "pdf";

  /** Raw markdown source. Read when `kind` is `"markdown"`. */
  text?: string;

  /** Base64-encoded file bytes. Read when `kind` is `"pdf"`. */
  base64?: string;
};
|
|
|
|
/**
 * Shape stored on the project at `kickoff.sourceData.seedDocument`.
 *
 * Only extracted text is kept; the original file bytes are not part of this
 * record.
 */
export type SeedDocumentPersisted = {
  /** File name associated with the stored extract. */
  fileName: string;

  /** Format of the document the text was taken from. */
  kind: "markdown" | "pdf";

  /** Plain text taken from the document. */
  textExtract: string;
};
|
|
|
|
/**
 * Validates an untrusted seed-document payload and reduces it to the
 * text-only record that gets persisted.
 *
 * - `fileName` is trimmed and capped at 240 characters; a missing or blank
 *   name becomes `"document"`.
 * - Markdown: `text` is trimmed and capped at `MAX_TEXT_STORE` characters.
 * - PDF: `base64` has all whitespace stripped, is decoded, size-checked
 *   against `MAX_PDF_BYTES`, and parsed with `pdf-parse`. The extracted text
 *   is trimmed and capped the same way; a PDF that yields no text is stored
 *   with the placeholder `"[No extractable text in PDF]"`.
 *
 * @param raw - Unvalidated request body.
 * @returns The normalized document, or `null` when the payload is unusable:
 *   not a plain object, unknown `kind`, blank markdown, or a PDF whose
 *   `base64` is missing or decodes to zero bytes.
 * @throws Error with message `"PDF exceeds 5MB limit"` when the decoded PDF is
 *   larger than `MAX_PDF_BYTES`. Errors raised while loading or running
 *   `pdf-parse` are not caught here and propagate to the caller.
 */
export async function normalizeSeedDocument(
  raw: unknown,
): Promise<SeedDocumentPersisted | null> {
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) return null;

  const o = raw as Record<string, unknown>;
  const fileNameRaw = typeof o.fileName === "string" ? o.fileName.trim() : "";
  const fileName = fileNameRaw.length > 0 ? fileNameRaw.slice(0, 240) : "document";

  const kind =
    o.kind === "pdf" ? "pdf" : o.kind === "markdown" ? "markdown" : null;
  if (!kind) return null;

  if (kind === "markdown") {
    const text = typeof o.text === "string" ? o.text : "";
    const trimmed = text.trim();
    if (!trimmed) return null;
    return {
      fileName,
      kind: "markdown",
      textExtract: trimmed.slice(0, MAX_TEXT_STORE),
    };
  }

  const b64 = typeof o.base64 === "string" ? o.base64.replace(/\s/g, "") : "";
  if (!b64) return null;

  let buf: Buffer;
  try {
    buf = Buffer.from(b64, "base64");
  } catch {
    return null;
  }

  // Node's base64 decoder does not throw on invalid characters, so the catch
  // above does not reject malformed input. A payload with no decodable data
  // comes back as an empty buffer; reject it here rather than passing zero
  // bytes to the PDF parser.
  if (buf.length === 0) return null;

  if (buf.length > MAX_PDF_BYTES) {
    throw new Error("PDF exceeds 5MB limit");
  }

  // Loaded lazily; accept both a default export and a module that is itself
  // the parse function.
  const pdfParseMod = await import("pdf-parse");
  const pdfParse = pdfParseMod.default ?? pdfParseMod;
  const parsed = await pdfParse(buf);
  const extracted = typeof parsed?.text === "string" ? parsed.text.trim() : "";

  return {
    fileName,
    kind: "pdf",
    textExtract:
      extracted.length > 0
        ? extracted.slice(0, MAX_TEXT_STORE)
        : "[No extractable text in PDF]",
  };
}
|