Skip to content

Commit faeb3e9

Browse files
committed
feat: md better
1 parent b321769 commit faeb3e9

3 files changed

Lines changed: 251 additions & 11 deletions

File tree

app/api/pdf/markdown/route.ts

Lines changed: 248 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
import { NextResponse, type NextRequest } from "next/server";
1+
import { after, NextResponse, type NextRequest } from "next/server";
22
import { Output, streamText } from "ai";
33
import { z } from "zod";
44
import { auth } from "@/app/auth";
@@ -10,8 +10,14 @@ import {
1010
getPdfMarkdownModel,
1111
} from "@/lib/ai/pdf-markdown";
1212
import type { PdfPaperQuestion } from "@/lib/ai/pdf-markdown";
13+
import {
14+
capturePostHogAiGeneration,
15+
createAiTextMessage,
16+
} from "@/lib/posthog/llm";
1317

1418
// 24 MiB expressed in bytes.
// NOTE(review): presumably the upper bound on the fetched PDF size; the code that enforces it is outside the visible diff — confirm.
const MAX_PDF_MARKDOWN_BYTES = 24 * 1024 * 1024;
19+
// Output-token budget: passed to streamText as maxOutputTokens and to capturePostHogAiGeneration as maxTokens.
const PDF_MARKDOWN_MAX_OUTPUT_TOKENS = 12000;
20+
// Maximum string length (as measured by String.length) that truncateForPostHogAiText keeps.
const POSTHOG_AI_TEXT_LIMIT = 8000;
1521
const PDF_MARKDOWN_SYSTEM_PROMPT = [
1622
"You are a careful transcription engine for ExamCooker question-paper PDFs.",
1723
"Extract only the exam questions. Ignore cover-page metadata, institution/course details, course code, course name, slot, registration fields, faculty names, course outcomes, page separators, general instructions, CO columns, and Bloom taxonomy columns.",
@@ -33,13 +39,47 @@ const PDF_MARKDOWN_SYSTEM_PROMPT = [
3339
/**
 * Zod schema for the conversion request payload.
 *
 * - fileName: trimmed, 1 to 240 characters.
 * - fileUrl: trimmed, must be a valid URL.
 * - posthogSessionId: trimmed, 1 to 200 characters; may also be null or omitted.
 *
 * NOTE(review): presumably this is what POST parses into `parsedBody`; the
 * parse call itself is outside the visible diff — confirm.
 */
const PdfMarkdownRequestSchema = z.object({
3440
fileName: z.string().trim().min(1).max(240),
3541
fileUrl: z.string().trim().url(),
42+
// The PDF viewer client fills this from getPostHogSessionId(); both null and a missing key are accepted.
posthogSessionId: z.string().trim().min(1).max(200).nullable().optional(),
3643
});
3744

3845
// An origin paired with a path prefix.
// NOTE(review): the name suggests an allow-list entry for PDF URLs, but every consumer of this type is outside the visible diff — confirm before relying on that.
type AllowedPdfSource = {
3946
origin: string;
4047
pathPrefix: string;
4148
};
4249

50+
// Token counts read from the awaited `usage` value of a capture event.
// Both fields are optional; schedulePdfMarkdownAiCapture forwards them as inputTokens / outputTokens.
type AiUsageSummary = {
51+
inputTokens?: number;
52+
outputTokens?: number;
53+
};
54+
55+
// The single field read from the awaited `response` value of a capture event.
// When `id` is present, schedulePdfMarkdownAiCapture uses it as the span id instead of the locally generated one.
type AiResponseSummary = {
56+
id?: string;
57+
};
58+
59+
/**
 * Everything schedulePdfMarkdownAiCapture needs to report one conversion
 * attempt to PostHog. POST resolves exactly one of these per request, from the
 * success path, the error path, or the stream's cancel() handler.
 *
 * Field notes, as populated by the visible POST code:
 * - distinctId: the session user's id, falling back to their email.
 * - fileBytes: byteLength of the fetched PDF buffer.
 * - httpStatus: 200 on success, 499 when cancelled or the request signal is
 *   aborted, 500 for other stream failures.
 * - latencySeconds / timeToFirstQuestionSeconds: elapsed time since the LLM
 *   call started, in seconds; the latter is undefined when no question was
 *   streamed.
 * - finishReason / response / usage: promise-likes taken from the streamText
 *   result; only the success path sets them.
 * - markdown / questionCount: only the success path sets them.
 * - sessionId: the request's posthogSessionId, with null mapped to undefined.
 * - outputWasTruncated: NOTE(review) never assigned in the visible diff; when
 *   undefined the reporter falls back to its own truncation flag.
 */
type PdfMarkdownAiCaptureEvent = {
60+
distinctId: string;
61+
error?: unknown;
62+
fileBytes: number;
63+
fileName: string;
64+
fileUrl: string;
65+
finishReason?: PromiseLike<string | undefined> | null;
66+
httpStatus: number;
67+
isError: boolean;
68+
latencySeconds: number;
69+
markdown?: string;
70+
modelId: string;
71+
outputWasTruncated?: boolean;
72+
provider: string;
73+
questionCount?: number;
74+
response?: PromiseLike<AiResponseSummary> | null;
75+
sessionId?: string;
76+
spanId: string;
77+
timeToFirstQuestionSeconds?: number;
78+
traceId: string;
79+
usage?: PromiseLike<AiUsageSummary> | null;
80+
userPrompt: string;
81+
};
82+
4383
function getAzureBaseUrlFromEnv() {
4484
const explicitBaseUrl = process.env.AZURE_BLOB_PUBLIC_BASE_URL?.trim();
4585
if (explicitBaseUrl) {
@@ -203,6 +243,107 @@ function getStreamErrorMessage(error: unknown, streamError: unknown) {
203243
return fallbackMessage || "Failed to convert this PDF to Markdown.";
204244
}
205245

246+
/**
 * Derives a provider label from a model id written as "provider/model".
 *
 * @param modelId - Model identifier, with or without a "provider/" prefix.
 * @returns The text before the first "/" when the id contains a slash and that
 *   prefix is non-empty; otherwise "openai" (no slash, leading slash, or empty id).
 */
function getAiProviderFromModel(modelId: string) {
247+
const [provider] = modelId.split("/");
248+
// With no "/" in the id, split returns the whole id as its only element, so equality means "no prefix".
return provider && provider !== modelId ? provider : "openai";
249+
}
250+
251+
/**
 * Caps a string at POSTHOG_AI_TEXT_LIMIT and reports whether anything was cut.
 *
 * @param text - Text destined for a PostHog AI event.
 * @returns `{ text, wasTruncated }`: the input unchanged with `false` when it
 *   already fits, otherwise its first POSTHOG_AI_TEXT_LIMIT units with `true`.
 *
 * NOTE(review): `length` and `slice` count UTF-16 code units, so a cut that
 * lands inside a surrogate pair leaves a lone surrogate at the end.
 */
function truncateForPostHogAiText(text: string) {
252+
if (text.length <= POSTHOG_AI_TEXT_LIMIT) {
253+
return {
254+
text,
255+
wasTruncated: false,
256+
};
257+
}
258+
259+
return {
260+
text: text.slice(0, POSTHOG_AI_TEXT_LIMIT),
261+
wasTruncated: true,
262+
};
263+
}
264+
265+
/**
 * Awaits an optional promise-like without ever throwing.
 *
 * @param promise - The value to await; may be null or undefined.
 * @returns The resolved value, or null when no promise was supplied or when it
 *   rejects. The rejection reason is discarded.
 */
async function safeAwait<T>(promise: PromiseLike<T> | null | undefined) {
266+
if (!promise) {
267+
return null;
268+
}
269+
270+
try {
271+
return await promise;
272+
} catch {
273+
// Deliberately swallowed: callers treat a failed lookup the same as a missing one.
return null;
274+
}
275+
}
276+
277+
/**
 * Registers a Next.js `after` callback that reports one PDF-to-Markdown
 * generation to PostHog once its capture event is available.
 *
 * The callback waits for `captureEventPromise`; a rejection or a null event
 * means nothing is reported. Otherwise it awaits the event's usage, response
 * and finish-reason promise-likes one after another through safeAwait (each
 * failure becomes null), truncates the Markdown output, and calls
 * capturePostHogAiGeneration.
 *
 * @param captureEventPromise - Resolves to the event to report, or null to skip reporting.
 *
 * NOTE(review): the capturePostHogAiGeneration call is awaited without a
 * try/catch, so a rejection there propagates out of the `after` callback.
 */
function schedulePdfMarkdownAiCapture(
278+
captureEventPromise: Promise<PdfMarkdownAiCaptureEvent | null>,
279+
) {
280+
after(async () => {
281+
const captureEvent = await captureEventPromise.catch(() => null);
282+
if (!captureEvent) {
283+
return;
284+
}
285+
286+
const usage = await safeAwait(captureEvent.usage);
287+
const response = await safeAwait(captureEvent.response);
288+
const finishReason = await safeAwait(captureEvent.finishReason);
289+
// Truthiness check: an empty-string markdown is treated like a missing one, so no output choice is sent.
const outputText = captureEvent.markdown
290+
? truncateForPostHogAiText(captureEvent.markdown)
291+
: null;
292+
293+
await capturePostHogAiGeneration({
294+
distinctId: captureEvent.distinctId,
295+
traceId: captureEvent.traceId,
296+
sessionId: captureEvent.sessionId,
297+
// Prefer the id from the awaited response; fall back to the id generated in POST.
spanId: response?.id ?? captureEvent.spanId,
298+
spanName: "pdf_markdown_extraction",
299+
model: captureEvent.modelId,
300+
provider: captureEvent.provider,
301+
input: [
302+
createAiTextMessage("system", PDF_MARKDOWN_SYSTEM_PROMPT),
303+
{
304+
role: "user",
305+
content: [
306+
{
307+
type: "text",
308+
text: captureEvent.userPrompt,
309+
},
310+
{
311+
// The PDF is described by file name and byte count only; its contents are not included here.
type: "file",
312+
mediaType: "application/pdf",
313+
filename: getSafePdfFileName(captureEvent.fileName),
314+
bytes: captureEvent.fileBytes,
315+
},
316+
],
317+
},
318+
],
319+
inputTokens: usage?.inputTokens,
320+
outputChoices: outputText
321+
? [createAiTextMessage("assistant", outputText.text)]
322+
: undefined,
323+
outputTokens: usage?.outputTokens,
324+
latencySeconds: captureEvent.latencySeconds,
325+
httpStatus: captureEvent.httpStatus,
326+
isError: captureEvent.isError,
327+
error: captureEvent.error,
328+
// With no finish reason available, error events are labelled "error" and all others stay undefined.
stopReason:
329+
finishReason ?? (captureEvent.isError ? "error" : undefined),
330+
stream: true,
331+
maxTokens: PDF_MARKDOWN_MAX_OUTPUT_TOKENS,
332+
extraProperties: {
333+
ai_surface: "pdf_markdown",
334+
pdf_markdown_file_bytes: captureEvent.fileBytes,
335+
pdf_markdown_file_name: captureEvent.fileName,
336+
// NOTE(review): the full URL is forwarded as-is; confirm it can never carry a signed query string or other secret.
pdf_markdown_file_url: captureEvent.fileUrl,
337+
pdf_markdown_output_truncated:
338+
captureEvent.outputWasTruncated ?? outputText?.wasTruncated,
339+
pdf_markdown_question_count: captureEvent.questionCount,
340+
pdf_markdown_time_to_first_question:
341+
captureEvent.timeToFirstQuestionSeconds,
342+
},
343+
});
344+
});
345+
}
346+
206347
async function fetchPdfBuffer(fileUrl: URL) {
207348
const response = await fetch(fileUrl, {
208349
cache: "no-store",
@@ -237,7 +378,8 @@ async function fetchPdfBuffer(fileUrl: URL) {
237378

238379
export async function POST(request: NextRequest) {
239380
const session = await auth();
240-
if (!session?.user?.email) {
381+
const distinctId = session?.user?.id ?? session?.user?.email ?? null;
382+
if (!distinctId) {
241383
return NextResponse.json(
242384
{
243385
error: "You must be signed in to convert PDFs to Markdown.",
@@ -309,9 +451,36 @@ export async function POST(request: NextRequest) {
309451

310452
const model = getPdfMarkdownLanguageModel();
311453
const modelId = getPdfMarkdownModel();
454+
const provider = getAiProviderFromModel(modelId);
455+
const traceId = crypto.randomUUID();
456+
const spanId = crypto.randomUUID();
457+
const userPrompt =
458+
`Extract only the questions from ${parsedBody.fileName}. ` +
459+
"Return no metadata and no instructions.";
312460

313461
try {
314462
let streamError: unknown = null;
463+
const llmStartedAt = Date.now();
464+
let firstQuestionAt: number | null = null;
465+
let resolveCaptureEvent:
466+
| ((event: PdfMarkdownAiCaptureEvent | null) => void)
467+
| null = null;
468+
const captureEventPromise = new Promise<PdfMarkdownAiCaptureEvent | null>(
469+
(resolve) => {
470+
resolveCaptureEvent = resolve;
471+
},
472+
);
473+
const resolveCaptureEventOnce = (
474+
event: PdfMarkdownAiCaptureEvent | null,
475+
) => {
476+
if (!resolveCaptureEvent) {
477+
return;
478+
}
479+
480+
resolveCaptureEvent(event);
481+
resolveCaptureEvent = null;
482+
};
483+
315484
const result = streamText({
316485
model,
317486
system: PDF_MARKDOWN_SYSTEM_PROMPT,
@@ -321,9 +490,7 @@ export async function POST(request: NextRequest) {
321490
content: [
322491
{
323492
type: "text",
324-
text:
325-
`Extract only the questions from ${parsedBody.fileName}. ` +
326-
"Return no metadata and no instructions.",
493+
text: userPrompt,
327494
},
328495
{
329496
type: "file",
@@ -341,7 +508,7 @@ export async function POST(request: NextRequest) {
341508
"A faithful ordered list of only question numbers, question text, and marks.",
342509
}),
343510
abortSignal: request.signal,
344-
maxOutputTokens: 12000,
511+
maxOutputTokens: PDF_MARKDOWN_MAX_OUTPUT_TOKENS,
345512
experimental_include: {
346513
requestBody: false,
347514
},
@@ -356,6 +523,8 @@ export async function POST(request: NextRequest) {
356523
},
357524
});
358525

526+
schedulePdfMarkdownAiCapture(captureEventPromise);
527+
359528
const encoder = new TextEncoder();
360529

361530
const stream = new ReadableStream<Uint8Array>({
@@ -368,6 +537,10 @@ export async function POST(request: NextRequest) {
368537
const streamedQuestions: PdfPaperQuestion[] = [];
369538

370539
for await (const question of result.elementStream) {
540+
if (firstQuestionAt === null) {
541+
firstQuestionAt = Date.now();
542+
}
543+
371544
streamedQuestions.push(question);
372545
enqueue({
373546
type: "partial",
@@ -383,23 +556,89 @@ export async function POST(request: NextRequest) {
383556
schemaVersion: "exam-questions-v1",
384557
questions,
385558
});
559+
const markdown = buildPdfPaperMarkdown(paper);
386560
enqueue({
387561
type: "done",
388562
paper,
389-
markdown: buildPdfPaperMarkdown(paper),
563+
markdown,
390564
model: modelId,
391565
});
566+
resolveCaptureEventOnce({
567+
distinctId,
568+
fileBytes: pdfBuffer.byteLength,
569+
fileName: parsedBody.fileName,
570+
fileUrl: fileUrl.href,
571+
finishReason: result.finishReason,
572+
httpStatus: 200,
573+
isError: false,
574+
latencySeconds: Math.max(Date.now() - llmStartedAt, 0) / 1000,
575+
markdown,
576+
modelId,
577+
provider,
578+
questionCount: paper.questions.length,
579+
response: result.response,
580+
sessionId: parsedBody.posthogSessionId ?? undefined,
581+
spanId,
582+
timeToFirstQuestionSeconds:
583+
firstQuestionAt === null
584+
? undefined
585+
: Math.max(firstQuestionAt - llmStartedAt, 0) / 1000,
586+
traceId,
587+
usage: result.totalUsage,
588+
userPrompt,
589+
});
392590
} catch (error) {
591+
const errorMessage = getStreamErrorMessage(error, streamError);
393592
enqueue({
394593
type: "error",
395-
error: getStreamErrorMessage(error, streamError),
594+
error: errorMessage,
595+
});
596+
resolveCaptureEventOnce({
597+
distinctId,
598+
error: errorMessage,
599+
fileBytes: pdfBuffer.byteLength,
600+
fileName: parsedBody.fileName,
601+
fileUrl: fileUrl.href,
602+
httpStatus: request.signal.aborted ? 499 : 500,
603+
isError: true,
604+
latencySeconds: Math.max(Date.now() - llmStartedAt, 0) / 1000,
605+
modelId,
606+
provider,
607+
sessionId: parsedBody.posthogSessionId ?? undefined,
608+
spanId,
609+
timeToFirstQuestionSeconds:
610+
firstQuestionAt === null
611+
? undefined
612+
: Math.max(firstQuestionAt - llmStartedAt, 0) / 1000,
613+
traceId,
614+
userPrompt,
396615
});
397616
} finally {
617+
resolveCaptureEventOnce(null);
398618
controller.close();
399619
}
400620
},
401621
cancel() {
402-
request.signal.throwIfAborted();
622+
resolveCaptureEventOnce({
623+
distinctId,
624+
error: "PDF Markdown conversion was cancelled.",
625+
fileBytes: pdfBuffer.byteLength,
626+
fileName: parsedBody.fileName,
627+
fileUrl: fileUrl.href,
628+
httpStatus: 499,
629+
isError: true,
630+
latencySeconds: Math.max(Date.now() - llmStartedAt, 0) / 1000,
631+
modelId,
632+
provider,
633+
sessionId: parsedBody.posthogSessionId ?? undefined,
634+
spanId,
635+
timeToFirstQuestionSeconds:
636+
firstQuestionAt === null
637+
? undefined
638+
: Math.max(firstQuestionAt - llmStartedAt, 0) / 1000,
639+
traceId,
640+
userPrompt,
641+
});
403642
},
404643
});
405644

app/components/pdfviewer.tsx

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ import { downloadPdfFile } from "@/lib/downloads/browser-downloads";
5555
import { getFallbackPdfFileName } from "@/lib/downloads/resource-names";
5656
import { invalidatePdfBuffer, loadPdfBuffer } from "@/lib/pdf/pdf-buffer-cache";
5757
import { usePreloadedPdfiumEngine } from "@/lib/pdf/pdfium-engine-cache";
58-
import { capturePdfDownloaded } from "@/lib/posthog/client";
58+
import { capturePdfDownloaded, getPostHogSessionId } from "@/lib/posthog/client";
5959
import {
6060
clearActivePdfSnapshot,
6161
setActivePdfSnapshot,
@@ -202,6 +202,7 @@ async function loadPdfPaper(input: {
202202
body: JSON.stringify({
203203
fileName: input.fileName,
204204
fileUrl: input.fileUrl,
205+
posthogSessionId: getPostHogSessionId(),
205206
}),
206207
cache: "no-store",
207208
signal: input.signal,

lib/ai/pdf-markdown.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import { openai } from "@ai-sdk/openai";
22
import { z } from "zod";
33

4-
const DEFAULT_PDF_MARKDOWN_MODEL = "gpt-5.4-nano";
4+
const DEFAULT_PDF_MARKDOWN_MODEL = "gpt-5.4-mini";
55

66
function normalizeOpenAiModelId(modelId: string) {
77
return modelId.replace(/^openai\//, "");

0 commit comments

Comments
 (0)