Skip to content

Commit 1cddcb0

Browse files
authored
Merge pull request #398 from ACM-VIT/dev
Dev
2 parents 0c313be + faeb3e9 commit 1cddcb0

6 files changed

Lines changed: 400 additions & 67 deletions

File tree

app/(app)/past_papers/[code]/paper/[id]/page.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ async function PaperViewerContent({
289289
<div className="overflow-hidden border border-black/15 bg-white shadow-[0_4px_28px_-14px_rgba(0,0,0,0.25)] dark:border-[#D5D5D5]/15 dark:bg-[#0C1222] dark:shadow-[0_4px_28px_-14px_rgba(0,0,0,0.6)]">
290290
<div className="h-[70dvh] sm:h-[78dvh] lg:h-[84dvh] xl:h-[86dvh]">
291291
<PDFViewerClient
292+
enableQuestionMarkdown
292293
fileUrl={paper.fileUrl}
293294
fileName={downloadFileName}
294295
/>

app/api/pdf/markdown/route.ts

Lines changed: 260 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { NextResponse, type NextRequest } from "next/server";
1+
import { after, NextResponse, type NextRequest } from "next/server";
22
import { Output, streamText } from "ai";
33
import { z } from "zod";
44
import { auth } from "@/app/auth";
@@ -10,30 +10,76 @@ import {
1010
getPdfMarkdownModel,
1111
} from "@/lib/ai/pdf-markdown";
1212
import type { PdfPaperQuestion } from "@/lib/ai/pdf-markdown";
13+
import {
14+
capturePostHogAiGeneration,
15+
createAiTextMessage,
16+
} from "@/lib/posthog/llm";
1317

1418
const MAX_PDF_MARKDOWN_BYTES = 24 * 1024 * 1024;
19+
const PDF_MARKDOWN_MAX_OUTPUT_TOKENS = 12000;
20+
const POSTHOG_AI_TEXT_LIMIT = 8000;
1521
const PDF_MARKDOWN_SYSTEM_PROMPT = [
16-
"You extract only the exam questions from ExamCooker question-paper PDFs.",
17-
"Ignore and discard all cover-page metadata, institution/course details, course code, course name, slot, registration fields, faculty names, course outcomes, general instructions, page separators, CO columns, and Bloom taxonomy columns.",
22+
"You are a careful transcription engine for ExamCooker question-paper PDFs.",
23+
"Extract only the exam questions. Ignore cover-page metadata, institution/course details, course code, course name, slot, registration fields, faculty names, course outcomes, page separators, general instructions, CO columns, and Bloom taxonomy columns.",
24+
"Use only text that is visibly present in the PDF pages. Do not use the filename, course title, subject knowledge, expected exam patterns, or surrounding context to fill missing words, marks, formulas, or question numbers.",
1825
"Your output must contain only question number, question text, and marks.",
26+
"Read every page in visual order: top-to-bottom and left-to-right, following table rows and continuation lines carefully. Preserve the source order exactly.",
1927
"If the source has a table with columns like Q. No, Question, M, CO, and BL, keep Q. No as number, Question as text, M as marks, and drop CO/BL.",
20-
"Do not solve, summarize, explain, reword, or add content that is not present in the PDF.",
21-
"Preserve questions in reading order. Merge continuation lines or rows into the same question when a question spans pages.",
22-
"Preserve subparts, options, formulas, equations, and tables that are part of the question text. Use Markdown for math, lists, tables, and code where helpful.",
28+
"Keep question numbers and subpart labels exactly as shown, including forms like 1(a), 1. a), 2(i), or OR alternatives. Do not merge separate alternatives into one question unless the PDF clearly shows them as a single question block.",
29+
"Merge continuation lines or page-spanning rows only when they are clearly part of the same question. Do not merge unrelated rows, headings, instructions, or the next question into the current question.",
30+
"Transcribe the question text faithfully. Do not solve, summarize, explain, simplify, reword, correct grammar, normalize spelling, or add content that is not present in the PDF.",
31+
"Preserve symbols, variable names, numbers, options, units, punctuation, code, tables, formulas, and equations that are part of the question text. Use Markdown only to represent the visible structure more clearly.",
32+
"Pay special attention to visually similar characters in math, code, and identifiers, such as x/z, y/v, O/0, I/l/1, S/5, and Greek/Latin lookalikes. If a character is ambiguous, use `[illegible]` for that character or span instead of substituting the most likely one.",
2333
"For math, use valid LaTeX delimiters: `$...$` for inline math and `$$...$$` for display math. Do not escape the dollar delimiters and do not double-escape LaTeX backslashes.",
24-
"Use [illegible] only for characters or words that are truly unreadable. Do not guess missing marks.",
34+
"For marks, copy exactly the value shown in the marks or M column. If marks are missing, cropped, ambiguous, or only inferable from totals, use null. Never guess missing marks.",
35+
"If any word, number, symbol, formula, or mark is hard to read, write `[illegible]` for only that unreadable span instead of guessing. Prefer a small `[illegible]` marker over a confident but possibly wrong transcription.",
36+
"Before finishing, verify that the output contains no metadata, no instructions-only text, no CO/BL values, no solutions, no invented text, and no reordered questions.",
2537
].join("\n");
2638

2739
const PdfMarkdownRequestSchema = z.object({
2840
fileName: z.string().trim().min(1).max(240),
2941
fileUrl: z.string().trim().url(),
42+
posthogSessionId: z.string().trim().min(1).max(200).nullable().optional(),
3043
});
3144

3245
type AllowedPdfSource = {
3346
origin: string;
3447
pathPrefix: string;
3548
};
3649

50+
/**
 * Token usage as read from the streaming result's usage promise.
 *
 * Only the two counters that are forwarded to the PostHog AI generation event
 * are modelled; either may be absent.
 */
type AiUsageSummary = {
  /** Tokens consumed by the prompt, forwarded as `inputTokens`. */
  inputTokens?: number;
  /** Tokens produced by the model, forwarded as `outputTokens`. */
  outputTokens?: number;
};
54+
55+
/**
 * The part of the streaming result's response metadata used for analytics.
 */
type AiResponseSummary = {
  /** Response id; when present it replaces the locally generated span id. */
  id?: string;
};
58+
59+
/**
 * Everything needed to report one PDF-to-Markdown generation to PostHog.
 *
 * The route resolves exactly one of these per request (success, failure, or
 * cancellation). The promise-valued fields are left unresolved on purpose:
 * they are awaited later, inside the deferred capture callback.
 */
type PdfMarkdownAiCaptureEvent = {
  /** Identifier of the signed-in user (user id, falling back to email). */
  distinctId: string;
  /** Failure detail; the route passes the user-facing error message. */
  error?: unknown;
  /** Size of the fetched PDF in bytes. */
  fileBytes: number;
  /** File name as supplied in the request body. */
  fileName: string;
  /** Fully resolved URL the PDF was fetched from. */
  fileUrl: string;
  /** Model finish reason, reported as the stop reason once resolved. */
  finishReason?: PromiseLike<string | undefined> | null;
  /** 200 on success, 500 on failure, 499 when aborted or cancelled. */
  httpStatus: number;
  /** Whether the generation ended in an error or cancellation. */
  isError: boolean;
  /** Seconds elapsed between starting the model call and this event. */
  latencySeconds: number;
  /** Final Markdown output; truncated before it is sent to PostHog. */
  markdown?: string;
  /** Model identifier used for the generation. */
  modelId: string;
  /**
   * Explicit truncation flag. When unset, the flag reported to PostHog is
   * derived from whether `markdown` had to be truncated.
   */
  outputWasTruncated?: boolean;
  /** Provider label derived from the model identifier. */
  provider: string;
  /** Number of questions in the final paper. */
  questionCount?: number;
  /** Response metadata; its `id` is preferred over `spanId` when available. */
  response?: PromiseLike<AiResponseSummary> | null;
  /** PostHog session id forwarded by the client, if any. */
  sessionId?: string;
  /** Locally generated span id, used when the response carries no id. */
  spanId: string;
  /** Seconds until the first question was streamed; unset if none arrived. */
  timeToFirstQuestionSeconds?: number;
  /** Locally generated trace id for this request. */
  traceId: string;
  /** Token usage, awaited when the event is reported. */
  usage?: PromiseLike<AiUsageSummary> | null;
  /** Exact user prompt text sent to the model. */
  userPrompt: string;
};
82+
3783
function getAzureBaseUrlFromEnv() {
3884
const explicitBaseUrl = process.env.AZURE_BLOB_PUBLIC_BASE_URL?.trim();
3985
if (explicitBaseUrl) {
@@ -197,6 +243,107 @@ function getStreamErrorMessage(error: unknown, streamError: unknown) {
197243
return fallbackMessage || "Failed to convert this PDF to Markdown.";
198244
}
199245

246+
/**
 * Derives the provider label from a `provider/model` style identifier.
 *
 * @param modelId Model identifier, optionally prefixed with `provider/`.
 * @returns The text before the first `/`, or `"openai"` when the identifier
 *   has no `/` or nothing precedes it.
 */
function getAiProviderFromModel(modelId: string): string {
  const separatorIndex = modelId.indexOf("/");

  // -1 means no separator, 0 means an empty provider prefix; both fall back.
  return separatorIndex > 0 ? modelId.slice(0, separatorIndex) : "openai";
}
250+
251+
/**
 * Caps text that is about to be attached to a PostHog AI event.
 *
 * @param text Text to cap.
 * @param limit Maximum length in UTF-16 code units; defaults to
 *   `POSTHOG_AI_TEXT_LIMIT`. Negative or fractional values are clamped to a
 *   non-negative integer.
 * @returns The (possibly shortened) text and whether anything was cut off.
 */
function truncateForPostHogAiText(
  text: string,
  limit: number = POSTHOG_AI_TEXT_LIMIT,
): { text: string; wasTruncated: boolean } {
  const maxLength = Math.max(0, Math.floor(limit));

  if (text.length <= maxLength) {
    return { text, wasTruncated: false };
  }

  let end = maxLength;
  if (end > 0) {
    // `slice` counts UTF-16 code units, so the cut can land between the two
    // halves of a surrogate pair (emoji, some math symbols). Drop a trailing
    // high surrogate so the result never ends in a dangling half character.
    const lastUnit = text.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      end -= 1;
    }
  }

  return { text: text.slice(0, end), wasTruncated: true };
}
264+
265+
/**
 * Awaits a promise-like value without ever rejecting.
 *
 * Used for optional analytics metadata, where a missing or failed value must
 * not prevent the rest of the event from being reported; the rejection reason
 * is intentionally discarded.
 *
 * @param promise Value to await; may be `null` or `undefined`.
 * @returns The resolved value, or `null` when no promise was given or it
 *   rejected.
 */
async function safeAwait<T>(
  promise: PromiseLike<T> | null | undefined,
): Promise<Awaited<T> | null> {
  if (!promise) {
    return null;
  }

  try {
    return await promise;
  } catch {
    return null;
  }
}
276+
277+
/**
 * Schedules the PostHog AI-generation report for a PDF-to-Markdown request.
 *
 * The work is registered with Next.js `after()`, so it is not awaited by the
 * caller. Nothing is reported when the event promise rejects or resolves to
 * `null`.
 *
 * @param captureEventPromise Resolves with the event describing how the
 *   generation ended, or `null` when there is nothing to report.
 */
function schedulePdfMarkdownAiCapture(
  captureEventPromise: Promise<PdfMarkdownAiCaptureEvent | null>,
): void {
  after(async () => {
    const captureEvent = await captureEventPromise.catch(() => null);
    if (!captureEvent) {
      return;
    }

    // The three lookups are independent and `safeAwait` never rejects, so
    // they can settle together instead of one after another.
    const [usage, response, finishReason] = await Promise.all([
      safeAwait(captureEvent.usage),
      safeAwait(captureEvent.response),
      safeAwait(captureEvent.finishReason),
    ]);

    // Only a capped copy of the Markdown is attached to the event.
    const outputText = captureEvent.markdown
      ? truncateForPostHogAiText(captureEvent.markdown)
      : null;

    await capturePostHogAiGeneration({
      distinctId: captureEvent.distinctId,
      traceId: captureEvent.traceId,
      sessionId: captureEvent.sessionId,
      // Prefer the response id so the span can be matched to the response.
      spanId: response?.id ?? captureEvent.spanId,
      spanName: "pdf_markdown_extraction",
      model: captureEvent.modelId,
      provider: captureEvent.provider,
      input: [
        createAiTextMessage("system", PDF_MARKDOWN_SYSTEM_PROMPT),
        {
          role: "user",
          content: [
            {
              type: "text",
              text: captureEvent.userPrompt,
            },
            {
              // The PDF itself is not attached; only its name and size are.
              type: "file",
              mediaType: "application/pdf",
              filename: getSafePdfFileName(captureEvent.fileName),
              bytes: captureEvent.fileBytes,
            },
          ],
        },
      ],
      inputTokens: usage?.inputTokens,
      outputChoices: outputText
        ? [createAiTextMessage("assistant", outputText.text)]
        : undefined,
      outputTokens: usage?.outputTokens,
      latencySeconds: captureEvent.latencySeconds,
      httpStatus: captureEvent.httpStatus,
      isError: captureEvent.isError,
      error: captureEvent.error,
      stopReason:
        finishReason ?? (captureEvent.isError ? "error" : undefined),
      stream: true,
      maxTokens: PDF_MARKDOWN_MAX_OUTPUT_TOKENS,
      extraProperties: {
        ai_surface: "pdf_markdown",
        pdf_markdown_file_bytes: captureEvent.fileBytes,
        pdf_markdown_file_name: captureEvent.fileName,
        pdf_markdown_file_url: captureEvent.fileUrl,
        // An explicit flag on the event wins over the computed one.
        pdf_markdown_output_truncated:
          captureEvent.outputWasTruncated ?? outputText?.wasTruncated,
        pdf_markdown_question_count: captureEvent.questionCount,
        pdf_markdown_time_to_first_question:
          captureEvent.timeToFirstQuestionSeconds,
      },
    });
  });
}
346+
200347
async function fetchPdfBuffer(fileUrl: URL) {
201348
const response = await fetch(fileUrl, {
202349
cache: "no-store",
@@ -231,7 +378,8 @@ async function fetchPdfBuffer(fileUrl: URL) {
231378

232379
export async function POST(request: NextRequest) {
233380
const session = await auth();
234-
if (!session?.user?.email) {
381+
const distinctId = session?.user?.id ?? session?.user?.email ?? null;
382+
if (!distinctId) {
235383
return NextResponse.json(
236384
{
237385
error: "You must be signed in to convert PDFs to Markdown.",
@@ -303,9 +451,36 @@ export async function POST(request: NextRequest) {
303451

304452
const model = getPdfMarkdownLanguageModel();
305453
const modelId = getPdfMarkdownModel();
454+
const provider = getAiProviderFromModel(modelId);
455+
const traceId = crypto.randomUUID();
456+
const spanId = crypto.randomUUID();
457+
const userPrompt =
458+
`Extract only the questions from ${parsedBody.fileName}. ` +
459+
"Return no metadata and no instructions.";
306460

307461
try {
308462
let streamError: unknown = null;
463+
const llmStartedAt = Date.now();
464+
let firstQuestionAt: number | null = null;
465+
let resolveCaptureEvent:
466+
| ((event: PdfMarkdownAiCaptureEvent | null) => void)
467+
| null = null;
468+
const captureEventPromise = new Promise<PdfMarkdownAiCaptureEvent | null>(
469+
(resolve) => {
470+
resolveCaptureEvent = resolve;
471+
},
472+
);
473+
const resolveCaptureEventOnce = (
474+
event: PdfMarkdownAiCaptureEvent | null,
475+
) => {
476+
if (!resolveCaptureEvent) {
477+
return;
478+
}
479+
480+
resolveCaptureEvent(event);
481+
resolveCaptureEvent = null;
482+
};
483+
309484
const result = streamText({
310485
model,
311486
system: PDF_MARKDOWN_SYSTEM_PROMPT,
@@ -315,9 +490,7 @@ export async function POST(request: NextRequest) {
315490
content: [
316491
{
317492
type: "text",
318-
text:
319-
`Extract only the questions from ${parsedBody.fileName}. ` +
320-
"Return no metadata and no instructions.",
493+
text: userPrompt,
321494
},
322495
{
323496
type: "file",
@@ -335,7 +508,7 @@ export async function POST(request: NextRequest) {
335508
"A faithful ordered list of only question numbers, question text, and marks.",
336509
}),
337510
abortSignal: request.signal,
338-
maxOutputTokens: 12000,
511+
maxOutputTokens: PDF_MARKDOWN_MAX_OUTPUT_TOKENS,
339512
experimental_include: {
340513
requestBody: false,
341514
},
@@ -350,6 +523,8 @@ export async function POST(request: NextRequest) {
350523
},
351524
});
352525

526+
schedulePdfMarkdownAiCapture(captureEventPromise);
527+
353528
const encoder = new TextEncoder();
354529

355530
const stream = new ReadableStream<Uint8Array>({
@@ -362,6 +537,10 @@ export async function POST(request: NextRequest) {
362537
const streamedQuestions: PdfPaperQuestion[] = [];
363538

364539
for await (const question of result.elementStream) {
540+
if (firstQuestionAt === null) {
541+
firstQuestionAt = Date.now();
542+
}
543+
365544
streamedQuestions.push(question);
366545
enqueue({
367546
type: "partial",
@@ -377,23 +556,89 @@ export async function POST(request: NextRequest) {
377556
schemaVersion: "exam-questions-v1",
378557
questions,
379558
});
559+
const markdown = buildPdfPaperMarkdown(paper);
380560
enqueue({
381561
type: "done",
382562
paper,
383-
markdown: buildPdfPaperMarkdown(paper),
563+
markdown,
384564
model: modelId,
385565
});
566+
resolveCaptureEventOnce({
567+
distinctId,
568+
fileBytes: pdfBuffer.byteLength,
569+
fileName: parsedBody.fileName,
570+
fileUrl: fileUrl.href,
571+
finishReason: result.finishReason,
572+
httpStatus: 200,
573+
isError: false,
574+
latencySeconds: Math.max(Date.now() - llmStartedAt, 0) / 1000,
575+
markdown,
576+
modelId,
577+
provider,
578+
questionCount: paper.questions.length,
579+
response: result.response,
580+
sessionId: parsedBody.posthogSessionId ?? undefined,
581+
spanId,
582+
timeToFirstQuestionSeconds:
583+
firstQuestionAt === null
584+
? undefined
585+
: Math.max(firstQuestionAt - llmStartedAt, 0) / 1000,
586+
traceId,
587+
usage: result.totalUsage,
588+
userPrompt,
589+
});
386590
} catch (error) {
591+
const errorMessage = getStreamErrorMessage(error, streamError);
387592
enqueue({
388593
type: "error",
389-
error: getStreamErrorMessage(error, streamError),
594+
error: errorMessage,
595+
});
596+
resolveCaptureEventOnce({
597+
distinctId,
598+
error: errorMessage,
599+
fileBytes: pdfBuffer.byteLength,
600+
fileName: parsedBody.fileName,
601+
fileUrl: fileUrl.href,
602+
httpStatus: request.signal.aborted ? 499 : 500,
603+
isError: true,
604+
latencySeconds: Math.max(Date.now() - llmStartedAt, 0) / 1000,
605+
modelId,
606+
provider,
607+
sessionId: parsedBody.posthogSessionId ?? undefined,
608+
spanId,
609+
timeToFirstQuestionSeconds:
610+
firstQuestionAt === null
611+
? undefined
612+
: Math.max(firstQuestionAt - llmStartedAt, 0) / 1000,
613+
traceId,
614+
userPrompt,
390615
});
391616
} finally {
617+
resolveCaptureEventOnce(null);
392618
controller.close();
393619
}
394620
},
395621
cancel() {
396-
request.signal.throwIfAborted();
622+
resolveCaptureEventOnce({
623+
distinctId,
624+
error: "PDF Markdown conversion was cancelled.",
625+
fileBytes: pdfBuffer.byteLength,
626+
fileName: parsedBody.fileName,
627+
fileUrl: fileUrl.href,
628+
httpStatus: 499,
629+
isError: true,
630+
latencySeconds: Math.max(Date.now() - llmStartedAt, 0) / 1000,
631+
modelId,
632+
provider,
633+
sessionId: parsedBody.posthogSessionId ?? undefined,
634+
spanId,
635+
timeToFirstQuestionSeconds:
636+
firstQuestionAt === null
637+
? undefined
638+
: Math.max(firstQuestionAt - llmStartedAt, 0) / 1000,
639+
traceId,
640+
userPrompt,
641+
});
397642
},
398643
});
399644

app/components/past_papers/paper-split-view.tsx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,11 @@ function PaperSplitPanel({
256256
</header>
257257

258258
<div className="min-h-0 flex-1 overflow-hidden border border-black/15 bg-white shadow-[0_4px_28px_-14px_rgba(0,0,0,0.25)] dark:border-[#D5D5D5]/15 dark:bg-[#0C1222] dark:shadow-[0_4px_28px_-14px_rgba(0,0,0,0.6)]">
259-
<PDFViewerClient fileUrl={paper.fileUrl} fileName={paper.fileName} />
259+
<PDFViewerClient
260+
enableQuestionMarkdown
261+
fileUrl={paper.fileUrl}
262+
fileName={paper.fileName}
263+
/>
260264
</div>
261265
</aside>
262266
);

0 commit comments

Comments
 (0)