1- import { NextResponse , type NextRequest } from "next/server" ;
1+ import { after , NextResponse , type NextRequest } from "next/server" ;
22import { Output , streamText } from "ai" ;
33import { z } from "zod" ;
44import { auth } from "@/app/auth" ;
@@ -10,8 +10,14 @@ import {
1010 getPdfMarkdownModel ,
1111} from "@/lib/ai/pdf-markdown" ;
1212import type { PdfPaperQuestion } from "@/lib/ai/pdf-markdown" ;
13+ import {
14+ capturePostHogAiGeneration ,
15+ createAiTextMessage ,
16+ } from "@/lib/posthog/llm" ;
1317
/** Byte budget for a source PDF: 24 MiB. (Where it is enforced is not visible in this section.) */
const MAX_PDF_MARKDOWN_BYTES = 24 * 1024 ** 2;

/** Output-token ceiling handed to the model call and echoed in the analytics event. */
const PDF_MARKDOWN_MAX_OUTPUT_TOKENS = 12000;

/** Maximum number of characters of generated text forwarded to PostHog. */
const POSTHOG_AI_TEXT_LIMIT = 8000;
1521const PDF_MARKDOWN_SYSTEM_PROMPT = [
1622 "You are a careful transcription engine for ExamCooker question-paper PDFs." ,
1723 "Extract only the exam questions. Ignore cover-page metadata, institution/course details, course code, course name, slot, registration fields, faculty names, course outcomes, page separators, general instructions, CO columns, and Bloom taxonomy columns." ,
@@ -33,13 +39,47 @@ const PDF_MARKDOWN_SYSTEM_PROMPT = [
/**
 * Shape of the JSON body accepted by the PDF-to-Markdown endpoint.
 *
 * - `fileName`: trimmed, 1–240 characters.
 * - `fileUrl`: trimmed, must parse as a URL.
 * - `posthogSessionId`: optional and nullable; when present it is trimmed and
 *   must be 1–200 characters long.
 */
const PdfMarkdownRequestSchema = z.object({
    fileName: z
        .string()
        .trim()
        .min(1)
        .max(240),
    fileUrl: z
        .string()
        .trim()
        .url(),
    posthogSessionId: z
        .string()
        .trim()
        .min(1)
        .max(200)
        .nullable()
        .optional(),
});
3744
/**
 * An origin / path-prefix pair identifying a PDF source.
 * NOTE(review): presumably used as an allow-list entry for `fileUrl`; the
 * matching logic is outside this section — confirm against the caller.
 */
type AllowedPdfSource = {
    /** URL origin component of the source. */
    origin: string;
    /** Path prefix component of the source. */
    pathPrefix: string;
};
4249
/**
 * The subset of a model usage report that the analytics capture reads.
 * Both counters are optional and are forwarded unchanged when available.
 */
type AiUsageSummary = {
    /** Tokens consumed by the prompt, if reported. */
    inputTokens?: number;
    /** Tokens produced by the model, if reported. */
    outputTokens?: number;
};
54+
/**
 * The subset of a model response that the analytics capture reads.
 * When `id` is present it is used as the span id of the reported generation.
 */
type AiResponseSummary = {
    /** Provider-assigned response identifier, if any. */
    id?: string;
};
58+
/**
 * Everything needed to report one PDF-to-Markdown generation to PostHog.
 *
 * The request handler fills this in when the stream completes, fails, or is
 * cancelled. The promise-like fields are awaited later by the capture task,
 * and a rejected or missing promise is treated as "no value".
 */
type PdfMarkdownAiCaptureEvent = {
    // --- Identity and tracing ---
    distinctId: string;
    sessionId?: string;
    traceId: string;
    /** Fallback span id; superseded by the model response id when one resolves. */
    spanId: string;

    // --- Model ---
    modelId: string;
    provider: string;

    // --- Input ---
    fileName: string;
    fileUrl: string;
    fileBytes: number;
    userPrompt: string;

    // --- Outcome ---
    httpStatus: number;
    isError: boolean;
    error?: unknown;
    markdown?: string;
    questionCount?: number;
    outputWasTruncated?: boolean;

    // --- Timing (seconds) ---
    latencySeconds: number;
    timeToFirstQuestionSeconds?: number;

    // --- Lazily resolved model metadata ---
    finishReason?: PromiseLike<string | undefined> | null;
    response?: PromiseLike<AiResponseSummary> | null;
    usage?: PromiseLike<AiUsageSummary> | null;
};
82+
4383function getAzureBaseUrlFromEnv ( ) {
4484 const explicitBaseUrl = process . env . AZURE_BLOB_PUBLIC_BASE_URL ?. trim ( ) ;
4585 if ( explicitBaseUrl ) {
@@ -203,6 +243,107 @@ function getStreamErrorMessage(error: unknown, streamError: unknown) {
203243 return fallbackMessage || "Failed to convert this PDF to Markdown." ;
204244}
205245
/**
 * Derives the provider name from a `provider/model` style identifier.
 *
 * @param modelId Model identifier, optionally prefixed with `<provider>/`.
 * @returns The text before the first `/`, or `"openai"` when there is no
 *   slash or nothing precedes it.
 */
function getAiProviderFromModel(modelId: string) {
    const slashIndex = modelId.indexOf("/");

    // -1: no provider prefix at all; 0: empty prefix ("/model").
    if (slashIndex <= 0) {
        return "openai";
    }

    return modelId.slice(0, slashIndex);
}
250+
/**
 * Caps a piece of text at a maximum number of UTF-16 code units before it is
 * sent to PostHog.
 *
 * The cut never lands in the middle of a surrogate pair: if the last kept
 * code unit would be a high surrogate whose low surrogate is being dropped,
 * the high surrogate is dropped as well, so the result never ends in half of
 * a character.
 *
 * @param text Text to cap.
 * @param limit Maximum length in UTF-16 code units. Defaults to
 *   `POSTHOG_AI_TEXT_LIMIT`; fractional values are floored and negative
 *   values are treated as 0.
 * @returns The (possibly shortened) text and whether anything was removed.
 */
function truncateForPostHogAiText(
    text: string,
    limit: number = POSTHOG_AI_TEXT_LIMIT,
) {
    const maxLength = Math.max(0, Math.floor(limit));

    if (text.length <= maxLength) {
        return {
            text,
            wasTruncated: false,
        };
    }

    let end = maxLength;
    if (end > 0) {
        const lastKept = text.charCodeAt(end - 1);
        const firstDropped = text.charCodeAt(end);
        const splitsSurrogatePair =
            lastKept >= 0xd800 &&
            lastKept <= 0xdbff &&
            firstDropped >= 0xdc00 &&
            firstDropped <= 0xdfff;

        // Back off one unit rather than emit a lone high surrogate.
        if (splitsSurrogatePair) {
            end -= 1;
        }
    }

    return {
        text: text.slice(0, end),
        wasTruncated: true,
    };
}
264+
/**
 * Awaits an optional promise-like value without ever throwing.
 *
 * Deliberately best-effort: a rejection is swallowed because callers treat a
 * failed lookup exactly like a missing one.
 *
 * @param promise Value to await; may be `null` or `undefined`.
 * @returns The settled value, or `null` when `promise` is absent or rejects.
 */
async function safeAwait<T>(promise: PromiseLike<T> | null | undefined) {
    let settled: Awaited<T> | null = null;

    if (promise) {
        try {
            settled = await promise;
        } catch {
            // Rejection is intentionally mapped to `null`.
        }
    }

    return settled;
}
276+
/**
 * Registers an `after()` task that reports one PDF-to-Markdown generation to
 * PostHog.
 *
 * The task waits for `captureEventPromise`. If it rejects or resolves to
 * `null`, nothing is reported. Otherwise the event's usage, response and
 * finish-reason promises are awaited (failures become `null`), the generated
 * Markdown is capped for transport, and a single generation event is sent.
 *
 * @param captureEventPromise Resolves with the details of the finished
 *   generation, or `null` when there is nothing to report.
 */
function schedulePdfMarkdownAiCapture(
    captureEventPromise: Promise<PdfMarkdownAiCaptureEvent | null>,
) {
    after(async () => {
        let event: PdfMarkdownAiCaptureEvent | null;
        try {
            event = await captureEventPromise;
        } catch {
            event = null;
        }

        if (!event) {
            return;
        }

        const {
            fileBytes,
            fileName,
            fileUrl,
            isError,
            markdown,
        } = event;

        const usage = await safeAwait(event.usage);
        const response = await safeAwait(event.response);
        const finishReason = await safeAwait(event.finishReason);

        // Empty or missing Markdown means there is no output to attach.
        const cappedOutput = markdown
            ? truncateForPostHogAiText(markdown)
            : null;

        // Prefer the model's own finish reason; fall back to "error" for failures.
        const fallbackStopReason = isError ? "error" : undefined;

        await capturePostHogAiGeneration({
            distinctId: event.distinctId,
            traceId: event.traceId,
            sessionId: event.sessionId,
            // The provider's response id, when known, identifies the span.
            spanId: response?.id ?? event.spanId,
            spanName: "pdf_markdown_extraction",
            model: event.modelId,
            provider: event.provider,
            input: [
                createAiTextMessage("system", PDF_MARKDOWN_SYSTEM_PROMPT),
                {
                    role: "user",
                    content: [
                        {
                            type: "text",
                            text: event.userPrompt,
                        },
                        {
                            // Only file metadata is reported, never the PDF bytes themselves.
                            type: "file",
                            mediaType: "application/pdf",
                            filename: getSafePdfFileName(fileName),
                            bytes: fileBytes,
                        },
                    ],
                },
            ],
            inputTokens: usage?.inputTokens,
            outputChoices: cappedOutput
                ? [createAiTextMessage("assistant", cappedOutput.text)]
                : undefined,
            outputTokens: usage?.outputTokens,
            latencySeconds: event.latencySeconds,
            httpStatus: event.httpStatus,
            isError,
            error: event.error,
            stopReason: finishReason ?? fallbackStopReason,
            stream: true,
            maxTokens: PDF_MARKDOWN_MAX_OUTPUT_TOKENS,
            extraProperties: {
                ai_surface: "pdf_markdown",
                pdf_markdown_file_bytes: fileBytes,
                pdf_markdown_file_name: fileName,
                pdf_markdown_file_url: fileUrl,
                pdf_markdown_output_truncated:
                    event.outputWasTruncated ?? cappedOutput?.wasTruncated,
                pdf_markdown_question_count: event.questionCount,
                pdf_markdown_time_to_first_question:
                    event.timeToFirstQuestionSeconds,
            },
        });
    });
}
346+
206347async function fetchPdfBuffer ( fileUrl : URL ) {
207348 const response = await fetch ( fileUrl , {
208349 cache : "no-store" ,
@@ -237,7 +378,8 @@ async function fetchPdfBuffer(fileUrl: URL) {
237378
238379export async function POST ( request : NextRequest ) {
239380 const session = await auth ( ) ;
240- if ( ! session ?. user ?. email ) {
381+ const distinctId = session ?. user ?. id ?? session ?. user ?. email ?? null ;
382+ if ( ! distinctId ) {
241383 return NextResponse . json (
242384 {
243385 error : "You must be signed in to convert PDFs to Markdown." ,
@@ -309,9 +451,36 @@ export async function POST(request: NextRequest) {
309451
310452 const model = getPdfMarkdownLanguageModel ( ) ;
311453 const modelId = getPdfMarkdownModel ( ) ;
454+ const provider = getAiProviderFromModel ( modelId ) ;
455+ const traceId = crypto . randomUUID ( ) ;
456+ const spanId = crypto . randomUUID ( ) ;
457+ const userPrompt =
458+ `Extract only the questions from ${ parsedBody . fileName } . ` +
459+ "Return no metadata and no instructions." ;
312460
313461 try {
314462 let streamError : unknown = null ;
463+ const llmStartedAt = Date . now ( ) ;
464+ let firstQuestionAt : number | null = null ;
465+ let resolveCaptureEvent :
466+ | ( ( event : PdfMarkdownAiCaptureEvent | null ) => void )
467+ | null = null ;
468+ const captureEventPromise = new Promise < PdfMarkdownAiCaptureEvent | null > (
469+ ( resolve ) => {
470+ resolveCaptureEvent = resolve ;
471+ } ,
472+ ) ;
473+ const resolveCaptureEventOnce = (
474+ event : PdfMarkdownAiCaptureEvent | null ,
475+ ) => {
476+ if ( ! resolveCaptureEvent ) {
477+ return ;
478+ }
479+
480+ resolveCaptureEvent ( event ) ;
481+ resolveCaptureEvent = null ;
482+ } ;
483+
315484 const result = streamText ( {
316485 model,
317486 system : PDF_MARKDOWN_SYSTEM_PROMPT ,
@@ -321,9 +490,7 @@ export async function POST(request: NextRequest) {
321490 content : [
322491 {
323492 type : "text" ,
324- text :
325- `Extract only the questions from ${ parsedBody . fileName } . ` +
326- "Return no metadata and no instructions." ,
493+ text : userPrompt ,
327494 } ,
328495 {
329496 type : "file" ,
@@ -341,7 +508,7 @@ export async function POST(request: NextRequest) {
341508 "A faithful ordered list of only question numbers, question text, and marks." ,
342509 } ) ,
343510 abortSignal : request . signal ,
344- maxOutputTokens : 12000 ,
511+ maxOutputTokens : PDF_MARKDOWN_MAX_OUTPUT_TOKENS ,
345512 experimental_include : {
346513 requestBody : false ,
347514 } ,
@@ -356,6 +523,8 @@ export async function POST(request: NextRequest) {
356523 } ,
357524 } ) ;
358525
526+ schedulePdfMarkdownAiCapture ( captureEventPromise ) ;
527+
359528 const encoder = new TextEncoder ( ) ;
360529
361530 const stream = new ReadableStream < Uint8Array > ( {
@@ -368,6 +537,10 @@ export async function POST(request: NextRequest) {
368537 const streamedQuestions : PdfPaperQuestion [ ] = [ ] ;
369538
370539 for await ( const question of result . elementStream ) {
540+ if ( firstQuestionAt === null ) {
541+ firstQuestionAt = Date . now ( ) ;
542+ }
543+
371544 streamedQuestions . push ( question ) ;
372545 enqueue ( {
373546 type : "partial" ,
@@ -383,23 +556,89 @@ export async function POST(request: NextRequest) {
383556 schemaVersion : "exam-questions-v1" ,
384557 questions,
385558 } ) ;
559+ const markdown = buildPdfPaperMarkdown ( paper ) ;
386560 enqueue ( {
387561 type : "done" ,
388562 paper,
389- markdown : buildPdfPaperMarkdown ( paper ) ,
563+ markdown,
390564 model : modelId ,
391565 } ) ;
566+ resolveCaptureEventOnce ( {
567+ distinctId,
568+ fileBytes : pdfBuffer . byteLength ,
569+ fileName : parsedBody . fileName ,
570+ fileUrl : fileUrl . href ,
571+ finishReason : result . finishReason ,
572+ httpStatus : 200 ,
573+ isError : false ,
574+ latencySeconds : Math . max ( Date . now ( ) - llmStartedAt , 0 ) / 1000 ,
575+ markdown,
576+ modelId,
577+ provider,
578+ questionCount : paper . questions . length ,
579+ response : result . response ,
580+ sessionId : parsedBody . posthogSessionId ?? undefined ,
581+ spanId,
582+ timeToFirstQuestionSeconds :
583+ firstQuestionAt === null
584+ ? undefined
585+ : Math . max ( firstQuestionAt - llmStartedAt , 0 ) / 1000 ,
586+ traceId,
587+ usage : result . totalUsage ,
588+ userPrompt,
589+ } ) ;
392590 } catch ( error ) {
591+ const errorMessage = getStreamErrorMessage ( error , streamError ) ;
393592 enqueue ( {
394593 type : "error" ,
395- error : getStreamErrorMessage ( error , streamError ) ,
594+ error : errorMessage ,
595+ } ) ;
596+ resolveCaptureEventOnce ( {
597+ distinctId,
598+ error : errorMessage ,
599+ fileBytes : pdfBuffer . byteLength ,
600+ fileName : parsedBody . fileName ,
601+ fileUrl : fileUrl . href ,
602+ httpStatus : request . signal . aborted ? 499 : 500 ,
603+ isError : true ,
604+ latencySeconds : Math . max ( Date . now ( ) - llmStartedAt , 0 ) / 1000 ,
605+ modelId,
606+ provider,
607+ sessionId : parsedBody . posthogSessionId ?? undefined ,
608+ spanId,
609+ timeToFirstQuestionSeconds :
610+ firstQuestionAt === null
611+ ? undefined
612+ : Math . max ( firstQuestionAt - llmStartedAt , 0 ) / 1000 ,
613+ traceId,
614+ userPrompt,
396615 } ) ;
397616 } finally {
617+ resolveCaptureEventOnce ( null ) ;
398618 controller . close ( ) ;
399619 }
400620 } ,
401621 cancel ( ) {
402- request . signal . throwIfAborted ( ) ;
622+ resolveCaptureEventOnce ( {
623+ distinctId,
624+ error : "PDF Markdown conversion was cancelled." ,
625+ fileBytes : pdfBuffer . byteLength ,
626+ fileName : parsedBody . fileName ,
627+ fileUrl : fileUrl . href ,
628+ httpStatus : 499 ,
629+ isError : true ,
630+ latencySeconds : Math . max ( Date . now ( ) - llmStartedAt , 0 ) / 1000 ,
631+ modelId,
632+ provider,
633+ sessionId : parsedBody . posthogSessionId ?? undefined ,
634+ spanId,
635+ timeToFirstQuestionSeconds :
636+ firstQuestionAt === null
637+ ? undefined
638+ : Math . max ( firstQuestionAt - llmStartedAt , 0 ) / 1000 ,
639+ traceId,
640+ userPrompt,
641+ } ) ;
403642 } ,
404643 } ) ;
405644
0 commit comments