Skip to content

Commit a1e2a2f

Browse files
mike-inkeep, github-actions[bot], amikofalvy, and claude
authored
feat(pdf): Support PDF attachments (#2709)
* feat(pdf): Support PDF attachments
* Add tests and other review feedback
* Fix doc
* More renaming and cleanup
* refactor: extract Vercel content part schemas to types/chat.ts for reuse

  Move inline Zod schemas from chatDataStream.ts and message-parts.ts into types/chat.ts as shared, exported schemas. This eliminates duplicate definitions and makes schema management easier.

  Co-authored-by: Andrew Mikofalvy <amikofalvy@users.noreply.github.com>
  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Andrew Mikofalvy <amikofalvy@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent eb3a4bb commit a1e2a2f

30 files changed

+1020
-535
lines changed

agents-api/__snapshots__/openapi.json

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39186,6 +39186,33 @@
3918639186
],
3918739187
"type": "object"
3918839188
},
39189+
{
39190+
"properties": {
39191+
"filename": {
39192+
"type": "string"
39193+
},
39194+
"mediaType": {
39195+
"type": "string"
39196+
},
39197+
"mimeType": {
39198+
"type": "string"
39199+
},
39200+
"text": {
39201+
"type": "string"
39202+
},
39203+
"type": {
39204+
"enum": [
39205+
"file"
39206+
],
39207+
"type": "string"
39208+
}
39209+
},
39210+
"required": [
39211+
"type",
39212+
"text"
39213+
],
39214+
"type": "object"
39215+
},
3918939216
{
3919039217
"properties": {
3919139218
"text": {
@@ -39196,8 +39223,7 @@
3919639223
{
3919739224
"enum": [
3919839225
"audio",
39199-
"video",
39200-
"file"
39226+
"video"
3920139227
],
3920239228
"type": "string"
3920339229
},
@@ -39851,6 +39877,36 @@
3985139877
"image_url"
3985239878
],
3985339879
"type": "object"
39880+
},
39881+
{
39882+
"properties": {
39883+
"file": {
39884+
"properties": {
39885+
"file_data": {
39886+
"pattern": "^data:application\\/pdf;base64,",
39887+
"type": "string"
39888+
},
39889+
"filename": {
39890+
"type": "string"
39891+
}
39892+
},
39893+
"required": [
39894+
"file_data"
39895+
],
39896+
"type": "object"
39897+
},
39898+
"type": {
39899+
"enum": [
39900+
"file"
39901+
],
39902+
"type": "string"
39903+
}
39904+
},
39905+
"required": [
39906+
"type",
39907+
"file"
39908+
],
39909+
"type": "object"
3985439910
}
3985539911
]
3985639912
},

agents-api/src/__tests__/run/agents/Agent.test.ts

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ vi.mock('../../../domains/run/session/AgentSession.js', () => ({
262262
},
263263
}));
264264

265-
vi.mock('../../../domains/run/services/blob-storage/image-upload-helpers', () => ({
265+
vi.mock('../../../domains/run/services/blob-storage/file-upload-helpers', () => ({
266266
buildPersistedMessageContent: buildPersistedMessageContentMock,
267267
}));
268268

@@ -1512,6 +1512,40 @@ describe('Agent Image Support', () => {
15121512
})
15131513
);
15141514
});
1515+
1516+
test('passes inline PDF data to generateText as file content', async () => {
1517+
const agent = new Agent(mockAgentConfig, mockExecutionContext);
1518+
const pdfBytes = Buffer.from(
1519+
'%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n',
1520+
'utf8'
1521+
).toString('base64');
1522+
1523+
await agent.generate([
1524+
{ kind: 'text', text: 'Summarize this PDF' },
1525+
{
1526+
kind: 'file',
1527+
file: {
1528+
bytes: pdfBytes,
1529+
mimeType: 'application/pdf',
1530+
},
1531+
},
1532+
]);
1533+
1534+
const { generateText } = await import('ai');
1535+
expect(generateText).toHaveBeenCalledWith(
1536+
expect.objectContaining({
1537+
messages: expect.arrayContaining([
1538+
expect.objectContaining({
1539+
role: 'user',
1540+
content: expect.arrayContaining([
1541+
expect.objectContaining({ type: 'text' }),
1542+
expect.objectContaining({ type: 'file', mediaType: 'application/pdf' }),
1543+
]),
1544+
}),
1545+
]),
1546+
})
1547+
);
1548+
});
15151549
});
15161550

15171551
describe('Agent tool result persistence', () => {

agents-api/src/__tests__/run/routes/chat.test.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,34 @@ describe('Chat Routes', () => {
258258
expect(response.headers.get('content-type')).toBe('text/event-stream');
259259
});
260260

261+
it('should accept inline PDF content item in OpenAI-style messages', async () => {
262+
const response = await makeRequest('/run/v1/chat/completions', {
263+
method: 'POST',
264+
body: JSON.stringify({
265+
model: 'claude-3-sonnet',
266+
messages: [
267+
{
268+
role: 'user',
269+
content: [
270+
{ type: 'text', text: 'Summarize this document' },
271+
{
272+
type: 'file',
273+
file: {
274+
file_data: 'data:application/pdf;base64,JVBERi0xLjQK',
275+
filename: 'document.pdf',
276+
},
277+
},
278+
],
279+
},
280+
],
281+
conversationId: 'conv-123',
282+
}),
283+
});
284+
285+
expect(response.status).toBe(200);
286+
expect(response.headers.get('content-type')).toBe('text/event-stream');
287+
});
288+
261289
it('should handle conversation creation', async () => {
262290
const response = await makeRequest('/run/v1/chat/completions', {
263291
method: 'POST',

agents-api/src/__tests__/run/routes/chat/dataChat.test.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,34 @@ describe('Chat Data Stream Route', () => {
193193
expect(text).toMatch(/response/);
194194
});
195195

196+
it('should accept inline PDF file part in Vercel messages format', async () => {
197+
const body = {
198+
messages: [
199+
{
200+
role: 'user',
201+
content: 'Summarize this PDF',
202+
parts: [
203+
{ type: 'text', text: 'Summarize this PDF' },
204+
{
205+
type: 'file',
206+
text: 'data:application/pdf;base64,JVBERi0xLjQK',
207+
mediaType: 'application/pdf',
208+
filename: 'doc.pdf',
209+
},
210+
],
211+
},
212+
],
213+
};
214+
215+
const res = await makeRequest('/run/api/chat', {
216+
method: 'POST',
217+
body: JSON.stringify(body),
218+
});
219+
220+
expect(res.status).toBe(200);
221+
expect(res.headers.get('x-vercel-ai-data-stream')).toBe('v2');
222+
});
223+
196224
it('should stream approval UI events published to ToolApprovalUiBus (simulating delegated agent approval)', async () => {
197225
// Ensure deterministic requestId inside route subscription (chatds-${Date.now()})
198226
const nowSpy = vi.spyOn(Date, 'now').mockReturnValue(12345);

agents-api/src/domains/run/agents/agent-types.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,14 @@ export type AiSdkImagePart = {
3333
experimental_providerMetadata?: { openai?: { imageDetail?: ImageDetail } };
3434
};
3535

36-
export type AiSdkContentPart = AiSdkTextPart | AiSdkImagePart;
36+
export type AiSdkFilePart = {
37+
type: 'file';
38+
data: string | URL;
39+
mediaType: string;
40+
filename?: string;
41+
};
42+
43+
export type AiSdkContentPart = AiSdkTextPart | AiSdkImagePart | AiSdkFilePart;
3744

3845
/**
3946
* Creates a stopWhen condition that stops when any tool call name starts with the given prefix

agents-api/src/domains/run/agents/generation/conversation-history.ts

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import type { FilePart } from '@inkeep/agents-core';
2+
import { normalizeMimeType } from '@inkeep/agents-core/constants/allowed-file-formats';
3+
import { getLogger } from '../../../../logger';
24
import {
35
createDefaultConversationHistoryConfig,
46
getConversationHistoryWithCompression,
@@ -11,6 +13,36 @@ import {
1113
import type { AgentRunContext, AiSdkContentPart } from '../agent-types';
1214
import { getPrimaryModel, getSummarizerModel } from './model-config';
1315

16+
const PDF_MEDIA_TYPE = 'application/pdf';
17+
const logger = getLogger('conversation-history');
18+
19+
function mapFileToAiSdkContentPart(
20+
fileValue: string | URL,
21+
mimeType: string,
22+
metadata: { detail?: string; filename?: string } | undefined
23+
): AiSdkContentPart | null {
24+
if (mimeType.startsWith('image/')) {
25+
return {
26+
type: 'image',
27+
image: fileValue,
28+
...(metadata?.detail && {
29+
experimental_providerMetadata: { openai: { imageDetail: metadata.detail as any } },
30+
}),
31+
};
32+
}
33+
34+
if (mimeType === PDF_MEDIA_TYPE) {
35+
return {
36+
type: 'file',
37+
data: fileValue,
38+
mediaType: PDF_MEDIA_TYPE,
39+
...(metadata?.filename ? { filename: metadata.filename } : {}),
40+
};
41+
}
42+
43+
return null;
44+
}
45+
1446
export async function buildConversationHistory(
1547
ctx: AgentRunContext,
1648
contextId: string,
@@ -81,7 +113,7 @@ export function buildInitialMessages(
81113
systemPrompt: string,
82114
conversationHistory: string,
83115
userMessage: string,
84-
imageParts?: FilePart[]
116+
fileParts?: FilePart[]
85117
): any[] {
86118
const messages: any[] = [];
87119
messages.push({ role: 'system', content: systemPrompt });
@@ -90,7 +122,7 @@ export function buildInitialMessages(
90122
messages.push({ role: 'user', content: conversationHistory });
91123
}
92124

93-
const userContent = buildUserMessageContent(userMessage, imageParts);
125+
const userContent = buildUserMessageContent(userMessage, fileParts);
94126
messages.push({
95127
role: 'user',
96128
content: userContent,
@@ -101,30 +133,31 @@ export function buildInitialMessages(
101133

102134
export function buildUserMessageContent(
103135
text: string,
104-
imageParts?: FilePart[]
136+
fileParts?: FilePart[]
105137
): string | AiSdkContentPart[] {
106-
if (!imageParts || imageParts.length === 0) {
138+
if (!fileParts || fileParts.length === 0) {
107139
return text;
108140
}
109141

110142
const content: AiSdkContentPart[] = [{ type: 'text', text }];
111143

112-
for (const part of imageParts) {
144+
for (const part of fileParts) {
113145
const file = part.file;
114-
const imageValue =
146+
const fileValue =
115147
'uri' in file && file.uri
116148
? new URL(file.uri)
117-
: `data:${file.mimeType || 'image/*'};base64,${file.bytes}`;
118-
119-
const imagePart: AiSdkContentPart = {
120-
type: 'image',
121-
image: imageValue,
122-
...(part.metadata?.detail && {
123-
experimental_providerMetadata: { openai: { imageDetail: part.metadata.detail } },
124-
}),
125-
};
126-
127-
content.push(imagePart);
149+
: `data:${file.mimeType || ''};base64,${file.bytes}`;
150+
const mimeType = normalizeMimeType(file.mimeType ?? '');
151+
const mappedPart = mapFileToAiSdkContentPart(fileValue, mimeType, {
152+
detail: typeof part.metadata?.detail === 'string' ? part.metadata.detail : undefined,
153+
filename: typeof part.metadata?.filename === 'string' ? part.metadata.filename : undefined,
154+
});
155+
156+
if (mappedPart) {
157+
content.push(mappedPart);
158+
} else {
159+
logger.warn({ mimeType, source: 'user-message' }, 'Dropping unsupported file content part');
160+
}
128161
}
129162

130163
return content;

agents-api/src/domains/run/agents/generation/generate.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,7 @@ export async function runGenerate(
149149
.join('')
150150
: '';
151151
const userMessage = `${textParts}${dataContext}`;
152-
const imageParts = userParts.filter(
153-
(part): part is FilePart =>
154-
part.kind === 'file' && part.file.mimeType?.startsWith('image/') === true
155-
);
152+
const fileParts = userParts.filter((part): part is FilePart => part.kind === 'file');
156153
const conversationIdForSpan = runtimeContext?.metadata?.conversationId;
157154

158155
return tracer.startActiveSpan(
@@ -200,14 +197,21 @@ export async function runGenerate(
200197

201198
const { primaryModelSettings, modelSettings, hasStructuredOutput, timeoutMs } =
202199
configureModelSettings(ctx);
200+
const inlinePdfFileCount = fileParts.filter(
201+
(part) => part.file.mimeType?.toLowerCase().startsWith('application/pdf') === true
202+
).length;
203+
span.setAttributes({
204+
'input.file_count': fileParts.length,
205+
'input.pdf_file_count': inlinePdfFileCount,
206+
});
203207
let response: ResolvedGenerationResponse;
204208
let textResponse: string;
205209

206210
const messages = buildInitialMessages(
207211
systemPrompt,
208212
conversationHistory,
209213
userMessage,
210-
imageParts
214+
fileParts
211215
);
212216

213217
const { originalMessageCount, compressor } = setupCompression(

agents-api/src/domains/run/agents/generation/tool-result-for-conversation-history.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import type { MessageContent, Part } from '@inkeep/agents-core';
2-
import { makeMessageContentParts } from '../../services/blob-storage/image-upload';
3-
import { buildPersistedMessageContent } from '../../services/blob-storage/image-upload-helpers';
2+
import { makeMessageContentParts } from '../../services/blob-storage/file-upload';
3+
import { buildPersistedMessageContent } from '../../services/blob-storage/file-upload-helpers';
44
import { isToolResultDenied } from '../../utils/tool-result';
55
import type { AgentRunContext } from '../agent-types';
66

agents-api/src/domains/run/routes/chat.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ import { flushBatchProcessor } from '../../../instrumentation';
2020
import { getLogger } from '../../../logger';
2121
import { contextValidationMiddleware, handleContextResolution } from '../context';
2222
import { ExecutionHandler } from '../handlers/executionHandler';
23-
import { buildPersistedMessageContent } from '../services/blob-storage/image-upload-helpers';
23+
import { buildPersistedMessageContent } from '../services/blob-storage/file-upload-helpers';
2424
import { toolApprovalUiBus } from '../session/ToolApprovalUiBus';
2525
import { createSSEStreamHelper } from '../stream/stream-helpers';
2626
import type { Message } from '../types/chat';
27-
import { ImageContentItemSchema } from '../types/chat';
27+
import { FileContentItemSchema, ImageContentItemSchema } from '../types/chat';
2828
import { errorOp } from '../utils/agent-operations';
2929
import { extractTextFromParts, getMessagePartsFromOpenAIContent } from '../utils/message-parts';
3030

@@ -68,6 +68,7 @@ const chatCompletionsRoute = createProtectedRoute({
6868
text: z.string(),
6969
}),
7070
ImageContentItemSchema,
71+
FileContentItemSchema,
7172
])
7273
),
7374
])

0 commit comments

Comments
 (0)