diff --git a/apps/web/src/app/api/ai/chat/route.ts b/apps/web/src/app/api/ai/chat/route.ts index c6becc4fd..fe5506de2 100644 --- a/apps/web/src/app/api/ai/chat/route.ts +++ b/apps/web/src/app/api/ai/chat/route.ts @@ -856,6 +856,27 @@ export async function POST(request: Request) { }); } + // Guard: if truncation left zero messages, the system prompt + tools alone exceed the budget. + // Sending an empty conversation to the model would produce a meaningless response or error. + if (includedMessages.length === 0) { + loggers.ai.error('AI Chat API: Context budget exhausted by system prompt and tools alone', { + model: currentModel, + provider: currentProvider, + contextWindow, + inputBudget, + systemPromptTokens, + toolTokens, + }); + return NextResponse.json( + { + error: 'context_length_exceeded', + message: 'The system configuration (prompts and tools) exceeds this model\'s context window. Please switch to a model with a larger context window.', + details: 'context_length_exceeded', + }, + { status: 413 } + ); + } + const modelMessages = convertToModelMessages(includedMessages as UIMessage[], { tools: filteredTools // Use original tools - no wrapping needed }); @@ -1248,7 +1269,7 @@ export async function POST(request: Request) { error: 'context_length_exceeded', message: wasTruncated ? 'The conversation still exceeds this model\'s context window even after trimming. Please start a new conversation.' - : 'The conversation is too long for this model\'s context window. Older messages have been trimmed — try sending your message again.', + : 'The conversation is too long for this model\'s context window. 
Please start a new conversation or switch to a model with a larger context window.', details: 'context_length_exceeded', }, { status: 413 } diff --git a/apps/web/src/lib/ai/shared/__tests__/error-messages.test.ts b/apps/web/src/lib/ai/shared/__tests__/error-messages.test.ts new file mode 100644 index 000000000..fd00dff04 --- /dev/null +++ b/apps/web/src/lib/ai/shared/__tests__/error-messages.test.ts @@ -0,0 +1,117 @@ +import { describe, it, expect } from 'vitest'; +import { + getAIErrorMessage, + isContextLengthError, + isRateLimitError, + isAuthenticationError, +} from '../error-messages'; + +describe('isContextLengthError', () => { + it('returns false for undefined/empty input', () => { + expect(isContextLengthError(undefined)).toBe(false); + expect(isContextLengthError('')).toBe(false); + }); + + it('detects context_length_exceeded API key', () => { + expect(isContextLengthError('context_length_exceeded')).toBe(true); + expect(isContextLengthError('Error: context_length_exceeded for model gpt-4o')).toBe(true); + }); + + it('detects human-readable context length variants', () => { + expect(isContextLengthError('context length exceeded')).toBe(true); + expect(isContextLengthError('exceeds the context window')).toBe(true); + expect(isContextLengthError('maximum context length is 128000 tokens')).toBe(true); + }); + + it('detects token limit errors', () => { + expect(isContextLengthError('token limit exceeded')).toBe(true); + expect(isContextLengthError('number of tokens exceeds the maximum')).toBe(true); + expect(isContextLengthError('too many tokens in the request')).toBe(true); + }); + + it('detects provider-specific "maximum tokens" phrasing', () => { + expect(isContextLengthError('maximum number of tokens allowed is 200000')).toBe(true); + }); + + it('detects HTTP 413 in status-code patterns only', () => { + expect(isContextLengthError('HTTP 413')).toBe(true); + expect(isContextLengthError('status 413')).toBe(true); + expect(isContextLengthError('error 413: 
payload too large')).toBe(true); + expect(isContextLengthError('code 413')).toBe(true); + }); + + it('does NOT false-positive on bare "413" in other contexts', () => { + expect(isContextLengthError('processed 413 items successfully')).toBe(false); + expect(isContextLengthError('user ID 4130 not found')).toBe(false); + expect(isContextLengthError('port 4135 is in use')).toBe(false); + }); + + it('does NOT match unrelated error messages', () => { + expect(isContextLengthError('rate limit exceeded')).toBe(false); + expect(isContextLengthError('Unauthorized')).toBe(false); + expect(isContextLengthError('Internal server error')).toBe(false); + expect(isContextLengthError('Provider returned error')).toBe(false); + }); +}); + +describe('isRateLimitError', () => { + it('returns false for undefined/empty input', () => { + expect(isRateLimitError(undefined)).toBe(false); + expect(isRateLimitError('')).toBe(false); + }); + + it('detects rate limit errors', () => { + expect(isRateLimitError('rate limit exceeded')).toBe(true); + expect(isRateLimitError('429 Too Many Requests')).toBe(true); + expect(isRateLimitError('402 Payment Required')).toBe(true); + expect(isRateLimitError('Failed after 3 retries')).toBe(true); + expect(isRateLimitError('Provider returned error')).toBe(true); + }); + + it('excludes context-length errors that contain "limit"', () => { + expect(isRateLimitError('token limit exceeded')).toBe(false); + expect(isRateLimitError('context_length_exceeded')).toBe(false); + expect(isRateLimitError('maximum context length limit')).toBe(false); + }); +}); + +describe('isAuthenticationError', () => { + it('detects auth errors', () => { + expect(isAuthenticationError('Unauthorized')).toBe(true); + expect(isAuthenticationError('401 Unauthorized')).toBe(true); + }); + + it('returns false for non-auth errors', () => { + expect(isAuthenticationError(undefined)).toBe(false); + expect(isAuthenticationError('rate limit exceeded')).toBe(false); + }); +}); + 
+describe('getAIErrorMessage', () => { + it('returns generic message for undefined input', () => { + expect(getAIErrorMessage(undefined)).toBe('Something went wrong. Please try again.'); + }); + + it('returns auth message for Unauthorized errors', () => { + expect(getAIErrorMessage('Unauthorized')).toBe( + 'Authentication failed. Please refresh the page and try again.' + ); + }); + + it('returns context-length message for context errors', () => { + const msg = getAIErrorMessage('context_length_exceeded'); + expect(msg).toContain('context window'); + expect(msg).not.toContain('trimmed'); + }); + + it('returns rate-limit message for rate errors', () => { + const msg = getAIErrorMessage('429 Too Many Requests'); + expect(msg).toContain('rate limit'); + }); + + it('returns generic message for unknown errors', () => { + expect(getAIErrorMessage('some random error')).toBe( + 'Something went wrong. Please try again.' + ); + }); +}); diff --git a/apps/web/src/lib/ai/shared/error-messages.ts b/apps/web/src/lib/ai/shared/error-messages.ts index 4d78199a7..c375e578e 100644 --- a/apps/web/src/lib/ai/shared/error-messages.ts +++ b/apps/web/src/lib/ai/shared/error-messages.ts @@ -15,7 +15,7 @@ export function getAIErrorMessage(errorMessage: string | undefined): string { // Context length errors if (isContextLengthError(errorMessage)) { - return 'The conversation is too long for this model\'s context window. Older messages have been trimmed to fit — try sending your message again.'; + return 'The conversation is too long for this model\'s context window. Please start a new conversation or switch to a model with a larger context window.'; } // Rate limit errors @@ -55,7 +55,8 @@ export function isContextLengthError(errorMessage: string | undefined): boolean msg.includes('token limit') || msg.includes('tokens exceeds') || msg.includes('too many tokens') || - errorMessage.includes('413') || + // Match HTTP 413 only in status-code patterns (e.g. 
"status 413", "HTTP 413", "code 413") + /\b(?:status|http|code|error)\s*413\b/i.test(errorMessage) || // OpenRouter / provider-specific phrasing (msg.includes('maximum') && msg.includes('tokens')) ); diff --git a/packages/lib/src/monitoring/__tests__/ai-context-calculator.test.ts b/packages/lib/src/monitoring/__tests__/ai-context-calculator.test.ts new file mode 100644 index 000000000..fadf6423c --- /dev/null +++ b/packages/lib/src/monitoring/__tests__/ai-context-calculator.test.ts @@ -0,0 +1,208 @@ +import { describe, it, expect } from 'vitest'; +import { + estimateTokens, + estimateMessageTokens, + estimateSystemPromptTokens, + estimateToolDefinitionTokens, + getContextWindowSize, + determineMessagesToInclude, + type UIMessage, +} from '../ai-context-calculator'; + +describe('estimateTokens', () => { + it('returns 0 for empty/falsy input', () => { + expect(estimateTokens('')).toBe(0); + expect(estimateTokens(null as unknown as string)).toBe(0); + expect(estimateTokens(undefined as unknown as string)).toBe(0); + }); + + it('estimates ~4 chars/token for ASCII text', () => { + const text = 'Hello, world! 
This is a test.'; + const tokens = estimateTokens(text); + // 28 chars / 4 = 7 + expect(tokens).toBe(Math.ceil(text.length / 4)); + }); + + it('uses ~2 chars/token for CJK-heavy text (>20% non-ASCII)', () => { + const text = '这是一个测试消息'; // 8 CJK characters, 100% non-ASCII + const tokens = estimateTokens(text); + expect(tokens).toBe(Math.ceil(text.length / 2)); + }); + + it('uses 4 chars/token for mostly-ASCII text with minor non-ASCII', () => { + // 80 ASCII chars + 5 non-ASCII = 5/85 ≈ 5.9% non-ASCII → use 4 chars/token + const text = 'a'.repeat(80) + '你好世界呢'; + const tokens = estimateTokens(text); + expect(tokens).toBe(Math.ceil(text.length / 4)); + }); + + it('switches to 2 chars/token at >20% non-ASCII threshold', () => { + // 3 ASCII + 1 CJK = 25% non-ASCII (above 20% threshold) + const text = 'abc你'; + const tokens = estimateTokens(text); + expect(tokens).toBe(Math.ceil(text.length / 2)); + }); + + it('handles emoji as non-ASCII', () => { + // Emoji are multi-byte, charCodeAt > 127 for surrogate pairs + const text = 'Hello 🌍🌍🌍🌍🌍'; // mixed with emoji + const tokens = estimateTokens(text); + expect(tokens).toBeGreaterThan(0); + }); +}); + +describe('estimateMessageTokens', () => { + it('returns overhead tokens for empty message', () => { + const msg: UIMessage = { role: 'user', parts: [] }; + // 5 (role) + 10 (overhead) = 15 + expect(estimateMessageTokens(msg)).toBe(15); + }); + + it('counts text parts', () => { + const msg: UIMessage = { + role: 'user', + parts: [{ type: 'text', text: 'Hello, world!' 
}], + }; + const tokens = estimateMessageTokens(msg); + // 5 (role) + estimateTokens('Hello, world!') + 10 (overhead) + expect(tokens).toBe(5 + estimateTokens('Hello, world!') + 10); + }); + + it('counts tool invocation parts (tool-{name} format)', () => { + const msg: UIMessage = { + role: 'assistant', + parts: [ + { + type: 'tool-search', + toolCallId: 'call_123', + toolName: 'search', + input: { query: 'test' }, + output: { results: ['a', 'b'] }, + state: 'output-available', + }, + ], + }; + const tokens = estimateMessageTokens(msg); + // Should include: 5 (role) + 10 (tool overhead) + toolName + input + output + 10 (msg overhead) + expect(tokens).toBeGreaterThan(25); + }); + + it('handles legacy tool-call/tool-result format via startsWith("tool-")', () => { + const msg: UIMessage = { + role: 'assistant', + parts: [ + { + type: 'tool-myTool', + toolCallId: 'call_456', + toolName: 'myTool', + args: { param: 'value' }, + result: 'done', + }, + ], + }; + const tokens = estimateMessageTokens(msg); + // args falls through to input ?? 
args path + expect(tokens).toBeGreaterThan(15); + }); + + it('handles message with no parts', () => { + const msg: UIMessage = { role: 'user' }; + expect(estimateMessageTokens(msg)).toBe(15); + }); +}); + +describe('estimateSystemPromptTokens', () => { + it('returns 0 for empty prompt', () => { + expect(estimateSystemPromptTokens(undefined)).toBe(0); + expect(estimateSystemPromptTokens('')).toBe(0); + }); + + it('estimates tokens for a prompt', () => { + const prompt = 'You are a helpful assistant.'; + expect(estimateSystemPromptTokens(prompt)).toBe(estimateTokens(prompt)); + }); +}); + +describe('estimateToolDefinitionTokens', () => { + it('returns 0 for empty tools', () => { + expect(estimateToolDefinitionTokens(undefined)).toBe(0); + expect(estimateToolDefinitionTokens({})).toBe(0); + }); + + it('estimates tokens from JSON serialization', () => { + const tools = { search: { description: 'Search the web', parameters: { query: 'string' } } }; + const tokens = estimateToolDefinitionTokens(tools); + expect(tokens).toBeGreaterThan(0); + expect(tokens).toBe(estimateTokens(JSON.stringify(tools))); + }); +}); + +describe('getContextWindowSize', () => { + it('returns correct size for known direct models', () => { + expect(getContextWindowSize('gpt-4o', 'openai')).toBe(128_000); + expect(getContextWindowSize('gpt-4', 'openai')).toBe(8_192); + }); + + it('returns correct size for Anthropic models', () => { + const size = getContextWindowSize('claude-3-5-sonnet-20241022', 'anthropic'); + expect(size).toBe(200_000); + }); + + it('returns correct size for Google models', () => { + expect(getContextWindowSize('gemini-2.5-pro', 'google')).toBe(2_000_000); + expect(getContextWindowSize('gemini-2.5-flash', 'google')).toBe(1_000_000); + }); + + it('returns conservative default for unknown models', () => { + expect(getContextWindowSize('unknown-model', 'unknown-provider')).toBe(200_000); + }); + + it('handles OpenRouter models', () => { + const size = 
getContextWindowSize('anthropic/claude-3.5-sonnet', 'openrouter'); + expect(size).toBe(200_000); + }); +}); + +describe('determineMessagesToInclude', () => { + const makeMsg = (text: string, role: 'user' | 'assistant' = 'user'): UIMessage => ({ + id: text, + role, + parts: [{ type: 'text', text }], + }); + + it('includes all messages when they fit in budget', () => { + const messages = [makeMsg('Hello'), makeMsg('World')]; + const result = determineMessagesToInclude(messages, 10_000, 100, 100); + expect(result.includedMessages).toHaveLength(2); + expect(result.wasTruncated).toBe(false); + }); + + it('truncates oldest messages first when budget is tight', () => { + const messages = [ + makeMsg('a'.repeat(1000)), // ~250 tokens + overhead + makeMsg('b'.repeat(1000)), // ~250 tokens + overhead + makeMsg('c'.repeat(100)), // ~25 tokens + overhead + ]; + // Budget of 100 tokens for messages (after system/tool subtracted) + const result = determineMessagesToInclude(messages, 200, 50, 50); + expect(result.wasTruncated).toBe(true); + expect(result.includedMessages.length).toBeLessThan(3); + // Most recent message should be included + if (result.includedMessages.length > 0) { + expect(result.includedMessages[result.includedMessages.length - 1].id).toBe('c'.repeat(100)); + } + }); + + it('returns empty array when budget is zero or negative', () => { + const messages = [makeMsg('test')]; + const result = determineMessagesToInclude(messages, 100, 60, 60); + expect(result.includedMessages).toHaveLength(0); + expect(result.wasTruncated).toBe(true); + }); + + it('preserves message order', () => { + const messages = [makeMsg('first'), makeMsg('second'), makeMsg('third')]; + const result = determineMessagesToInclude(messages, 50_000, 0, 0); + expect(result.includedMessages.map(m => m.id)).toEqual(['first', 'second', 'third']); + }); +}); diff --git a/packages/lib/src/monitoring/ai-context-calculator.ts b/packages/lib/src/monitoring/ai-context-calculator.ts index 
940c16737..11729ed8f 100644 --- a/packages/lib/src/monitoring/ai-context-calculator.ts +++ b/packages/lib/src/monitoring/ai-context-calculator.ts @@ -5,9 +5,14 @@ * Tracks which messages are included in each API call to determine real context window usage. */ +import { MODEL_CONTEXT_WINDOWS } from './model-context-windows'; + /** * Minimal UIMessage type for token estimation * (Compatible with Vercel AI SDK UIMessage) + * + * Tool parts in the Vercel AI SDK use type `tool-{toolName}` (e.g. "tool-search") + * with fields: toolCallId, toolName, input, output, state. */ export interface UIMessage { id?: string; @@ -15,8 +20,13 @@ export interface UIMessage { parts?: Array<{ type: string; text?: string; + // Tool invocation fields (Vercel AI SDK DynamicToolUIPart) toolCallId?: string; toolName?: string; + input?: unknown; + output?: unknown; + state?: string; + // Legacy fields for backwards compatibility args?: unknown; result?: unknown; }>; @@ -55,9 +65,11 @@ export interface ContextCalculation { export function estimateTokens(text: string): number { if (!text) return 0; - // Detect non-ASCII heavy content (CJK, emoji, etc.) - // CJK Unified Ideographs, Hiragana, Katakana, Hangul, etc. 
- const nonAsciiCount = (text.match(/[^\x00-\x7F]/g) || []).length; + // Count non-ASCII characters efficiently via charCode loop (avoids regex array allocation) + let nonAsciiCount = 0; + for (let i = 0; i < text.length; i++) { + if (text.charCodeAt(i) > 127) nonAsciiCount++; + } const nonAsciiRatio = nonAsciiCount / text.length; // Use 2 chars/token when >20% non-ASCII (CJK-heavy), else 4 chars/token @@ -106,22 +118,24 @@ export function estimateMessageTokens(message: UIMessage): number { for (const part of message.parts) { if (part.type === 'text' && part.text) { tokens += estimateTokens(part.text); - } else if (part.type === 'tool-call' && part.toolCallId) { - // Tool call: function name + args - tokens += 10; // Tool call ID + } else if (part.type.startsWith('tool-')) { + // Vercel AI SDK tool invocation parts: type is "tool-{toolName}" + // Fields: toolCallId, toolName, input, output, state + tokens += 10; // Tool call ID overhead if (part.toolName) { tokens += estimateTokens(part.toolName); } - if (part.args) { - tokens += estimateTokens(JSON.stringify(part.args)); + // Input (tool arguments) + const inputData = part.input ?? part.args; + if (inputData) { + tokens += estimateTokens(JSON.stringify(inputData)); } - } else if (part.type === 'tool-result' && part.toolCallId) { - // Tool result: result data - tokens += 10; // Tool call ID - if (part.result) { - const resultStr = typeof part.result === 'string' - ? part.result - : JSON.stringify(part.result); + // Output (tool result) — only present when state is 'output-available' + const outputData = part.output ?? part.result; + if (outputData) { + const resultStr = typeof outputData === 'string' + ? 
outputData + : JSON.stringify(outputData); tokens += estimateTokens(resultStr); } } @@ -135,127 +149,129 @@ export function estimateMessageTokens(message: UIMessage): number { } /** - * Get context window size for a model - * Returns the maximum number of tokens the model can handle + * Get context window size for a model. + * + * Uses MODEL_CONTEXT_WINDOWS from ai-monitoring.ts as the canonical source of truth. + * Falls back to heuristic matching for models not in the map. */ export function getContextWindowSize(model: string, provider?: string): number { - const providerLower = provider?.toLowerCase() || ''; + // 1. Try exact match against the canonical map (imported at bottom of file to avoid circular deps) + const canonical = getCanonicalContextWindow(model, provider); + if (canonical !== undefined) return canonical; + + // 2. Heuristic fallbacks for models not in the canonical map const modelLower = model.toLowerCase(); + const providerLower = provider?.toLowerCase() || ''; // OpenAI models if (providerLower === 'openai' || modelLower.includes('gpt')) { - // GPT-5.2 models (400k/256k context) if (modelLower.includes('gpt-5.2')) { - if (modelLower.includes('mini') || modelLower.includes('nano')) { - return 256_000; - } - return 400_000; - } - // GPT-5.1 models (400k context) - if (modelLower.includes('gpt-5.1')) { - return 400_000; + return (modelLower.includes('mini') || modelLower.includes('nano')) ? 256_000 : 400_000; } - // GPT-5.0 models (272k/128k context) + if (modelLower.includes('gpt-5.1')) return 400_000; if (modelLower.includes('gpt-5')) { - if (modelLower.includes('mini') || modelLower.includes('nano')) { - return 128_000; - } - return 272_000; + return (modelLower.includes('mini') || modelLower.includes('nano')) ? 
128_000 : 272_000; } - if (modelLower.includes('gpt-4o')) return 128_000; - if (modelLower.includes('gpt-4-turbo')) return 128_000; + if (modelLower.includes('gpt-4o') || modelLower.includes('gpt-4-turbo')) return 128_000; if (modelLower.includes('gpt-4')) return 8_192; if (modelLower.includes('gpt-3.5')) return 16_385; - return 200_000; // Default for newer OpenAI models + return 200_000; } // Anthropic models if (providerLower === 'anthropic' || modelLower.includes('claude')) { - if (modelLower.includes('claude-sonnet-4') || modelLower.includes('claude-4')) { - return 200_000; - } - if (modelLower.includes('claude-3-5') || modelLower.includes('claude-3')) { - return 200_000; - } - return 200_000; // Default for Anthropic + return 200_000; } // Google models if (providerLower === 'google' || modelLower.includes('gemini')) { - if (modelLower.includes('gemini-2.5-pro') || modelLower.includes('gemini-2-5-pro')) { - return 2_000_000; - } - if (modelLower.includes('gemini-2.5-flash') || modelLower.includes('gemini-2-5-flash')) { - return 1_000_000; - } - if (modelLower.includes('gemini-2.0-pro') || modelLower.includes('gemini-2-pro')) { - return 2_000_000; - } - if (modelLower.includes('gemini-2.0-flash') || modelLower.includes('gemini-2-flash')) { - return 1_000_000; - } + if (modelLower.includes('gemini-2.5-pro') || modelLower.includes('gemini-2-5-pro')) return 2_000_000; + if (modelLower.includes('gemini-2.5-flash') || modelLower.includes('gemini-2-5-flash')) return 1_000_000; + if (modelLower.includes('gemini-2.0-pro') || modelLower.includes('gemini-2-pro')) return 2_000_000; + if (modelLower.includes('gemini-2.0-flash') || modelLower.includes('gemini-2-flash')) return 1_000_000; if (modelLower.includes('gemini-1.5-pro')) return 2_000_000; if (modelLower.includes('gemini-1.5-flash')) return 1_000_000; if (modelLower.includes('gemini-pro')) return 32_000; - return 1_000_000; // Default for Google + return 1_000_000; } // xAI models if (providerLower === 'xai' || 
modelLower.includes('grok')) { if (modelLower.includes('grok-4-fast')) return 2_000_000; - if (modelLower.includes('grok')) return 128_000; return 128_000; } // PageSpace (GLM models) if (providerLower === 'pagespace' || modelLower.includes('glm')) { - if (modelLower.includes('glm-5')) return 200_000; - if (modelLower.includes('glm-4.7')) return 200_000; - if (modelLower.includes('glm-4.6')) return 200_000; if (modelLower.includes('glm-4.5')) return 128_000; - return 200_000; // Updated default for GLM + return 200_000; } // MiniMax models if (providerLower === 'minimax' || modelLower.includes('minimax')) { - if (modelLower.includes('m2.5')) return 1_000_000; - return 128_000; // Default for older MiniMax models + return 128_000; } - // OpenRouter - use model-specific limits where known, else 200k conservative default - if (providerLower === 'openrouter') { - // Claude models via OpenRouter - if (modelLower.includes('claude')) return 200_000; - // Gemini models via OpenRouter - if (modelLower.includes('gemini-2.5')) return 1_000_000; - if (modelLower.includes('gemini-2.0') || modelLower.includes('gemini-1.5')) return 1_000_000; - // GPT models via OpenRouter - if (modelLower.includes('gpt-5.2')) { - return modelLower.includes('mini') || modelLower.includes('nano') ? 256_000 : 400_000; - } - if (modelLower.includes('gpt-5.1')) return 400_000; - if (modelLower.includes('gpt-5')) { - return modelLower.includes('mini') || modelLower.includes('nano') ? 128_000 : 272_000; + // Unknown provider/model - conservative default + return 200_000; +} + +/** + * Attempt exact lookup in MODEL_CONTEXT_WINDOWS from ai-monitoring. + * Tries the model directly, then with provider prefix (e.g. "openai/gpt-5"). + */ +function getCanonicalContextWindow(model: string, provider?: string): number | undefined { + const windows = MODEL_CONTEXT_WINDOWS as Record<string, number>; + + // Direct match (e.g. 
"gpt-5.2" or "anthropic/claude-opus-4.5") + if (windows[model] !== undefined) { + return windows[model]; + } + + // Try with provider prefix (e.g. provider="openrouter", model="gpt-5.2" → "openai/gpt-5.2") + if (provider) { + const providerPrefixes = getProviderPrefixes(provider, model); + for (const prefix of providerPrefixes) { + const key = `${prefix}/${model}`; + if (windows[key] !== undefined) { + return windows[key]; + } } - if (modelLower.includes('gpt-4o') || modelLower.includes('gpt-4-turbo')) return 128_000; - // DeepSeek models - commonly 64k or 128k - if (modelLower.includes('deepseek-r1') || modelLower.includes('deepseek-v3')) return 128_000; - if (modelLower.includes('deepseek')) return 64_000; - // Qwen models - if (modelLower.includes('qwen-2.5') || modelLower.includes('qwq')) return 128_000; - if (modelLower.includes('qwen')) return 32_000; - // Llama models - if (modelLower.includes('llama-3') || modelLower.includes('llama3')) return 128_000; - if (modelLower.includes('llama')) return 32_000; - // Mistral models - if (modelLower.includes('mistral-large') || modelLower.includes('mistral-nemo')) return 128_000; - if (modelLower.includes('mistral')) return 32_000; - // OpenRouter platform hard cap is 400k for many endpoints - use 200k as safe default - return 200_000; } - // Unknown provider/model - conservative default - return 200_000; + return undefined; +} + +/** + * Map provider/model to possible MODEL_CONTEXT_WINDOWS key prefixes + */ +function getProviderPrefixes(provider: string, model: string): string[] { + const p = provider.toLowerCase(); + const m = model.toLowerCase(); + + if (p === 'openai') return ['openai']; + if (p === 'anthropic') return ['anthropic']; + if (p === 'google') return ['google']; + if (p === 'xai') return ['x-ai']; + if (p === 'minimax') return ['minimax']; + if (p === 'pagespace') return ['z-ai']; + + // OpenRouter: model strings already include the provider prefix (e.g. 
"anthropic/claude-3.5-sonnet") + // but some may be bare model names — try common prefixes based on model name + if (p === 'openrouter') { + if (m.includes('claude')) return ['anthropic']; + if (m.includes('gpt') || m.includes('o3') || m.includes('o4') || m.includes('o1')) return ['openai']; + if (m.includes('gemini')) return ['google']; + if (m.includes('grok')) return ['x-ai']; + if (m.includes('llama')) return ['meta-llama']; + if (m.includes('mistral') || m.includes('codestral') || m.includes('devstral')) return ['mistralai']; + if (m.includes('deepseek')) return ['deepseek']; + if (m.includes('qwen') || m.includes('qwq')) return ['qwen']; + if (m.includes('minimax')) return ['minimax']; + if (m.includes('glm')) return ['z-ai']; + } + + return []; } /** diff --git a/packages/lib/src/monitoring/ai-monitoring.ts b/packages/lib/src/monitoring/ai-monitoring.ts index ea7fa40de..57be6421e 100644 --- a/packages/lib/src/monitoring/ai-monitoring.ts +++ b/packages/lib/src/monitoring/ai-monitoring.ts @@ -6,6 +6,8 @@ import { db, aiUsageLogs, sql, and, eq, gte, lte } from '@pagespace/db'; import { writeAiUsage } from '../logging/logger-database'; import { loggers } from '../logging/logger-config'; +import { MODEL_CONTEXT_WINDOWS, getContextWindow } from './model-context-windows'; +export { MODEL_CONTEXT_WINDOWS, getContextWindow }; /** * AI Provider Pricing (per 1M tokens) @@ -188,198 +190,9 @@ export const AI_PRICING = { 'default': { input: 0, output: 0 } } as const; -/** - * Model Context Window Sizes (in tokens) - * Maximum context length for each model - * Updated November 2025 - */ -export const MODEL_CONTEXT_WINDOWS = { - // OpenRouter Models - Anthropic - 'anthropic/claude-opus-4.5': 200000, - 'anthropic/claude-sonnet-4.5': 200000, - 'anthropic/claude-haiku-4.5': 200000, - 'anthropic/claude-3.5-sonnet': 200000, - 'anthropic/claude-3-haiku': 200000, - 'anthropic/claude-opus-4.1': 200000, - - // OpenRouter Models - OpenAI - 'openai/gpt-5.2': 400000, - 
'openai/gpt-5.2-codex': 400000, - 'openai/gpt-5.2-mini': 256000, - 'openai/gpt-5.2-nano': 256000, - 'openai/gpt-5.1': 400000, - 'openai/gpt-5.1-codex': 400000, - 'openai/gpt-5.1-codex-mini': 400000, - 'openai/gpt-4o': 128000, - 'openai/gpt-4o-mini': 128000, - 'openai/o3-deep-research': 200000, - 'openai/o4-mini-deep-research': 200000, - 'openai/gpt-5': 272000, - 'openai/gpt-5-mini': 128000, - 'openai/gpt-5-nano': 128000, - 'openai/gpt-oss-120b': 128000, - 'openai/gpt-oss-20b': 128000, - - // OpenRouter Models - Other - 'meta-llama/llama-3.1-405b-instruct': 128000, - 'mistralai/mistral-medium-3.1': 128000, - 'mistralai/mistral-small-3.2-24b-instruct': 32000, - 'mistralai/codestral-2508': 32000, - 'mistralai/devstral-medium': 128000, - 'mistralai/devstral-small': 128000, - - // OpenRouter Models - Google - 'google/gemini-3-pro-preview': 1048576, - 'google/gemini-3-flash-preview': 1048576, - 'google/gemini-2.5-pro': 2000000, - 'google/gemini-2.5-flash': 1000000, - 'google/gemini-2.5-flash-lite': 1000000, - 'google/gemini-2.5-flash-lite-preview-06-17': 1000000, - 'google/gemini-2.0-pro': 2000000, - 'google/gemini-2.0-flash': 1000000, - - // OpenRouter Models - Chinese/Asian - 'z-ai/glm-4.7': 200000, - 'z-ai/glm-4.5v': 128000, - 'z-ai/glm-4.5': 128000, - 'z-ai/glm-4.5-air': 128000, - 'z-ai/glm-4-32b': 128000, - 'qwen/qwen3-max': 128000, - 'qwen/qwen3-235b-a22b-thinking-2507': 128000, - 'qwen/qwen3-235b-a22b-2507': 128000, - 'qwen/qwen3-coder': 128000, - 'moonshotai/kimi-k2': 128000, - 'minimax/minimax-m1': 128000, - 'z-ai/glm-5': 202752, - 'minimax/minimax-m2.5': 204800, - - // OpenRouter Models - DeepSeek - 'deepseek/deepseek-v3.1-terminus': 128000, - - // OpenRouter Models - AI21 - 'ai21/jamba-mini-1.7': 256000, - 'ai21/jamba-large-1.7': 256000, - - // OpenRouter Models - xAI - 'x-ai/grok-4-fast': 2000000, - 'x-ai/grok-4': 128000, - - // OpenRouter Models - Other - 'inception/mercury': 128000, - - // Google AI Direct Models - 'gemini-3-pro': 1048576, - 
'gemini-3-flash-preview': 1048576, - 'gemini-2.5-pro': 2000000, - 'gemini-2.5-flash': 1000000, - 'gemini-2.5-flash-lite': 1000000, - 'gemini-2.0-pro-exp': 2000000, - 'gemini-2.0-flash': 1000000, - 'gemini-2.0-flash-exp': 1000000, - 'gemini-2.0-flash-lite': 1000000, - 'gemini-1.5-flash': 1000000, - 'gemini-1.5-flash-8b': 1000000, - 'gemini-1.5-pro': 2000000, - - // OpenAI Direct Models - 'gpt-5.2': 400000, - 'gpt-5.2-codex': 400000, - 'gpt-5.2-mini': 256000, - 'gpt-5.2-nano': 256000, - 'gpt-5.1': 400000, - 'gpt-5.1-codex': 400000, - 'gpt-5': 272000, - 'gpt-5-mini': 128000, - 'gpt-5-nano': 128000, - 'gpt-4.1-2025-04-14': 400000, - 'gpt-4.1-mini-2025-04-14': 400000, - 'gpt-4.1-nano-2025-04-14': 400000, - 'gpt-4o': 128000, - 'gpt-4o-mini': 128000, - 'gpt-4o-audio-preview': 128000, - 'gpt-4-turbo': 128000, - 'gpt-4': 8192, - 'gpt-3.5-turbo': 16385, - 'o4-mini-2025-04-16': 200000, - 'o3': 200000, - 'o3-mini': 200000, - 'o1': 200000, - 'o1-mini': 200000, - 'o1-preview': 200000, - - // Anthropic Direct Models - 'claude-opus-4-5-20251124': 200000, - 'claude-sonnet-4-5': 200000, - 'claude-sonnet-4-5-20250929': 200000, - 'claude-haiku-4-5-20251001': 200000, - 'claude-opus-4-1-20250805': 200000, - 'claude-sonnet-4-1-20250805': 200000, - 'claude-3-7-sonnet-20250219': 200000, - 'claude-3-5-sonnet-20241022': 200000, - 'claude-3-5-sonnet-20240620': 200000, - 'claude-3-5-sonnet-latest': 200000, - 'claude-3-5-haiku-20241022': 200000, - 'claude-3-5-haiku-latest': 200000, - 'claude-3-opus-20240229': 200000, - 'claude-3-opus-latest': 200000, - 'claude-3-sonnet-20240229': 200000, - 'claude-3-haiku-20240307': 200000, - - // xAI Models - 'grok-4': 128000, - 'grok-4-fast-reasoning': 2000000, - 'grok-4-fast-non-reasoning': 2000000, - 'grok-code-fast-1': 128000, - 'grok-3': 128000, - 'grok-3-latest': 128000, - 'grok-3-fast': 128000, - 'grok-3-fast-latest': 128000, - 'grok-3-mini': 128000, - 'grok-3-mini-latest': 128000, - 'grok-3-mini-fast': 128000, - 'grok-3-mini-fast-latest': 128000, - 
'grok-2': 128000, - 'grok-2-latest': 128000, - 'grok-2-1212': 128000, - 'grok-2-vision': 128000, - 'grok-2-vision-latest': 128000, - 'grok-2-vision-1212': 128000, - 'grok-beta': 128000, - 'grok-vision-beta': 128000, - - // MiniMax Direct Models - 'MiniMax-M2.5': 1000000, - 'MiniMax-M2.1': 128000, - 'MiniMax-M2': 128000, - 'MiniMax-M2-Stable': 128000, - - // PageSpace/GLM Models - 'glm-5': 200000, - 'glm-4.7': 200000, - 'glm-4.6': 200000, - 'glm-4.5': 128000, - 'glm-4.5-air': 128000, - - // Ollama (local) - context varies by model and configuration - 'llama3.2': 128000, - 'llama3.2-vision': 128000, - 'llama3.1': 128000, - 'qwen2.5-coder': 32000, - 'deepseek-r1': 64000, - 'gemma2': 8192, - 'mistral': 32000, - 'phi3': 128000, - - // Default - 'default': 200000 // Updated default for newer models -} as const; - -/** - * Get context window size for a model - */ -export function getContextWindow(model: string): number { - return MODEL_CONTEXT_WINDOWS[model as keyof typeof MODEL_CONTEXT_WINDOWS] || MODEL_CONTEXT_WINDOWS.default; -} +// MODEL_CONTEXT_WINDOWS and getContextWindow are re-exported from model-context-windows.ts +// (imported at top of file) to maintain backwards compatibility. +// The canonical source of truth is packages/lib/src/monitoring/model-context-windows.ts. /** * Calculate cost based on tokens and model diff --git a/packages/lib/src/monitoring/model-context-windows.ts b/packages/lib/src/monitoring/model-context-windows.ts new file mode 100644 index 000000000..48cab95b4 --- /dev/null +++ b/packages/lib/src/monitoring/model-context-windows.ts @@ -0,0 +1,198 @@ +/** + * Model Context Window Sizes (in tokens) + * + * Canonical source of truth for all model context window limits. + * Shared between ai-monitoring.ts and ai-context-calculator.ts. + * + * This file is deliberately dependency-free so it can be imported + * from lightweight modules without pulling in database or logging deps. 
+ * + * Updated November 2025 + */ +export const MODEL_CONTEXT_WINDOWS = { + // OpenRouter Models - Anthropic + 'anthropic/claude-opus-4.5': 200000, + 'anthropic/claude-sonnet-4.5': 200000, + 'anthropic/claude-haiku-4.5': 200000, + 'anthropic/claude-3.5-sonnet': 200000, + 'anthropic/claude-3-haiku': 200000, + 'anthropic/claude-opus-4.1': 200000, + + // OpenRouter Models - OpenAI + 'openai/gpt-5.2': 400000, + 'openai/gpt-5.2-codex': 400000, + 'openai/gpt-5.2-mini': 256000, + 'openai/gpt-5.2-nano': 256000, + 'openai/gpt-5.1': 400000, + 'openai/gpt-5.1-codex': 400000, + 'openai/gpt-5.1-codex-mini': 400000, + 'openai/gpt-4o': 128000, + 'openai/gpt-4o-mini': 128000, + 'openai/o3-deep-research': 200000, + 'openai/o4-mini-deep-research': 200000, + 'openai/gpt-5': 272000, + 'openai/gpt-5-mini': 128000, + 'openai/gpt-5-nano': 128000, + 'openai/gpt-oss-120b': 128000, + 'openai/gpt-oss-20b': 128000, + + // OpenRouter Models - Other + 'meta-llama/llama-3.1-405b-instruct': 128000, + 'mistralai/mistral-medium-3.1': 128000, + 'mistralai/mistral-small-3.2-24b-instruct': 32000, + 'mistralai/codestral-2508': 32000, + 'mistralai/devstral-medium': 128000, + 'mistralai/devstral-small': 128000, + + // OpenRouter Models - Google + 'google/gemini-3-pro-preview': 1048576, + 'google/gemini-3-flash-preview': 1048576, + 'google/gemini-2.5-pro': 2000000, + 'google/gemini-2.5-flash': 1000000, + 'google/gemini-2.5-flash-lite': 1000000, + 'google/gemini-2.5-flash-lite-preview-06-17': 1000000, + 'google/gemini-2.0-pro': 2000000, + 'google/gemini-2.0-flash': 1000000, + + // OpenRouter Models - Chinese/Asian + 'z-ai/glm-4.7': 200000, + 'z-ai/glm-4.5v': 128000, + 'z-ai/glm-4.5': 128000, + 'z-ai/glm-4.5-air': 128000, + 'z-ai/glm-4-32b': 128000, + 'qwen/qwen3-max': 128000, + 'qwen/qwen3-235b-a22b-thinking-2507': 128000, + 'qwen/qwen3-235b-a22b-2507': 128000, + 'qwen/qwen3-coder': 128000, + 'moonshotai/kimi-k2': 128000, + 'minimax/minimax-m1': 128000, + 'z-ai/glm-5': 202752, + 'minimax/minimax-m2.5': 
204800, + + // OpenRouter Models - DeepSeek + 'deepseek/deepseek-v3.1-terminus': 128000, + + // OpenRouter Models - AI21 + 'ai21/jamba-mini-1.7': 256000, + 'ai21/jamba-large-1.7': 256000, + + // OpenRouter Models - xAI + 'x-ai/grok-4-fast': 2000000, + 'x-ai/grok-4': 128000, + + // OpenRouter Models - Other + 'inception/mercury': 128000, + + // Google AI Direct Models + 'gemini-3-pro': 1048576, + 'gemini-3-flash-preview': 1048576, + 'gemini-2.5-pro': 2000000, + 'gemini-2.5-flash': 1000000, + 'gemini-2.5-flash-lite': 1000000, + 'gemini-2.0-pro-exp': 2000000, + 'gemini-2.0-flash': 1000000, + 'gemini-2.0-flash-exp': 1000000, + 'gemini-2.0-flash-lite': 1000000, + 'gemini-1.5-flash': 1000000, + 'gemini-1.5-flash-8b': 1000000, + 'gemini-1.5-pro': 2000000, + + // OpenAI Direct Models + 'gpt-5.2': 400000, + 'gpt-5.2-codex': 400000, + 'gpt-5.2-mini': 256000, + 'gpt-5.2-nano': 256000, + 'gpt-5.1': 400000, + 'gpt-5.1-codex': 400000, + 'gpt-5': 272000, + 'gpt-5-mini': 128000, + 'gpt-5-nano': 128000, + 'gpt-4.1-2025-04-14': 400000, + 'gpt-4.1-mini-2025-04-14': 400000, + 'gpt-4.1-nano-2025-04-14': 400000, + 'gpt-4o': 128000, + 'gpt-4o-mini': 128000, + 'gpt-4o-audio-preview': 128000, + 'gpt-4-turbo': 128000, + 'gpt-4': 8192, + 'gpt-3.5-turbo': 16385, + 'o4-mini-2025-04-16': 200000, + 'o3': 200000, + 'o3-mini': 200000, + 'o1': 200000, + 'o1-mini': 200000, + 'o1-preview': 200000, + + // Anthropic Direct Models + 'claude-opus-4-5-20251124': 200000, + 'claude-sonnet-4-5': 200000, + 'claude-sonnet-4-5-20250929': 200000, + 'claude-haiku-4-5-20251001': 200000, + 'claude-opus-4-1-20250805': 200000, + 'claude-sonnet-4-1-20250805': 200000, + 'claude-3-7-sonnet-20250219': 200000, + 'claude-3-5-sonnet-20241022': 200000, + 'claude-3-5-sonnet-20240620': 200000, + 'claude-3-5-sonnet-latest': 200000, + 'claude-3-5-haiku-20241022': 200000, + 'claude-3-5-haiku-latest': 200000, + 'claude-3-opus-20240229': 200000, + 'claude-3-opus-latest': 200000, + 'claude-3-sonnet-20240229': 200000, + 
'claude-3-haiku-20240307': 200000, + + // xAI Models + 'grok-4': 128000, + 'grok-4-fast-reasoning': 2000000, + 'grok-4-fast-non-reasoning': 2000000, + 'grok-code-fast-1': 128000, + 'grok-3': 128000, + 'grok-3-latest': 128000, + 'grok-3-fast': 128000, + 'grok-3-fast-latest': 128000, + 'grok-3-mini': 128000, + 'grok-3-mini-latest': 128000, + 'grok-3-mini-fast': 128000, + 'grok-3-mini-fast-latest': 128000, + 'grok-2': 128000, + 'grok-2-latest': 128000, + 'grok-2-1212': 128000, + 'grok-2-vision': 128000, + 'grok-2-vision-latest': 128000, + 'grok-2-vision-1212': 128000, + 'grok-beta': 128000, + 'grok-vision-beta': 128000, + + // MiniMax Direct Models + 'MiniMax-M2.5': 1000000, + 'MiniMax-M2.1': 128000, + 'MiniMax-M2': 128000, + 'MiniMax-M2-Stable': 128000, + + // PageSpace/GLM Models + 'glm-5': 202752, + 'glm-4.7': 200000, + 'glm-4.6': 200000, + 'glm-4.5': 128000, + 'glm-4.5-air': 128000, + + // Ollama (local) - context varies by model and configuration + 'llama3.2': 128000, + 'llama3.2-vision': 128000, + 'llama3.1': 128000, + 'qwen2.5-coder': 32000, + 'deepseek-r1': 64000, + 'gemma2': 8192, + 'mistral': 32000, + 'phi3': 128000, + + // Default + 'default': 200000, +} as const; + +/** + * Get context window size for a model (simple key lookup) + */ +export function getContextWindow(model: string): number { + return MODEL_CONTEXT_WINDOWS[model as keyof typeof MODEL_CONTEXT_WINDOWS] || MODEL_CONTEXT_WINDOWS.default; +}