Skip to content

show token usage in evals #572

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
331 changes: 331 additions & 0 deletions evals/BraintrustClient.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,331 @@
import OpenAI from "openai";
import type { ClientOptions } from "openai";
import { zodToJsonSchema } from "zod-to-json-schema";
import { LogLine } from "../types/log";
import { AvailableModel } from "../types/model";
import { LLMCache } from "../lib/cache/LLMCache";
import {
ChatMessage,
CreateChatCompletionOptions,
LLMClient,
LLMResponse,
} from "../lib/llm/LLMClient";
import { wrapOpenAI } from "braintrust";

/**
 * LLMClient implementation that routes OpenAI-compatible chat completions
 * through the Braintrust proxy (api.braintrust.dev), with Braintrust tracing
 * enabled via `wrapOpenAI`. Supports optional response caching and
 * structured-output extraction driven by a zod `response_model`.
 */
export class BraintrustClient extends LLMClient {
  public type = "braintrust" as const;
  private client: OpenAI;
  private cache: LLMCache | undefined;
  private enableCaching: boolean;
  public clientOptions: ClientOptions;
  // Image payloads are stripped before sending; this client is text-only.
  public hasVision = false;

  constructor({
    enableCaching = false,
    cache,
    modelName,
    clientOptions,
    userProvidedInstructions,
  }: {
    logger: (message: LogLine) => void;
    enableCaching?: boolean;
    cache?: LLMCache;
    modelName: AvailableModel;
    clientOptions?: ClientOptions;
    userProvidedInstructions?: string;
  }) {
    super(modelName, userProvidedInstructions);

    // Route all traffic through the Braintrust proxy. An explicit apiKey in
    // clientOptions takes precedence over the BRAINTRUST_API_KEY env var.
    this.client = wrapOpenAI(
      new OpenAI({
        baseURL: "https://api.braintrust.dev/v1/proxy",
        apiKey: clientOptions?.apiKey || process.env.BRAINTRUST_API_KEY,
        ...clientOptions,
      }),
    );

    this.cache = cache;
    this.enableCaching = enableCaching;
    this.modelName = modelName;
    this.clientOptions = clientOptions;
  }

  /**
   * Sends a chat completion request through the Braintrust proxy.
   *
   * When `options.response_model` is set, the model is steered toward JSON
   * output (via a `print_extracted_data` function tool plus an explicit
   * system instruction carrying the JSON schema) and the parsed object is
   * returned. Otherwise the normalized LLMResponse — including token
   * usage — is returned. Structured extraction is retried up to 5 times
   * when no valid JSON can be recovered from the response.
   *
   * @throws Error when extraction fails after all retries, or when the
   *   underlying API call rejects (the error is logged, then rethrown).
   */
  async createChatCompletion<T = LLMResponse>({
    options,
    retries,
    logger,
  }: CreateChatCompletionOptions): Promise<T> {
    // Avoid logging potentially huge base64 image payloads.
    const optionsWithoutImage = { ...options };
    delete optionsWithoutImage.image;

    logger({
      category: "braintrust",
      message: "creating chat completion",
      level: 1,
      auxiliary: {
        options: {
          value: JSON.stringify(optionsWithoutImage),
          type: "object",
        },
      },
    });

    // Key used for both the cache lookup here and the cache writes below.
    const cacheOptions = {
      model: this.modelName,
      messages: options.messages,
      temperature: options.temperature,
      response_model: options.response_model,
      tools: options.tools,
      retries: retries,
    };

    // BUGFIX: also guard on this.cache — enableCaching may be true while no
    // cache instance was provided, which previously crashed on undefined.
    if (this.enableCaching && this.cache) {
      const cachedResponse = await this.cache.get<T>(
        cacheOptions,
        options.requestId,
      );
      if (cachedResponse) {
        logger({
          category: "llm_cache",
          message: "LLM cache hit - returning cached response",
          level: 1,
          auxiliary: {
            cachedResponse: {
              value: JSON.stringify(cachedResponse),
              type: "object",
            },
            requestId: {
              value: options.requestId,
              type: "string",
            },
            cacheOptions: {
              value: JSON.stringify(cacheOptions),
              type: "object",
            },
          },
        });
        return cachedResponse as T;
      }
    }

    // Collapse messages to plain text in OpenAI format. Braintrust only
    // supports system/user/assistant roles, so any other role maps to "user".
    const formattedMessages = options.messages.map((msg: ChatMessage) => {
      const baseMessage = {
        content:
          typeof msg.content === "string"
            ? msg.content
            : Array.isArray(msg.content) &&
                msg.content.length > 0 &&
                "text" in msg.content[0]
              ? msg.content[0].text
              : "",
      };

      if (msg.role === "system") {
        return { ...baseMessage, role: "system" as const };
      } else if (msg.role === "assistant") {
        return { ...baseMessage, role: "assistant" as const };
      } else {
        // Default to user for any other role
        return { ...baseMessage, role: "user" as const };
      }
    });

    // Convert the zod response_model to JSON schema ONCE, so the same schema
    // feeds both the extraction tool and the instruction message below.
    const responseJsonSchema = options.response_model
      ? (zodToJsonSchema(options.response_model.schema) as {
          properties?: Record<string, unknown>;
          required?: string[];
        })
      : undefined;

    // Format caller-provided tools for the OpenAI function-calling API.
    let tools = options.tools?.map((tool) => ({
      type: "function" as const,
      function: {
        name: tool.name,
        description: tool.description,
        parameters: {
          type: "object",
          properties: tool.parameters.properties,
          required: tool.parameters.required,
        },
      },
    }));

    // Expose the response model as an extraction tool the model can call.
    if (responseJsonSchema) {
      const responseTool = {
        type: "function" as const,
        function: {
          name: "print_extracted_data",
          description:
            "Prints the extracted data based on the provided schema.",
          parameters: {
            type: "object",
            properties: responseJsonSchema.properties || {},
            required: responseJsonSchema.required || [],
          },
        },
      };

      tools = tools ? [...tools, responseTool] : [responseTool];
    }

    try {
      // Use OpenAI client with Braintrust API
      const apiResponse = await this.client.chat.completions.create({
        model: this.modelName,
        messages: [
          ...formattedMessages,
          // BUGFIX: serialize the converted JSON schema, not the raw zod
          // schema object — JSON.stringify on a zod schema does not yield a
          // schema the model can follow.
          ...(responseJsonSchema
            ? [
                {
                  role: "system" as const,
                  content: `IMPORTANT: Your response must be valid JSON that matches this schema: ${JSON.stringify(responseJsonSchema)}`,
                },
              ]
            : []),
        ],
        // BUGFIX: use ?? so an explicit temperature of 0 is honored instead
        // of being silently replaced by the 0.7 default.
        temperature: options.temperature ?? 0.7,
        max_tokens: options.maxTokens,
        tools: tools,
        tool_choice: options.tool_choice || "auto",
      });

      // Normalize into the shared LLMResponse shape, preserving token usage.
      const response: LLMResponse = {
        id: apiResponse.id,
        object: "chat.completion",
        // BUGFIX: `created` is Unix seconds in OpenAI responses; pass it
        // through rather than substituting Date.now() milliseconds.
        created: apiResponse.created,
        model: this.modelName,
        choices: [
          {
            index: 0,
            message: {
              role: "assistant",
              content: apiResponse.choices[0]?.message?.content || null,
              tool_calls: apiResponse.choices[0]?.message?.tool_calls || [],
            },
            finish_reason: apiResponse.choices[0]?.finish_reason || "stop",
          },
        ],
        usage: {
          prompt_tokens: apiResponse.usage?.prompt_tokens || 0,
          completion_tokens: apiResponse.usage?.completion_tokens || 0,
          total_tokens: apiResponse.usage?.total_tokens || 0,
        },
      };

      logger({
        category: "braintrust",
        message: "response",
        level: 1,
        auxiliary: {
          response: {
            value: JSON.stringify(response),
            type: "object",
          },
          requestId: {
            value: options.requestId,
            type: "string",
          },
        },
      });

      if (options.response_model) {
        // First try standard function calling format
        const toolCall = response.choices[0]?.message?.tool_calls?.[0];
        if (toolCall?.function?.arguments) {
          try {
            const result = JSON.parse(toolCall.function.arguments);
            if (this.enableCaching && this.cache) {
              this.cache.set(cacheOptions, result, options.requestId);
            }
            return result as T;
          } catch (e) {
            // If JSON parse fails, the model might be returning a different format
            logger({
              category: "braintrust",
              message: "failed to parse tool call arguments as JSON, retrying",
              level: 1,
              auxiliary: {
                error: {
                  // BUGFIX: caught values are not guaranteed to be Errors.
                  value: e instanceof Error ? e.message : String(e),
                  type: "string",
                },
              },
            });
          }
        }

        // If we have content but no tool calls, try to parse the content as JSON
        const content = response.choices[0]?.message?.content;
        if (content) {
          try {
            // Try to extract JSON from the content
            const jsonMatch = content.match(/\{[\s\S]*\}/);
            if (jsonMatch) {
              const result = JSON.parse(jsonMatch[0]);
              if (this.enableCaching && this.cache) {
                this.cache.set(cacheOptions, result, options.requestId);
              }
              return result as T;
            }
          } catch (e) {
            logger({
              category: "braintrust",
              message: "failed to parse content as JSON",
              level: 1,
              auxiliary: {
                error: {
                  value: e instanceof Error ? e.message : String(e),
                  type: "string",
                },
              },
            });
          }
        }

        // If we still haven't found valid JSON and have retries left, try again
        if (!retries || retries < 5) {
          return this.createChatCompletion<T>({
            options,
            logger,
            retries: (retries ?? 0) + 1,
          });
        }

        throw new Error(
          "Create Chat Completion Failed: Could not extract valid JSON from response",
        );
      }

      if (this.enableCaching && this.cache) {
        this.cache.set(cacheOptions, response, options.requestId);
      }

      return response as T;
    } catch (error) {
      logger({
        category: "braintrust",
        message: "error creating chat completion",
        level: 1,
        auxiliary: {
          error: {
            value: error instanceof Error ? error.message : String(error),
            type: "string",
          },
          requestId: {
            value: options.requestId,
            type: "string",
          },
        },
      });
      throw error;
    }
  }
}
9 changes: 3 additions & 6 deletions evals/initStagehand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ const StagehandConfig = {
headless: false,
enableCaching,
domSettleTimeoutMs: 30_000,
modelName: "gpt-4o", // default model, can be overridden by initStagehand arguments
modelName: "braintrust-gpt-4o", // default model, can be overridden by initStagehand arguments
modelClientOptions: {
apiKey: process.env.OPENAI_API_KEY,
apiKey: process.env.BRAINTRUST_API_KEY,
},
logger: (logLine: LogLine) =>
console.log(`[stagehand::${logLine.category}] ${logLine.message}`),
Expand Down Expand Up @@ -63,10 +63,7 @@ export const initStagehand = async ({
configOverrides?: Partial<ConstructorParams>;
actTimeoutMs?: number;
}) => {
let chosenApiKey: string | undefined = process.env.OPENAI_API_KEY;
if (modelName.startsWith("claude")) {
chosenApiKey = process.env.ANTHROPIC_API_KEY;
}
const chosenApiKey = process.env.BRAINTRUST_API_KEY;

const config = {
...StagehandConfig,
Expand Down
12 changes: 11 additions & 1 deletion evals/taskConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,17 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
*/
const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
? process.env.EVAL_MODELS.split(",")
: ["gpt-4o", "claude-3-5-sonnet-latest"];
: [
"braintrust-gpt-4o",
"braintrust-gpt-4.5-preview",
"braintrust-gpt-4o-mini",
"braintrust-claude-3-5-sonnet-latest",
"braintrust-claude-3-7-sonnet-latest",
"braintrust-gemini-2.0-flash",
"braintrust-llama-3.3-70b-versatile",
"braintrust-llama-3.1-8b-instant",
"braintrust-deepseek-r1-distill-llama-70b",
];

/**
* getModelList:
Expand Down
8 changes: 7 additions & 1 deletion lib/llm/LLMClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,13 @@ export interface CreateChatCompletionOptions {
}

export abstract class LLMClient {
public type: "openai" | "anthropic" | "cerebras" | "groq" | string;
public type:
| "openai"
| "anthropic"
| "cerebras"
| "groq"
| "braintrust"
| string;
public modelName: AvailableModel;
public hasVision: boolean;
public clientOptions: ClientOptions;
Expand Down
Loading