Skip to content

Commit 906ee4d

Browse files
committed
feat: add support for rating overrides
Adds a `ratingOverrides` field to the environment which makes it easier to override the weight for a specific rating without having to re-define it.
1 parent 0a0b025 commit 906ee4d

File tree

3 files changed

+33
-5
lines changed

3 files changed

+33
-5
lines changed

runner/configuration/environment-config.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import z from 'zod';
22
import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
33
import {UserFacingError} from '../utils/errors.js';
4-
import {ratingSchema} from '../ratings/rating-types.js';
4+
import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
55
import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
66
import {executorSchema} from '../orchestration/executors/executor.js';
77
import {
@@ -21,6 +21,11 @@ export const environmentConfigSchema = z.object({
2121
clientSideFramework: z.string(),
2222
/** Ratings to run when evaluating the environment. */
2323
ratings: z.array(ratingSchema),
24+
/**
25+
* Map used to override fields for specific ratings. The key is the unique ID of
26+
* the rating and the value contains the fields to override.
27+
*/
28+
ratingOverrides: z.record(z.string(), ratingOverrideSchema).optional(),
2429
/** Path to the prompt used by the LLM for generating files. */
2530
generationSystemPrompt: z.string(),
2631
/**

runner/configuration/environment.ts

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ export class Environment {
6969

7070
/** Prompts that should be executed as a part of the evaluation. */
7171
executablePrompts = lazy(async () => {
72-
return this.resolveExecutablePrompts(this.config.executablePrompts, this.config.ratings);
72+
return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
7373
});
7474

7575
systemPromptGeneration = lazy(async () => {
@@ -166,15 +166,32 @@ export class Environment {
166166

167167
/**
168168
* Resolves the prompt configuration into prompt definitions.
169-
* @param rootPath Root path of the project.
170169
* @param prompts Prompts to be resolved.
171-
* @param envRatings Environment-level ratings.
170+
* @param config Configuration for the environment.
172171
*/
173172
private async resolveExecutablePrompts(
174173
prompts: EnvironmentConfig['executablePrompts'],
175-
envRatings: Rating[],
174+
config: EnvironmentConfig,
176175
): Promise<RootPromptDefinition[]> {
177176
const result: Promise<RootPromptDefinition>[] = [];
177+
let envRatings: Rating[];
178+
179+
if (config.ratingOverrides) {
180+
Object.keys(config.ratingOverrides).forEach(id => {
181+
if (!config.ratings.some(rating => rating.id === id)) {
182+
throw new UserFacingError(
183+
`Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
184+
);
185+
}
186+
});
187+
188+
envRatings = config.ratings.map(rating => {
189+
const override = config.ratingOverrides![rating.id];
190+
return override ? {...rating, ...override} : rating;
191+
});
192+
} else {
193+
envRatings = config.ratings;
194+
}
178195

179196
for (const def of prompts) {
180197
if (def instanceof MultiStepPrompt) {

runner/ratings/rating-types.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,12 @@ export const ratingSchema = z.union([
126126
llmBasedRatingSchema,
127127
]);
128128

129+
/**
 * Schema for the fields of a rating that can be overridden individually.
 * Every field is optional, so an override only has to list what it changes.
 */
export const ratingOverrideSchema = z.object({
130+
// NOTE(review): `z.custom<T>()` is called without a validator function, so it only
// narrows the static type. Confirm that no runtime validation is needed for these fields.
category: z.custom<RatingCategory>().optional(),
131+
scoreReduction: z.custom<`${number}%`>().optional(),
132+
// A single `.optional()` is enough; the previously duplicated call was redundant.
groupingLabels: z.array(z.string()).optional(),
133+
});
134+
129135
/** Result of a per-build rating. */
130136
export type PerBuildRatingResult =
131137
| {

0 commit comments

Comments
 (0)