feat: add support for rating overrides

crisbeto · crisbeto · commit 906ee4dee408 · 2025-12-04T11:16:37.000+01:00
Adds a `ratingOverrides` field to the environment config, which makes it easier to override fields of a specific rating (e.g. its weight) without having to redefine the rating.
diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts
@@ -1,7 +1,7 @@
 import z from 'zod';
 import {createMessageBuilder, fromError} from 'zod-validation-error/v3';
 import {UserFacingError} from '../utils/errors.js';
-import {ratingSchema} from '../ratings/rating-types.js';
+import {ratingOverrideSchema, ratingSchema} from '../ratings/rating-types.js';
 import {EvalPrompt, EvalPromptWithMetadata, MultiStepPrompt} from './prompts.js';
 import {executorSchema} from '../orchestration/executors/executor.js';
 import {
@@ -21,6 +21,11 @@ export const environmentConfigSchema = z.object({
   clientSideFramework: z.string(),
   /** Ratings to run when evaluating the environment. */
   ratings: z.array(ratingSchema),
+  /**
+   * Map used to override fields for specific ratings. The key is the unique ID of
+   * the rating and the value contains the fields to override.
+   */
+  ratingOverrides: z.record(z.string(), ratingOverrideSchema).optional(),
   /** Path to the prompt used by the LLM for generating files. */
   generationSystemPrompt: z.string(),
   /**
diff --git a/runner/configuration/environment.ts b/runner/configuration/environment.ts
@@ -69,7 +69,7 @@ export class Environment {
 
   /** Prompts that should be executed as a part of the evaluation. */
   executablePrompts = lazy(async () => {
-    return this.resolveExecutablePrompts(this.config.executablePrompts, this.config.ratings);
+    return this.resolveExecutablePrompts(this.config.executablePrompts, this.config);
   });
 
   systemPromptGeneration = lazy(async () => {
@@ -166,15 +166,32 @@ export class Environment {
 
   /**
    * Resolves the prompt configuration into prompt definitions.
-   * @param rootPath Root path of the project.
    * @param prompts Prompts to be resolved.
-   * @param envRatings Environment-level ratings.
+   * @param config Configuration for the environment.
    */
   private async resolveExecutablePrompts(
     prompts: EnvironmentConfig['executablePrompts'],
-    envRatings: Rating[],
+    config: EnvironmentConfig,
   ): Promise<RootPromptDefinition[]> {
     const result: Promise<RootPromptDefinition>[] = [];
+    let envRatings: Rating[];
+
+    if (config.ratingOverrides) {
+      Object.keys(config.ratingOverrides).forEach(id => {
+        if (!config.ratings.some(rating => rating.id === id)) {
+          throw new UserFacingError(
+            `Rating with an ID of "${id}" has not been configured. Cannot apply an override to it.`,
+          );
+        }
+      });
+
+      envRatings = config.ratings.map(rating => {
+        const override = config.ratingOverrides![rating.id];
+        return override ? {...rating, ...override} : rating;
+      });
+    } else {
+      envRatings = config.ratings;
+    }
 
     for (const def of prompts) {
       if (def instanceof MultiStepPrompt) {
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts
@@ -126,6 +126,12 @@ export const ratingSchema = z.union([
   llmBasedRatingSchema,
 ]);
 
+export const ratingOverrideSchema = z.object({ // Overridable rating fields; every field is optional.
+  category: z.custom<RatingCategory>().optional(),
+  scoreReduction: z.custom<`${number}%`>().optional(),
+  groupingLabels: z.array(z.string()).optional(),
+});
+
 /** Result of a per-build rating. */
 export type PerBuildRatingResult =
   | {