Commit e88e667

ankursharmas authored and copybara-github committed
feat: Data model for Rubric based metric and eval config
Details:

- We plan on introducing rubric-based metrics in subsequent changes. This change introduces the data model that allows agent developers to provide rubrics.
- We also introduce a data model for the config that the eval system has been using for quite some time. It was loosely and informally described as a dictionary of metric names and expected thresholds. In this change, we formalize it with a pydantic data model and extend it to allow developers to specify rubrics as part of their eval config.

What is a rubric-based metric? A rubric-based metric is the assessment of an agent's response (final or intermediate) against some rubric. This differs significantly from evaluation strategies that require a golden response.

PiperOrigin-RevId: 805488436
1 parent 37228be commit e88e667
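
Below is a minimal, hedged sketch (not part of the commit) of how an agent developer could build the new EvalConfig once this change lands. It assumes the modules are importable as google.adk.evaluation.* per the relative imports in the diff, and uses only names that appear in the diff:

from google.adk.evaluation.eval_config import BaseCriterion
from google.adk.evaluation.eval_config import EvalConfig

eval_config = EvalConfig(
    criteria={
        # A bare float is treated as a simple threshold criterion.
        "tool_trajectory_avg_score": 1.0,
        # A BaseCriterion spells the threshold out explicitly; richer
        # rubric-based criteria are planned for subsequent changes.
        "response_match_score": BaseCriterion(threshold=0.8),
    }
)
print(eval_config.model_dump_json(indent=2))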

File tree

14 files changed: +484 additions, -98 deletions


src/google/adk/cli/cli_eval.py

Lines changed: 42 additions & 15 deletions
@@ -37,6 +37,8 @@
 from ..evaluation.base_eval_service import InferenceResult
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
+from ..evaluation.eval_config import BaseCriterion
+from ..evaluation.eval_config import EvalConfig
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
@@ -64,6 +66,10 @@
     RESPONSE_MATCH_SCORE_KEY: 0.8,
 }
 
+_DEFAULT_EVAL_CONFIG = EvalConfig(
+    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
+)
+
 
 def _import_from_path(module_name, file_path):
   spec = importlib.util.spec_from_file_location(module_name, file_path)
@@ -81,27 +87,48 @@ def _get_agent_module(agent_module_file_path: str):
 
 def get_evaluation_criteria_or_default(
     eval_config_file_path: str,
-) -> dict[str, float]:
-  """Returns evaluation criteria from the config file, if present.
+) -> EvalConfig:
+  """Returns EvalConfig read from the config file, if present.
 
   Otherwise a default one is returned.
   """
   if eval_config_file_path:
     with open(eval_config_file_path, "r", encoding="utf-8") as f:
-      config_data = json.load(f)
+      content = f.read()
+      return EvalConfig.model_validate_json(content)
+
+  logger.info("No config file supplied. Using default criteria.")
+  return _DEFAULT_EVAL_CONFIG
+
+
+def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
+  """Returns a list of EvalMetrics mapped from the EvalConfig."""
+  eval_metric_list = []
+  if eval_config.criteria:
+    for metric_name, criterion in eval_config.criteria.items():
+      if isinstance(criterion, float):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion,
+                criterion=BaseCriterion(threshold=criterion),
+            )
+        )
+      elif isinstance(criterion, BaseCriterion):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion.threshold,
+                criterion=criterion,
+            )
+        )
+      else:
+        raise ValueError(
+            f"Unexpected criterion type. {type(criterion).__name__} not"
+            " supported."
        )
 
-    if "criteria" in config_data and isinstance(config_data["criteria"], dict):
-      evaluation_criteria = config_data["criteria"]
-    else:
-      raise ValueError(
-          f"Invalid format for test_config.json at {eval_config_file_path}."
-          " Expected a 'criteria' dictionary."
-      )
-  else:
-    logger.info("No config file supplied. Using default criteria.")
-    evaluation_criteria = DEFAULT_CRITERIA
-
-  return evaluation_criteria
+  return eval_metric_list
 
 
 def get_root_agent(agent_module_file_path: str) -> Agent:
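
A hedged usage sketch of the two helpers above, assuming google-adk and its eval dependencies are installed so that google.adk.cli.cli_eval imports cleanly:

from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default

# An empty path takes the fallback branch and returns _DEFAULT_EVAL_CONFIG;
# a real path would be parsed with EvalConfig.model_validate_json as above.
eval_config = get_evaluation_criteria_or_default("")
for eval_metric in get_eval_metrics_from_config(eval_config):
  print(eval_metric.metric_name, eval_metric.threshold)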

src/google/adk/cli/cli_tools_click.py

Lines changed: 24 additions & 20 deletions
@@ -382,24 +382,16 @@ def cli_eval(
     from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
     from .cli_eval import _collect_eval_results
     from .cli_eval import _collect_inferences
+    from .cli_eval import get_eval_metrics_from_config
     from .cli_eval import get_evaluation_criteria_or_default
     from .cli_eval import get_root_agent
     from .cli_eval import parse_and_get_evals_to_run
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
-  eval_metrics = []
-  for metric_name, threshold in evaluation_criteria.items():
-    eval_metrics.append(
-        EvalMetric(
-            metric_name=metric_name,
-            threshold=threshold,
-            judge_model_options=JudgeModelOptions(),
-        )
-    )
-
-  print(f"Using evaluation criteria: {evaluation_criteria}")
+  eval_config = get_evaluation_criteria_or_default(config_file_path)
+  print(f"Using evaluation criteria: {eval_config}")
+  eval_metrics = get_eval_metrics_from_config(eval_config)
 
   root_agent = get_root_agent(agent_module_file_path)
   app_name = os.path.basename(agent_module_file_path)
@@ -500,7 +492,9 @@ def cli_eval(
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  print("*********************************************************************")
+  click.echo(
+      "*********************************************************************"
+  )
   eval_run_summary = {}
 
   for eval_result in eval_results:
@@ -513,20 +507,27 @@ def cli_eval(
       eval_run_summary[eval_result.eval_set_id][0] += 1
     else:
       eval_run_summary[eval_result.eval_set_id][1] += 1
-  print("Eval Run Summary")
+  click.echo("Eval Run Summary")
   for eval_set_id, pass_fail_count in eval_run_summary.items():
-    print(
+    click.echo(
        f"{eval_set_id}:\n Tests passed: {pass_fail_count[0]}\n Tests"
        f" failed: {pass_fail_count[1]}"
    )
 
   if print_detailed_results:
     for eval_result in eval_results:
       eval_result: EvalCaseResult
-      print(
+      click.echo(
           "*********************************************************************"
       )
-      print(eval_result.model_dump_json(indent=2))
+      click.echo(
+          eval_result.model_dump_json(
+              indent=2,
+              exclude_unset=True,
+              exclude_defaults=True,
+              exclude_none=True,
+          )
+      )
 
 
 def adk_services_options():
@@ -1010,7 +1011,8 @@ def cli_deploy_cloud_run(
 
     adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
 
-    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent -- --no-allow-unauthenticated --min-instances=2
+    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
+    -- --no-allow-unauthenticated --min-instances=2
   """
   if verbosity:
     click.secho(
@@ -1222,7 +1224,8 @@ def cli_deploy_agent_engine(
   Example:
 
     adk deploy agent_engine --project=[project] --region=[region]
-    --staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
+    --staging_bucket=[staging_bucket] --display_name=[app_name]
+    path/to/my_agent
   """
   try:
     cli_deploy.to_agent_engine(
@@ -1367,7 +1370,8 @@ def cli_deploy_gke(
 
   Example:
 
-    adk deploy gke --project=[project] --region=[region] --cluster_name=[cluster_name] path/to/my_agent
+    adk deploy gke --project=[project] --region=[region]
+    --cluster_name=[cluster_name] path/to/my_agent
   """
   try:
     cli_deploy.to_gke(
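
The detailed-results change above trims noise from the JSON dump via pydantic's exclude flags. The snippet below is a small self-contained illustration with a stand-in model (not the real EvalCaseResult), showing what each flag removes:

from typing import Optional

import pydantic


class Result(pydantic.BaseModel):
  eval_set_id: str
  score: Optional[float] = None
  notes: str = ""


result = Result(eval_set_id="home_automation")
# The default dump emits every field, including unset defaults and None values.
print(result.model_dump_json(indent=2))
# With the exclude flags, only explicitly set, non-default, non-None fields remain.
print(
    result.model_dump_json(
        indent=2,
        exclude_unset=True,
        exclude_defaults=True,
        exclude_none=True,
    )
)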
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import pydantic
+from pydantic import alias_generators
+
+
+class EvalBaseModel(pydantic.BaseModel):
+  model_config = pydantic.ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+      extra='forbid',
+  )
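
To illustrate what this pydantic configuration buys, here is a self-contained sketch. The base class is re-declared rather than imported because the new module's path is not shown in this excerpt, and RubricScore is a hypothetical subclass used only for illustration:

import pydantic
from pydantic import alias_generators


class EvalBaseModel(pydantic.BaseModel):
  model_config = pydantic.ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
      extra='forbid',
  )


class RubricScore(EvalBaseModel):
  rubric_id: str
  score: float


# populate_by_name=True lets both the camelCase alias and the field name validate.
print(RubricScore.model_validate({"rubricId": "clarity", "score": 0.9}))
print(RubricScore(rubric_id="clarity", score=0.9).model_dump(by_alias=True))
# extra='forbid' rejects keys that match neither a field name nor an alias.
try:
  RubricScore.model_validate({"rubricId": "clarity", "score": 0.9, "note": "x"})
except pydantic.ValidationError as err:
  print(f"rejected extra key: {err.error_count()} error(s)")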
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Union
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from .eval_metrics import BaseCriterion
+from .eval_metrics import Threshold
+
+
+class EvalConfig(BaseModel):
+  """Configurations needed to run an Eval.
+
+  Allows users to specify metrics, their thresholds and other properties.
+  """
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  criteria: dict[str, Union[Threshold, BaseCriterion]] = Field(
+      default_factory=dict,
+      description="""A dictionary that maps an eval metric to the criterion to be used.
+
+The key of the dictionary is the name of the eval metric and the value is the
+criterion to be used.
+
+In the sample below, `tool_trajectory_avg_score`, `response_match_score` and
+`final_response_match_v2` are the standard eval metric names, represented as
+keys in the dictionary. The values in the dictionary are the corresponding
+criteria. For the first two metrics, we use a simple threshold as the criterion,
+while the third one uses `LlmAsAJudgeCriterion`.
+
+{
+  "criteria": {
+    "tool_trajectory_avg_score": 1.0,
+    "response_match_score": 0.5,
+    "final_response_match_v2": {
+      "threshold": 0.5,
+      "judge_model_options": {
+        "judge_model": "my favorite LLM",
+        "num_samples": 5
+      }
+    }
+  }
+}
+""",
+  )
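
A hedged sketch of loading such a config from JSON the way the updated cli_eval.py does. It sticks to simple shapes (bare thresholds plus a plain threshold object) and assumes BaseCriterion validates from a bare {"threshold": ...} payload, as the helper code above suggests:

from google.adk.evaluation.eval_config import EvalConfig

config_json = """
{
  "criteria": {
    "tool_trajectory_avg_score": 1.0,
    "response_match_score": 0.5,
    "final_response_match_v2": {"threshold": 0.5}
  }
}
"""
eval_config = EvalConfig.model_validate_json(config_json)
for metric_name, criterion in eval_config.criteria.items():
  print(metric_name, criterion)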
