Commit e88e667

ankursharmas authored and copybara-github committed
feat: Data model for Rubric based metric and eval config
Details:

- We plan on introducing rubric-based metrics in subsequent changes. This change introduces the data model that allows agent developers to provide rubrics.
- We also introduce a data model for the config that the eval system has been using for quite some time. It was loosely and informally described as a dictionary of metric names and expected thresholds. In this change, we formalize it with a pydantic data model and extend it to allow developers to specify rubrics as part of their eval config.

What is a rubric-based metric? A rubric-based metric is the assessment of an agent's response (final or intermediate) against some rubric. This differs significantly from evaluation strategies that require a golden response.

PiperOrigin-RevId: 805488436
1 parent 37228be commit e88e667
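
Below is a minimal, hedged sketch (not part of the commit) of how an agent developer could build the new EvalConfig once this change lands. It assumes the modules are importable as google.adk.evaluation.* per the relative imports in the diff, and uses only names that appear in the diff:

from google.adk.evaluation.eval_config import BaseCriterion
from google.adk.evaluation.eval_config import EvalConfig

eval_config = EvalConfig(
    criteria={
        # A bare float is treated as a simple threshold criterion.
        "tool_trajectory_avg_score": 1.0,
        # A BaseCriterion spells the threshold out explicitly; richer
        # rubric-based criteria are planned for subsequent changes.
        "response_match_score": BaseCriterion(threshold=0.8),
    }
)
print(eval_config.model_dump_json(indent=2))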

File tree

14 files changed: +484 additions, -98 deletions


src/google/adk/cli/cli_eval.py

Lines changed: 42 additions & 15 deletions
@@ -37,6 +37,8 @@
 from ..evaluation.base_eval_service import InferenceResult
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
+from ..evaluation.eval_config import BaseCriterion
+from ..evaluation.eval_config import EvalConfig
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
@@ -64,6 +66,10 @@
     RESPONSE_MATCH_SCORE_KEY: 0.8,
 }
 
+_DEFAULT_EVAL_CONFIG = EvalConfig(
+    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
+)
+
 
 def _import_from_path(module_name, file_path):
   spec = importlib.util.spec_from_file_location(module_name, file_path)
@@ -81,27 +87,48 @@ def _get_agent_module(agent_module_file_path: str):
 
 def get_evaluation_criteria_or_default(
     eval_config_file_path: str,
-) -> dict[str, float]:
-  """Returns evaluation criteria from the config file, if present.
+) -> EvalConfig:
+  """Returns EvalConfig read from the config file, if present.
 
   Otherwise a default one is returned.
   """
   if eval_config_file_path:
     with open(eval_config_file_path, "r", encoding="utf-8") as f:
-      config_data = json.load(f)
+      content = f.read()
+      return EvalConfig.model_validate_json(content)
+
+  logger.info("No config file supplied. Using default criteria.")
+  return _DEFAULT_EVAL_CONFIG
+
+
+def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
+  """Returns a list of EvalMetrics mapped from the EvalConfig."""
+  eval_metric_list = []
+  if eval_config.criteria:
+    for metric_name, criterion in eval_config.criteria.items():
+      if isinstance(criterion, float):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion,
+                criterion=BaseCriterion(threshold=criterion),
+            )
+        )
+      elif isinstance(criterion, BaseCriterion):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion.threshold,
+                criterion=criterion,
+            )
+        )
+      else:
+        raise ValueError(
+            f"Unexpected criterion type. {type(criterion).__name__} not"
+            " supported."
        )
 
-    if "criteria" in config_data and isinstance(config_data["criteria"], dict):
-      evaluation_criteria = config_data["criteria"]
-    else:
-      raise ValueError(
-          f"Invalid format for test_config.json at {eval_config_file_path}."
-          " Expected a 'criteria' dictionary."
-      )
-  else:
-    logger.info("No config file supplied. Using default criteria.")
-    evaluation_criteria = DEFAULT_CRITERIA
-
-  return evaluation_criteria
+  return eval_metric_list
 
 
 def get_root_agent(agent_module_file_path: str) -> Agent:
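
A hedged usage sketch of the two helpers above, assuming google-adk and its eval dependencies are installed so that google.adk.cli.cli_eval imports cleanly:

from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default

# An empty path takes the fallback branch and returns _DEFAULT_EVAL_CONFIG;
# a real path would be parsed with EvalConfig.model_validate_json as above.
eval_config = get_evaluation_criteria_or_default("")
for eval_metric in get_eval_metrics_from_config(eval_config):
  print(eval_metric.metric_name, eval_metric.threshold)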

src/google/adk/cli/cli_tools_click.py

Lines changed: 24 additions & 20 deletions
@@ -382,24 +382,16 @@ def cli_eval(
     from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
     from .cli_eval import _collect_eval_results
     from .cli_eval import _collect_inferences
+    from .cli_eval import get_eval_metrics_from_config
     from .cli_eval import get_evaluation_criteria_or_default
     from .cli_eval import get_root_agent
     from .cli_eval import parse_and_get_evals_to_run
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
-  eval_metrics = []
-  for metric_name, threshold in evaluation_criteria.items():
-    eval_metrics.append(
-        EvalMetric(
-            metric_name=metric_name,
-            threshold=threshold,
-            judge_model_options=JudgeModelOptions(),
-        )
-    )
-
-  print(f"Using evaluation criteria: {evaluation_criteria}")
+  eval_config = get_evaluation_criteria_or_default(config_file_path)
+  print(f"Using evaluation criteria: {eval_config}")
+  eval_metrics = get_eval_metrics_from_config(eval_config)
 
   root_agent = get_root_agent(agent_module_file_path)
   app_name = os.path.basename(agent_module_file_path)
@@ -500,7 +492,9 @@ def cli_eval(
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  print("*********************************************************************")
+  click.echo(
+      "*********************************************************************"
+  )
   eval_run_summary = {}
 
   for eval_result in eval_results:
@@ -513,20 +507,27 @@ def cli_eval(
       eval_run_summary[eval_result.eval_set_id][0] += 1
     else:
       eval_run_summary[eval_result.eval_set_id][1] += 1
-  print("Eval Run Summary")
+  click.echo("Eval Run Summary")
   for eval_set_id, pass_fail_count in eval_run_summary.items():
-    print(
+    click.echo(
        f"{eval_set_id}:\n Tests passed: {pass_fail_count[0]}\n Tests"
        f" failed: {pass_fail_count[1]}"
    )
 
   if print_detailed_results:
     for eval_result in eval_results:
       eval_result: EvalCaseResult
-      print(
+      click.echo(
           "*********************************************************************"
       )
-      print(eval_result.model_dump_json(indent=2))
+      click.echo(
+          eval_result.model_dump_json(
+              indent=2,
+              exclude_unset=True,
+              exclude_defaults=True,
+              exclude_none=True,
+          )
+      )
 
 
 def adk_services_options():
@@ -1010,7 +1011,8 @@ def cli_deploy_cloud_run(
 
     adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
 
-    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent -- --no-allow-unauthenticated --min-instances=2
+    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
+    -- --no-allow-unauthenticated --min-instances=2
   """
   if verbosity:
     click.secho(
@@ -1222,7 +1224,8 @@ def cli_deploy_agent_engine(
   Example:
 
     adk deploy agent_engine --project=[project] --region=[region]
-    --staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
+    --staging_bucket=[staging_bucket] --display_name=[app_name]
+    path/to/my_agent
   """
   try:
     cli_deploy.to_agent_engine(
@@ -1367,7 +1370,8 @@ def cli_deploy_gke(
 
   Example:
 
-    adk deploy gke --project=[project] --region=[region] --cluster_name=[cluster_name] path/to/my_agent
+    adk deploy gke --project=[project] --region=[region]
+    --cluster_name=[cluster_name] path/to/my_agent
   """
   try:
     cli_deploy.to_gke(
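
The detailed-results change above trims noise from the JSON dump via pydantic's exclude flags. The snippet below is a small self-contained illustration with a stand-in model (not the real EvalCaseResult), showing what each flag removes:

from typing import Optional

import pydantic


class Result(pydantic.BaseModel):
  eval_set_id: str
  score: Optional[float] = None
  notes: str = ""


result = Result(eval_set_id="home_automation")
# The default dump emits every field, including unset defaults and None values.
print(result.model_dump_json(indent=2))
# With the exclude flags, only explicitly set, non-default, non-None fields remain.
print(
    result.model_dump_json(
        indent=2,
        exclude_unset=True,
        exclude_defaults=True,
        exclude_none=True,
    )
)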
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import pydantic
+from pydantic import alias_generators
+
+
+class EvalBaseModel(pydantic.BaseModel):
+  model_config = pydantic.ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+      extra='forbid',
+  )
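
To illustrate what this pydantic configuration buys, here is a self-contained sketch. The base class is re-declared rather than imported because the new module's path is not shown in this excerpt, and RubricScore is a hypothetical subclass used only for illustration:

import pydantic
from pydantic import alias_generators


class EvalBaseModel(pydantic.BaseModel):
  model_config = pydantic.ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
      extra='forbid',
  )


class RubricScore(EvalBaseModel):
  rubric_id: str
  score: float


# populate_by_name=True lets both the camelCase alias and the field name validate.
print(RubricScore.model_validate({"rubricId": "clarity", "score": 0.9}))
print(RubricScore(rubric_id="clarity", score=0.9).model_dump(by_alias=True))
# extra='forbid' rejects keys that match neither a field name nor an alias.
try:
  RubricScore.model_validate({"rubricId": "clarity", "score": 0.9, "note": "x"})
except pydantic.ValidationError as err:
  print(f"rejected extra key: {err.error_count()} error(s)")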
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Union
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from .eval_metrics import BaseCriterion
+from .eval_metrics import Threshold
+
+
+class EvalConfig(BaseModel):
+  """Configurations needed to run an Eval.
+
+  Allows users to specify metrics, their thresholds and other properties.
+  """
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  criteria: dict[str, Union[Threshold, BaseCriterion]] = Field(
+      default_factory=dict,
+      description="""A dictionary that maps an eval metric to the criterion to be used.
+
+The key of the dictionary is the name of the eval metric and the value is the
+criterion to be used.
+
+In the sample below, `tool_trajectory_avg_score`, `response_match_score` and
+`final_response_match_v2` are the standard eval metric names, represented as
+keys in the dictionary. The values in the dictionary are the corresponding
+criteria. For the first two metrics, we use a simple threshold as the criterion,
+while the third one uses `LlmAsAJudgeCriterion`.
+
+{
+  "criteria": {
+    "tool_trajectory_avg_score": 1.0,
+    "response_match_score": 0.5,
+    "final_response_match_v2": {
+      "threshold": 0.5,
+      "judge_model_options": {
+        "judge_model": "my favorite LLM",
+        "num_samples": 5
+      }
+    }
+  }
+}
+""",
+  )
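
A hedged sketch of loading such a config from JSON the way the updated cli_eval.py does. It sticks to simple shapes (bare thresholds plus a plain threshold object) and assumes BaseCriterion validates from a bare {"threshold": ...} payload, as the helper code above suggests:

from google.adk.evaluation.eval_config import EvalConfig

config_json = """
{
  "criteria": {
    "tool_trajectory_avg_score": 1.0,
    "response_match_score": 0.5,
    "final_response_match_v2": {"threshold": 0.5}
  }
}
"""
eval_config = EvalConfig.model_validate_json(config_json)
for metric_name, criterion in eval_config.criteria.items():
  print(metric_name, criterion)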
