diff --git a/CHANGELOG.md b/CHANGELOG.md index ce0d635b0..da4c83056 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ ## 0.13.0 (unreleased) +### Shared Metric Contract + +- Added public `MetricInput -> MetricResult` scorer/metric runtime types and `ScorerFunctionMetric`. +- Extended BYOB `@scorer` with typed scorer metadata and `to_metric()` while preserving current dict scorer behavior. +- Added optional `config_schema` support for typed scorer configs while keeping raw dict configs as the default. +- Split typed scorer config binding into strict `bind(config=ConfigModel(...))` and coercive `bind_raw_config(config={...})` paths. +- Added `@scorer` support for class-based `Metric` objects. +- Added a reusable undecorated `ExactMatchMetric` and an `ExactMatchScorer` BYOB wrapper. + ### Adapter Proxy (Breaking — replaces LiteLLM) - **LiteLLM removed**: The `litellm` dependency, `proxy` and `proxy-full` extras, and `litellm_settings` config field are all removed. The adapter proxy is now built-in with zero external proxy dependencies. diff --git a/examples/benchmarks/exact_match_metric_poc.py b/examples/benchmarks/exact_match_metric_poc.py new file mode 100644 index 000000000..4c31b9dc3 --- /dev/null +++ b/examples/benchmarks/exact_match_metric_poc.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0
"""Example BYOB benchmark using a class-based Metric as its scorer."""

from nemo_evaluator.environments.custom import benchmark, scorer
from nemo_evaluator.metrics import ExactMatchMetric
from nemo_evaluator.scorers import ExactMatchScorer
from nemo_evaluator.scoring import MetricInput, MetricOutput, MetricOutputSpec, MetricResult


def _dataset() -> list[dict[str, str]]:
    """Return the two in-memory question/answer rows shared by every example benchmark."""
    qa_pairs = (
        ("What is the capital of France?", "Paris"),
        ("What is 2 + 2?", "4"),
    )
    return [{"question": question, "answer": answer} for question, answer in qa_pairs]


# Mode 1: use a preannotated scorer wrapper exported by the OSS scorer layer.
benchmark(
    name="exact-match-preannotated-scorer-poc",
    dataset=_dataset,
    prompt="{question}",
    target_field="answer",
)(ExactMatchScorer(reference="{{item.answer}}"))


class InlineExactMatchMetric:
    """Minimal hand-written metric that emits a single ``correct`` score per row."""

    type = "inline-exact-match"

    def __init__(self, *, reference: str, candidate: str | None = None) -> None:
        """Store the reference (literal or the ``{{item.answer}}`` marker) and an optional fixed candidate."""
        self.reference = reference
        self.candidate = candidate

    def output_spec(self) -> list[MetricOutputSpec]:
        """Declare the one continuous output, ``correct``."""
        return [MetricOutputSpec.continuous_score("correct")]

    async def compute_scores(self, input: MetricInput) -> MetricResult:
        """Score 1.0 when the candidate equals the reference exactly, otherwise 0.0."""
        # Only the literal marker "{{item.answer}}" is understood here; any other
        # reference string is compared verbatim (no template rendering happens).
        if self.reference == "{{item.answer}}":
            expected = input.row.data.get("answer")
        else:
            expected = self.reference

        # A configured candidate overrides the model output text.
        if self.candidate is None:
            actual = input.candidate.output_text
        else:
            actual = self.candidate

        value = 1.0 if actual == expected else 0.0
        return MetricResult(outputs=[MetricOutput(name="correct", value=value)])


# Mode 2: annotate a local class at the benchmark call site, then configure it.
benchmark(
    name="exact-match-inline-class-scorer-poc",
    dataset=_dataset,
    prompt="{question}",
    target_field="answer",
)(scorer(InlineExactMatchMetric)(reference="{{item.answer}}"))


# Mode 3: adapt an already-configured reusable metric instance at the call site.
+benchmark( + name="exact-match-metric-instance-poc", + dataset=_dataset, + prompt="{question}", + target_field="answer", +)(scorer(ExactMatchMetric(reference="{{item.answer}}"))) diff --git a/src/nemo_evaluator/__init__.py b/src/nemo_evaluator/__init__.py index ffba7adcb..5835cd8ac 100644 --- a/src/nemo_evaluator/__init__.py +++ b/src/nemo_evaluator/__init__.py @@ -16,22 +16,32 @@ __version__ = "0.12.0" +from nemo_evaluator.engine.eval_loop import run_evaluation +from nemo_evaluator.engine.model_client import ModelClient from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult from nemo_evaluator.environments.custom import benchmark, scorer from nemo_evaluator.environments.registry import get_environment, list_environments, load_benchmark_file, register -from nemo_evaluator.engine.eval_loop import run_evaluation -from nemo_evaluator.engine.model_client import ModelClient -from nemo_evaluator.solvers import ( - ChatSolver, - CompletionSolver, - NatSolver, - OpenClawSolver, - Solver, - SolveResult, - VLMSolver, -) +from nemo_evaluator.metrics import ExactMatchMetric +from nemo_evaluator.scorers import ExactMatchScorer from nemo_evaluator.scoring import ( + BooleanValue, + CandidateOutput, + ContinuousScore, + DatasetRow, + DiscreteScore, + Label, + Metric, + MetricDescriptor, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerCallable, + ScorerConfig, + ScorerFunctionMetric, ScorerInput, + ScorerReturn, answer_line, code_sandbox, code_sandbox_async, @@ -40,6 +50,16 @@ multichoice_regex, needs_judge, numeric_match, + score_names_from_output_spec, +) +from nemo_evaluator.solvers import ( + ChatSolver, + CompletionSolver, + NatSolver, + OpenClawSolver, + Solver, + SolveResult, + VLMSolver, ) __all__ = [ @@ -65,6 +85,26 @@ "benchmark", "scorer", "ScorerInput", + "ExactMatchMetric", + "ExactMatchScorer", + "Metric", + "BooleanValue", + "DatasetRow", + "CandidateOutput", + "ContinuousScore", + 
"DiscreteScore", + "Label", + "MetricInput", + "MetricOutput", + "MetricOutputSpec", + "MetricDescriptor", + "MetricResult", + "MetricScorerFunction", + "ScorerCallable", + "ScorerConfig", + "ScorerFunctionMetric", + "ScorerReturn", + "score_names_from_output_spec", # Scoring primitives "exact_match", "multichoice_regex", diff --git a/src/nemo_evaluator/environments/custom.py b/src/nemo_evaluator/environments/custom.py index 5d0e8cdd2..0f8d8dac0 100644 --- a/src/nemo_evaluator/environments/custom.py +++ b/src/nemo_evaluator/environments/custom.py @@ -41,16 +41,35 @@ def score(sample: ScorerInput) -> dict: import json import logging import random +from collections.abc import Mapping from dataclasses import dataclass, field from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Any, Callable, Protocol, TypeVar, cast, overload, runtime_checkable + +from pydantic import BaseModel if TYPE_CHECKING: from nemo_evaluator.sandbox.base import Sandbox from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult -from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec from nemo_evaluator.environments.registry import register +from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec +from nemo_evaluator.scoring.metric import ( + CandidateOutput, + DatasetRow, + Metric, + MetricDescriptor, + MetricInput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerCallable, + ScorerConfig, + ScorerFunctionMetric, + ScorerReturn, + score_names_from_output_spec, + validate_metric_result, +) from nemo_evaluator.scoring.types import ScorerInput logger = logging.getLogger(__name__) @@ -59,10 +78,23 @@ def score(sample: ScorerInput) -> dict: # ── Data types ──────────────────────────────────────────────────────────── +ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel) +ConfigModelT = TypeVar("ConfigModelT", bound=BaseModel) +MetricClassT = 
TypeVar("MetricClassT", bound=type[Metric]) + + +@runtime_checkable +class _MetricScorer(Protocol): + @property + def descriptor(self) -> MetricDescriptor: ... + + def to_metric(self) -> Metric: ... + + @dataclass class BenchmarkDefinition: name: str - dataset: str | Callable[[], list[dict]] + dataset: str | Callable[..., list[dict[str, Any]]] prompt: str target_field: str = "target" endpoint_type: str = "chat" @@ -70,20 +102,50 @@ class BenchmarkDefinition: field_mapping: dict[str, str] | None = None extra: dict[str, Any] = field(default_factory=dict) requirements: list[str] | None = None - scorer_fn: Callable[[ScorerInput], dict] | None = None - prepare_row: Callable[[dict, int, random.Random], dict] | None = None - seed_fn: Callable[[dict, int], SeedResult] | None = None - image_builder_fn: Callable[[list[dict]], ImageBuildRequest] | None = None + scorer_fn: Callable[..., ScorerReturn] | _MetricScorer | None = None + prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None + seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None + image_builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest] | None = None _BYOB_REGISTRY: dict[str, BenchmarkDefinition] = {} +def _attach_metric_scorer_instance(instance: Metric) -> Metric: + object.__setattr__(instance, "descriptor", MetricDescriptor(type=instance.type, outputs=instance.output_spec())) + object.__setattr__(instance, "to_metric", lambda: instance) + return instance + + +def _decorate_metric_scorer_class(cls: MetricClassT) -> MetricClassT: + """Decorate a class-based ``Metric`` so instances can be BYOB scorers.""" + original_init = cast(Callable[..., None], cls.__init__) + + def __init__(self: Metric, *args: object, **kwargs: object) -> None: + original_init(self, *args, **kwargs) + _attach_metric_scorer_instance(self) + + scorer_cls = type( + cls.__name__, + (cls,), + { + "__doc__": cls.__doc__, + "__init__": __init__, + "__module__": cls.__module__, + 
"__qualname__": cls.__qualname__, + }, + ) + return cast(MetricClassT, scorer_cls) + + # ── Dataset loading ─────────────────────────────────────────────────────── -def _load_dataset_from_spec(spec: str | Callable, num_examples: int | None = None) -> list[dict[str, Any]]: - if callable(spec): +def _load_dataset_from_spec( + spec: str | Callable[..., list[dict[str, Any]]], + num_examples: int | None = None, +) -> list[dict[str, Any]]: + if not isinstance(spec, str): import inspect sig = inspect.signature(spec) @@ -148,7 +210,7 @@ def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]] return [dict(row) for row in ds] -def _format_prompt(template: str, row: dict, field_mapping: dict | None = None) -> str: +def _format_prompt(template: str, row: dict[str, Any], field_mapping: dict[str, str] | None = None) -> str: data = dict(row) if field_mapping: for src, dst in field_mapping.items(): @@ -235,27 +297,108 @@ async def verify(self, response: str, expected: str, sandbox: Sandbox | None = N import asyncio + if isinstance(self._defn.scorer_fn, _MetricScorer): + metric = self._defn.scorer_fn.to_metric() + if isinstance(metric, ScorerFunctionMetric): + metric = metric.bind_raw_config( + config=self._defn.extra, + sandbox=sandbox, + target=expected, + ) + metric_input = _metric_input_from_verify( + response=response, + metadata=meta, + ) + result = validate_metric_result(await metric.compute_scores(metric_input), metric.output_spec()) + return _metric_result_to_verify_result( + metric=metric, + result=result, + benchmark_name=self._defn.name, + response=response, + ) + sample = ScorerInput( response=response, target=expected, metadata=meta, config=self._defn.extra, sandbox=sandbox ) - scores = self._defn.scorer_fn(sample) - if asyncio.iscoroutine(scores): - scores = await scores - - reward = float(scores.get("correct", scores.get("reward", next(iter(scores.values()), 0)))) + scores_result = self._defn.scorer_fn(sample) + if 
asyncio.iscoroutine(scores_result): + scores_result = await scores_result + scores = cast(Mapping[str, object], scores_result) + + reward_value = scores.get("correct", scores.get("reward", next(iter(scores.values()), 0))) + reward = float(reward_value) if isinstance(reward_value, bool | int | float) else 0.0 + extracted = scores.get("extracted") return VerifyResult( reward=reward, - extracted_answer=scores.get("extracted", response.strip()[:200]), - scoring_details={"method": f"byob_{self._defn.name}", **scores}, + extracted_answer=extracted if isinstance(extracted, str) else response.strip()[:200], + scoring_details={"method": f"byob_{self._defn.name}", **dict(scores)}, ) +def _metric_input_from_verify( + *, + response: str, + metadata: dict[str, Any], +) -> MetricInput: + row_data: dict[str, object] = dict(metadata) + return MetricInput( + row=DatasetRow(data=row_data), + candidate=CandidateOutput(output_text=response), + ) + + +def _metric_result_to_verify_result( + *, + metric: Metric, + result: MetricResult, + benchmark_name: str, + response: str, +) -> VerifyResult: + outputs = {output.name: output.value for output in result.outputs} + score_names = score_names_from_output_spec(metric.output_spec()) + scores = {name: _score_value(outputs[name]) for name in score_names if name in outputs} + reward_name = _select_reward_score_name(scores=scores, declared=score_names) + extracted = outputs.get("extracted") + + scoring_details: dict[str, Any] = { + "method": f"byob_{benchmark_name}", + "metric_type": metric.type, + "outputs": outputs, + } + for name, value in outputs.items(): + scoring_details.setdefault(name, value) + + return VerifyResult( + reward=scores[reward_name] if reward_name is not None else 0.0, + extracted_answer=extracted if isinstance(extracted, str) else response.strip()[:200], + scoring_details=scoring_details, + ) + + +def _select_reward_score_name(*, scores: dict[str, float], declared: list[str]) -> str | None: + for preferred in ("reward", 
"correct"): + if preferred in scores: + return preferred + for name in declared: + if name in scores: + return name + return next(iter(scores), None) + + +def _score_value(value: object) -> float: + if isinstance(value, bool): + return 1.0 if value else 0.0 + if isinstance(value, int | float): + return float(value) + raise TypeError(f"Metric score output must be bool, int, or float, got {type(value).__name__}") + + # ── Decorators ──────────────────────────────────────────────────────────── def benchmark( name: str, - dataset: str | Callable, + dataset: str | Callable[..., list[dict[str, Any]]], prompt: str = "", target_field: str = "target", endpoint_type: str = "chat", @@ -263,9 +406,9 @@ def benchmark( field_mapping: dict[str, str] | None = None, extra: dict[str, Any] | None = None, requirements: list[str] | None = None, - prepare_row: Callable | None = None, - seed_fn: Callable | None = None, - **kwargs, + prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None, + seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None, + **kwargs: Any, ): """Register a benchmark. Decorate a scorer function.""" defn = BenchmarkDefinition( @@ -300,13 +443,156 @@ def __init__(self, num_examples: int | None = None): return decorator -def scorer(fn: Callable[[ScorerInput], dict]) -> Callable[[ScorerInput], dict]: - """Marks a function as a scorer.""" - fn._is_scorer = True # type: ignore[attr-defined] - return fn +@overload +def scorer( + fn: None = None, + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: type[ConfigModelT], +) -> Callable[[ScorerCallable[ConfigModelT]], MetricScorerFunction[ConfigModelT]]: ... + + +@overload +def scorer( + fn: ScorerCallable[ConfigModelT], + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: type[ConfigModelT], +) -> MetricScorerFunction[ConfigModelT]: ... 
-def image_builder(builder_fn: Callable[[list[dict]], ImageBuildRequest]): +@overload +def scorer( + fn: None = None, + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: None = None, +) -> Callable[[ScorerCallable[ConfigT]], MetricScorerFunction[ConfigT]]: ... + + +@overload +def scorer( + fn: ScorerCallable[ConfigT], + *, + metric_type: str, + outputs: list[MetricOutputSpec], + config_schema: None = None, +) -> MetricScorerFunction[ConfigT]: ... + + +@overload +def scorer(fn: ScorerCallable[ConfigT]) -> ScorerCallable[ConfigT]: ... + + +@overload +def scorer(fn: MetricClassT) -> MetricClassT: ... + + +@overload +def scorer(fn: Metric) -> _MetricScorer: ... + + +@overload +def scorer( + fn: None = None, + *, + metric_type: None = None, + outputs: None = None, + config_schema: None = None, +) -> Callable[[ScorerCallable[ConfigT]], ScorerCallable[ConfigT]]: ... + + +def scorer( + fn: Callable[..., ScorerReturn] | Metric | type[Metric] | None = None, + *, + metric_type: str | None = None, + outputs: list[MetricOutputSpec] | None = None, + config_schema: type[BaseModel] | None = None, +) -> object: + """Marks a function or configured class-based Metric as a scorer. + + Plain ``@scorer`` keeps the current ``ScorerInput -> dict`` behavior. + ``@scorer(metric_type=..., outputs=...)`` exposes ``descriptor`` and + ``to_metric()`` for adapting scorer functions to the shared Metric protocol. + Class-based metrics can use ``@scorer`` on the class, then pass configured + metric instances directly to ``@benchmark``. 
+ """ + if isinstance(fn, type): + if any(option is not None for option in (metric_type, outputs, config_schema)): + raise ValueError("class-based Metric scorers do not accept @scorer metric contract options") + return _decorate_metric_scorer_class(cast(type[Metric], fn)) + + if fn is not None and isinstance(fn, Metric): + if any(option is not None for option in (metric_type, outputs, config_schema)): + raise ValueError("class-based Metric scorer instances do not accept @scorer metric contract options") + return _attach_metric_scorer_instance(fn) + + if fn is not None and not callable(fn): + raise TypeError("class-based Metric scorer instances must implement the Metric protocol") + + if outputs is None and (metric_type is not None or config_schema is not None): + metric_options = [ + option + for option, value in ( + ("metric_type=...", metric_type), + ("config_schema=...", config_schema), + ) + if value is not None + ] + raise ValueError( + f"@scorer({', '.join(metric_options)}) opts into the Metric contract, but no outputs were declared. " + "Pass outputs=[MetricOutputSpec(...)] so the metric descriptor can declare and validate outputs." + ) + if outputs is not None and metric_type is None: + raise ValueError( + "@scorer(outputs=...) opts into the Metric contract, but no metric_type was declared. " + "Pass metric_type='...' so the metric has a stable identity across refactors." 
+ ) + + def decorate(fn: Callable[..., ScorerReturn]) -> object: + return _decorate_scorer( + cast(ScorerCallable[ScorerConfig], fn), + metric_type=metric_type, + outputs=outputs, + config_schema=config_schema, + ) + + return decorate(cast(Callable[..., ScorerReturn], fn)) if fn is not None else decorate + + +def _decorate_scorer( + fn: ScorerCallable[ConfigT], + *, + metric_type: str | None = None, + outputs: list[MetricOutputSpec] | None = None, + config_schema: type[BaseModel] | None = None, +): + setattr(fn, "_is_scorer", True) + if outputs is None: + return fn + if metric_type is None: + raise ValueError("metric_type is required when outputs are declared") + + descriptor = MetricDescriptor( + type=metric_type, + outputs=outputs, + config_schema=config_schema, + ) + + def to_metric() -> ScorerFunctionMetric[ConfigT]: + return ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=fn, + ) + + setattr(fn, "descriptor", descriptor) + setattr(fn, "to_metric", to_metric) + return fn + +def image_builder(builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest]): """Declare images that need building, stacked with ``@benchmark``. ``builder_fn`` receives the dataset rows and returns an diff --git a/src/nemo_evaluator/metrics/__init__.py b/src/nemo_evaluator/metrics/__init__.py index a40ca42fd..d13b6fb50 100644 --- a/src/nemo_evaluator/metrics/__init__.py +++ b/src/nemo_evaluator/metrics/__init__.py @@ -27,8 +27,10 @@ permutation_test, sign_test, ) +from nemo_evaluator.metrics.exact_match import ExactMatchMetric __all__ = [ + "ExactMatchMetric", "McNemarResult", "POWER_80_FACTOR", "PermutationResult", diff --git a/src/nemo_evaluator/metrics/exact_match.py b/src/nemo_evaluator/metrics/exact_match.py new file mode 100644 index 000000000..8cd4beb1d --- /dev/null +++ b/src/nemo_evaluator/metrics/exact_match.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Class-based exact-match metric implementation.""" + +from __future__ import annotations + +import re +import string +from typing import Literal + +from jinja2 import Environment, StrictUndefined +from pydantic import BaseModel, ConfigDict, Field + +from nemo_evaluator.scoring import MetricInput, MetricOutput, MetricOutputSpec, MetricResult + +_JINJA_ENV = Environment(undefined=StrictUndefined, autoescape=False) + +__all__ = ["ExactMatchMetric"] + + +class ExactMatchMetric(BaseModel): + """Exact-match metric using the shared MetricInput -> MetricResult contract.""" + + model_config = ConfigDict(extra="forbid") + + reference: str = Field(description="Jinja template for the expected reference answer.") + candidate: str | None = Field( + default=None, + description="Optional Jinja template for the candidate. 
Defaults to sample.output_text.", + ) + type: Literal["exact-match"] = Field(default="exact-match", description="Stable metric type identifier.") + + def output_spec(self) -> list[MetricOutputSpec]: + return [MetricOutputSpec.continuous_score("correct")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + item, sample = _template_payload_from_metric_input(input) + context = _build_template_context(item, sample) + reference = _render_template(self.reference, context) + candidate = _render_template(self.candidate, context) if self.candidate is not None else sample.get("output_text") + if not isinstance(reference, str): + raise TypeError("ExactMatchMetric reference must render to a string.") + if not isinstance(candidate, str): + raise TypeError("ExactMatchMetric candidate must render to a string.") + correct = 1.0 if _normalize(candidate) == _normalize(reference) else 0.0 + return MetricResult(outputs=[MetricOutput(name="correct", value=correct)]) + + +def _template_payload_from_metric_input(input: MetricInput) -> tuple[dict[str, object], dict[str, object]]: + item = dict(input.row.data) + sample = dict(input.candidate.metadata) + if input.candidate.output_text is not None: + sample["output_text"] = input.candidate.output_text + if input.candidate.response is not None: + sample["response"] = input.candidate.response + if input.candidate.trajectory is not None: + sample["trajectory"] = input.candidate.trajectory + return item, sample + + +def _build_template_context(item: dict[str, object], sample: dict[str, object]) -> dict[str, object]: + return {**item, **sample, "item": item, "sample": sample} + + +def _render_template(template: str, context: dict[str, object]) -> str: + return _JINJA_ENV.from_string(template).render(context) + + +def _normalize(value: str) -> str: + value = value.lower() + value = re.sub(r"\b(a|an|the)\b", " ", value) + value = "".join(ch for ch in value if ch not in set(string.punctuation)) + return " ".join(value.split()) 
diff --git a/src/nemo_evaluator/scorers.py b/src/nemo_evaluator/scorers.py new file mode 100644 index 000000000..3d60739c7 --- /dev/null +++ b/src/nemo_evaluator/scorers.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""BYOB scorer adapters for reusable Metric implementations.""" + +from __future__ import annotations + +from nemo_evaluator.environments.custom import scorer +from nemo_evaluator.metrics import ExactMatchMetric + +__all__ = ["ExactMatchScorer"] + + +ExactMatchScorer = scorer(ExactMatchMetric) diff --git a/src/nemo_evaluator/scoring/__init__.py b/src/nemo_evaluator/scoring/__init__.py index 3ca193cf7..7aea45602 100644 --- a/src/nemo_evaluator/scoring/__init__.py +++ b/src/nemo_evaluator/scoring/__init__.py @@ -22,6 +22,8 @@ from typing import Callable +from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async +from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema from nemo_evaluator.scoring.judge import ( JudgeScoringConfig, build_judge_prompt, @@ -29,9 +31,27 @@ needs_judge, parse_judge_response, ) -from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema +from nemo_evaluator.scoring.metric import ( + BooleanValue, + CandidateOutput, + ContinuousScore, + DatasetRow, + DiscreteScore, + Label, + Metric, + MetricDescriptor, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerCallable, + ScorerConfig, + ScorerFunctionMetric, + ScorerReturn, + score_names_from_output_spec, +) from nemo_evaluator.scoring.pattern import answer_line, multichoice_regex, numeric_match -from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async from nemo_evaluator.scoring.text import exact_match, extract_mcq_letter, fuzzy_match from nemo_evaluator.scoring.types import ScorerInput @@ -65,6 +85,24 @@ def list_scorers() -> 
list[str]: __all__ = [ "ScorerInput", + "Metric", + "BooleanValue", + "DatasetRow", + "ContinuousScore", + "CandidateOutput", + "DiscreteScore", + "Label", + "MetricInput", + "MetricOutput", + "MetricOutputSpec", + "MetricDescriptor", + "MetricResult", + "MetricScorerFunction", + "ScorerCallable", + "ScorerConfig", + "ScorerFunctionMetric", + "ScorerReturn", + "score_names_from_output_spec", "get_scorer", "list_scorers", # Text diff --git a/src/nemo_evaluator/scoring/metric.py b/src/nemo_evaluator/scoring/metric.py new file mode 100644 index 000000000..c4bf72d20 --- /dev/null +++ b/src/nemo_evaluator/scoring/metric.py @@ -0,0 +1,368 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Shared MetricInput -> MetricResult runtime contract.""" + +from __future__ import annotations + +import inspect +from collections.abc import Awaitable, Callable, Mapping +from typing import Generic, Protocol, TypeVar, cast, runtime_checkable + +from pydantic import BaseModel, ConfigDict, Field, RootModel, SkipValidation, field_validator, model_validator + +from nemo_evaluator.sandbox.base import Sandbox +from nemo_evaluator.scoring.types import ScorerInput + +ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel) +SchemaT = TypeVar("SchemaT", bound=BaseModel) + + +class DatasetRow(BaseModel): + """Original benchmark dataset row plus optional stable row identity.""" + + model_config = ConfigDict(extra="forbid") + + row_index: int | None = None + data: dict[str, object] + + +class CandidateOutput(BaseModel): + """Candidate output being scored for one dataset row.""" + + model_config = ConfigDict(extra="forbid") + + output_text: str | None = None + response: object | None = None + trajectory: object | None = None + metadata: dict[str, object] = Field(default_factory=dict) + + +class MetricInput(BaseModel): + """Complete per-row scoring input passed to a metric.""" + + model_config = ConfigDict(extra="forbid") + + row: DatasetRow + candidate: CandidateOutput + + +class ContinuousScore(RootModel[float]): + """Continuous numeric metric value.""" + + +class DiscreteScore(RootModel[int]): + """Discrete numeric metric value.""" + + +class Label(RootModel[str]): + """String label metric value.""" + + +class BooleanValue(RootModel[bool]): + """Boolean metric value.""" + + +class MetricOutputSpec(BaseModel, Generic[SchemaT]): + """Schema for one named value emitted by a metric.""" + + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) + + name: str + description: str | None = None + value_schema: type[SchemaT] + + @field_validator("name") + @classmethod + def _name_must_not_be_empty(cls, value: str) -> str: + if not value: + raise 
ValueError("metric output name must not be empty") + return value + + @staticmethod + def continuous_score(name: str, description: str | None = None) -> MetricOutputSpec[ContinuousScore]: + return MetricOutputSpec[ContinuousScore](name=name, description=description, value_schema=ContinuousScore) + + @staticmethod + def discrete_score(name: str, description: str | None = None) -> MetricOutputSpec[DiscreteScore]: + return MetricOutputSpec[DiscreteScore](name=name, description=description, value_schema=DiscreteScore) + + @staticmethod + def label(name: str, description: str | None = None) -> MetricOutputSpec[Label]: + return MetricOutputSpec[Label](name=name, description=description, value_schema=Label) + + @staticmethod + def boolean(name: str, description: str | None = None) -> MetricOutputSpec[BooleanValue]: + return MetricOutputSpec[BooleanValue](name=name, description=description, value_schema=BooleanValue) + + @staticmethod + def model( + name: str, + value_schema: type[SchemaT], + description: str | None = None, + ) -> MetricOutputSpec[SchemaT]: + return MetricOutputSpec[SchemaT](name=name, description=description, value_schema=value_schema) + + def coerce_value(self, value: object) -> SchemaT: + """Validate and coerce a raw output value to this spec's declared schema.""" + return self.value_schema.model_validate(value) + + def coerce_output(self, output: MetricOutput) -> SchemaT: + """Validate and coerce a named metric output against this spec.""" + if output.name != self.name: + raise ValueError(f"Expected metric output {self.name!r}, got {output.name!r}") + return self.coerce_value(output.value) + + def value_json_schema(self) -> dict[str, object]: + return self.value_schema.model_json_schema() + + +class MetricDescriptor(BaseModel): + """Metadata needed to materialize a decorated scorer as a Metric.""" + + model_config = ConfigDict(extra="forbid") + + type: str + outputs: list[MetricOutputSpec] = Field(min_length=1) + config_schema: type[BaseModel] | None = 
None + + @field_validator("type") + @classmethod + def _type_must_not_be_empty(cls, value: str) -> str: + if not value: + raise ValueError("metric type must not be empty") + return value + + + @field_validator("outputs") + @classmethod + def _output_names_must_be_unique( + cls, value: list[MetricOutputSpec] + ) -> list[MetricOutputSpec]: + names = [output.name for output in value] + duplicates = sorted({name for name in names if names.count(name) > 1}) + if duplicates: + raise ValueError(f"duplicate metric output names: {duplicates}") + return value + + +class MetricOutput(BaseModel): + """One named value emitted by a metric.""" + + model_config = ConfigDict(extra="forbid") + + name: str + value: object + + +class MetricResult(BaseModel): + """Structured row-level metric result.""" + + model_config = ConfigDict(extra="forbid") + + outputs: list[MetricOutput] + + +@runtime_checkable +class Metric(Protocol): + """Shared row-scoring primitive.""" + + @property + def type(self) -> str: ... + + def output_spec(self) -> list[MetricOutputSpec]: ... + + async def compute_scores(self, input: MetricInput) -> MetricResult: ... + + +ScorerReturn = Mapping[str, object] | Awaitable[Mapping[str, object]] +ScorerCallable = Callable[[ScorerInput[ConfigT]], ScorerReturn] +ScorerConfig = Mapping[str, object] | BaseModel + + +class MetricScorerFunction(Protocol[ConfigT]): + """Decorated scorer function that can be materialized as a metric.""" + + @property + def descriptor(self) -> MetricDescriptor: ... + + def __call__(self, sample: ScorerInput[ConfigT]) -> ScorerReturn: ... + + def to_metric(self) -> ScorerFunctionMetric[ConfigT]: ... 
+ + +class ScorerFunctionMetric(BaseModel, Generic[ConfigT]): + """Metric adapter for decorator-authored OSS ScorerInput -> dict scorers.""" + + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) + + descriptor: MetricDescriptor + scorer_fn: ScorerCallable[ConfigT] + config: ConfigT | None = None + sandbox: SkipValidation[Sandbox] | None = None + target: object | None = None + target_field: str = "target" + + @model_validator(mode="after") + def _validate_bound_config(self) -> ScorerFunctionMetric[ConfigT]: + if self.config is not None: + self.config = self._validate_config(self.config) + return self + + @property + def type(self) -> str: + return self.descriptor.type + + def bind( + self, + *, + config: ConfigT | None = None, + sandbox: "Sandbox | None" = None, + target: object | None = None, + target_field: str | None = None, + ) -> ScorerFunctionMetric[ConfigT]: + validated_config = self.config if config is None else self._validate_config(config) + return self.model_copy( + update={ + "config": validated_config, + "sandbox": self.sandbox if sandbox is None else sandbox, + "target": self.target if target is None else target, + "target_field": self.target_field if target_field is None else target_field, + } + ) + + def bind_raw_config( + self, + *, + config: ScorerConfig | None = None, + sandbox: "Sandbox | None" = None, + target: object | None = None, + target_field: str | None = None, + ) -> ScorerFunctionMetric[ConfigT]: + """Bind dict-like runtime config, validating it against ``config_schema`` when present.""" + validated_config = self.config if config is None else self._validate_config(config, coerce=True) + return self.model_copy( + update={ + "config": validated_config, + "sandbox": self.sandbox if sandbox is None else sandbox, + "target": self.target if target is None else target, + "target_field": self.target_field if target_field is None else target_field, + } + ) + + def output_spec(self) -> list[MetricOutputSpec]: + return 
self.descriptor.outputs + + async def compute_scores(self, input: MetricInput) -> MetricResult: + sample: ScorerInput[ConfigT] = ScorerInput( + response=input.candidate.output_text or "", + target=self.target if self.target is not None else input.row.data.get(self.target_field), + metadata=dict(input.row.data), + config=cast(ConfigT, self._resolve_config()), + sandbox=self.sandbox, + ) + result = self.scorer_fn(sample) + if inspect.isawaitable(result): + result = await result + if not isinstance(result, Mapping): + raise TypeError(f"scorer_fn must return a mapping, got {type(result).__name__}") + metric_result = MetricResult( + outputs=[MetricOutput(name=name, value=value) for name, value in cast(Mapping[str, object], result).items()] + ) + return validate_metric_result(metric_result, self.descriptor.outputs) + + def _validate_config( + self, config: ConfigT | ScorerConfig, *, coerce: bool = False + ) -> ConfigT: + schema = self.descriptor.config_schema + if schema is None: + if isinstance(config, BaseModel): + return cast(ConfigT, config.model_dump()) + return cast(ConfigT, dict(cast(Mapping[str, object], config))) + if isinstance(config, schema): + return cast(ConfigT, config) + if not coerce: + raise TypeError( + f"config must be an instance of {schema.__name__}; " + "use bind_raw_config(...) 
to validate dict-like runtime config" + ) + payload = ( + cast(Mapping[str, object], config.model_dump()) + if isinstance(config, BaseModel) + else dict(cast(Mapping[str, object], config)) + ) + return cast(ConfigT, schema.model_validate(payload)) + + def _resolve_config(self) -> ConfigT | Mapping[str, object]: + if self.config is not None: + return self.config + schema = self.descriptor.config_schema + if schema is None: + return {} + return cast(ConfigT, schema.model_validate({})) + + +def validate_metric_result(result: MetricResult, outputs: list[MetricOutputSpec]) -> MetricResult: + """Validate a metric result against its declared outputs.""" + returned_names = [output.name for output in result.outputs] + duplicates = sorted({name for name in returned_names if returned_names.count(name) > 1}) + if duplicates: + raise ValueError(f"Duplicate metric output names: {duplicates}") + + outputs_by_name = {output.name: output for output in outputs} + declared_names = [output.name for output in outputs] + declared = set(declared_names) + returned = set(returned_names) + missing = [name for name in declared_names if name not in returned] + undeclared = [name for name in returned_names if name not in declared] + + if missing: + raise ValueError(f"Missing declared metric outputs: {missing}") + if undeclared: + raise ValueError(f"Undeclared metric outputs: {undeclared}") + for output in result.outputs: + outputs_by_name[output.name].coerce_output(output) + return result + + +def score_names_from_output_spec(outputs: list[MetricOutputSpec]) -> list[str]: + """Return declared numeric score names from metric output specs.""" + return [ + output.name + for output in outputs + if issubclass(output.value_schema, ContinuousScore | DiscreteScore | BooleanValue) + ] + + +__all__ = [ + "BooleanValue", + "CandidateOutput", + "ContinuousScore", + "DatasetRow", + "DiscreteScore", + "Label", + "Metric", + "MetricDescriptor", + "MetricInput", + "MetricOutput", + "MetricOutputSpec", + 
"MetricResult", + "MetricScorerFunction", + "ScorerCallable", + "ScorerConfig", + "ScorerFunctionMetric", + "ScorerReturn", + "score_names_from_output_spec", + "validate_metric_result", +] diff --git a/src/nemo_evaluator/scoring/types.py b/src/nemo_evaluator/scoring/types.py index fa2426a5a..fe3a7fa43 100644 --- a/src/nemo_evaluator/scoring/types.py +++ b/src/nemo_evaluator/scoring/types.py @@ -17,14 +17,17 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Generic, TypeVar if TYPE_CHECKING: from nemo_evaluator.sandbox.base import Sandbox +ConfigT = TypeVar("ConfigT") + + @dataclass -class ScorerInput: +class ScorerInput(Generic[ConfigT]): """Input passed to scorer functions. The ``sandbox`` field is available for scorers that need to inspect or @@ -36,5 +39,5 @@ class ScorerInput: response: str target: Any metadata: dict[str, Any] = field(default_factory=dict) - config: dict[str, Any] = field(default_factory=dict) + config: ConfigT = field(default_factory=dict) sandbox: Sandbox | None = None diff --git a/tests/test_environments/test_custom_metric_contract.py b/tests/test_environments/test_custom_metric_contract.py new file mode 100644 index 000000000..27386194c --- /dev/null +++ b/tests/test_environments/test_custom_metric_contract.py @@ -0,0 +1,230 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for BYOB scorer compatibility with the shared metric contract.""" + +from __future__ import annotations + +from typing import Protocol, cast + +import pytest +from pydantic import BaseModel + +from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, scorer +from nemo_evaluator.metrics import ExactMatchMetric +from nemo_evaluator.scorers import ExactMatchScorer +from nemo_evaluator.scoring import ScorerInput +from nemo_evaluator.scoring.metric import Metric, MetricDescriptor, MetricInput, MetricOutputSpec, MetricResult +from nemo_evaluator.sandbox.base import Sandbox + + +def _dataset() -> list[dict[str, object]]: + return [{"question": "2+2", "answer": "4", "category": "math"}] + + +class ThresholdConfig(BaseModel): + threshold: float + + +class _MetricScorerForTest(Protocol): + @property + def descriptor(self) -> MetricDescriptor: ... + + def to_metric(self) -> Metric: ... 
+ + +@pytest.mark.asyncio +async def test_plain_scorer_decorator_keeps_current_dict_path() -> None: + @scorer + def plain_scorer(sample: ScorerInput) -> dict[str, object]: + assert sample.response == "4" + assert sample.target == "4" + assert sample.metadata["category"] == "math" + assert sample.config["tolerance"] == "exact" + return {"correct": True, "extracted": "4", "label": "exact"} + + env = ByobEnvironment( + BenchmarkDefinition( + name="plain_contract", + dataset=_dataset, + prompt="{question}", + target_field="answer", + extra={"tolerance": "exact"}, + scorer_fn=plain_scorer, + ) + ) + + result = await env.verify("4", "4", category="math") + + assert result.reward == 1.0 + assert result.extracted_answer == "4" + assert result.scoring_details == { + "method": "byob_plain_contract", + "correct": True, + "extracted": "4", + "label": "exact", + } + + +@pytest.mark.asyncio +async def test_typed_scorer_runs_as_metric_through_byob_verify() -> None: + outputs = [ + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.continuous_score("format"), + MetricOutputSpec.label("judge_label"), + MetricOutputSpec.label("extracted"), + ] + sandbox = cast(Sandbox, object()) + + @scorer(metric_type="tests.typed_byob", outputs=outputs, config_schema=ThresholdConfig) + async def typed_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + assert sample.response == "4" + assert sample.target == "4" + assert "answer" not in sample.metadata + assert sample.metadata["category"] == "math" + assert isinstance(sample.config, ThresholdConfig) + assert sample.config.threshold == 0.75 + assert sample.sandbox is sandbox + return {"reward": sample.config.threshold, "format": 1.0, "judge_label": "partial", "extracted": "4"} + + env = ByobEnvironment( + BenchmarkDefinition( + name="typed_contract", + dataset=_dataset, + prompt="{question}", + target_field="answer", + extra={"threshold": "0.75"}, + scorer_fn=typed_scorer, + ) + ) + + result = await env.verify("4", "4", 
sandbox=sandbox, category="math") + + assert result.reward == 0.75 + assert result.extracted_answer == "4" + assert result.scoring_details == { + "method": "byob_typed_contract", + "metric_type": "tests.typed_byob", + "outputs": {"reward": 0.75, "format": 1.0, "judge_label": "partial", "extracted": "4"}, + "reward": 0.75, + "format": 1.0, + "judge_label": "partial", + "extracted": "4", + } + + +@pytest.mark.asyncio +async def test_preannotated_metric_scorer_runs_through_byob_verify() -> None: + env = ByobEnvironment( + BenchmarkDefinition( + name="exact_match_metric", + dataset=_dataset, + prompt="{question}", + target_field="answer", + scorer_fn=cast(_MetricScorerForTest, ExactMatchScorer(reference="{{item.answer}}")), + ) + ) + + matched = await env.verify("4", "4", category="math", answer="4") + mismatched = await env.verify("5", "4", category="math", answer="4") + + assert matched.reward == 1.0 + assert matched.scoring_details == { + "method": "byob_exact_match_metric", + "metric_type": "exact-match", + "outputs": {"correct": 1.0}, + "correct": 1.0, + } + assert mismatched.reward == 0.0 + assert mismatched.scoring_details["outputs"] == {"correct": 0.0} + + +@pytest.mark.asyncio +async def test_configured_metric_instance_runs_through_byob_verify() -> None: + env = ByobEnvironment( + BenchmarkDefinition( + name="exact_match_metric_instance", + dataset=_dataset, + prompt="{question}", + target_field="answer", + scorer_fn=scorer(ExactMatchMetric(reference="{{item.answer}}")), + ) + ) + + result = await env.verify("4", "4", category="math", answer="4") + + assert result.reward == 1.0 + assert result.scoring_details["metric_type"] == "exact-match" + assert result.scoring_details["outputs"] == {"correct": 1.0} + + +@pytest.mark.asyncio +async def test_class_based_metric_result_is_validated_by_byob_verify() -> None: + @scorer + class MissingOutputMetric: + type = "tests.missing_output" + + def output_spec(self) -> list[MetricOutputSpec]: + return 
[MetricOutputSpec.continuous_score("correct")] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + return MetricResult(outputs=[]) + + env = ByobEnvironment( + BenchmarkDefinition( + name="missing_output_metric", + dataset=_dataset, + prompt="{question}", + target_field="answer", + scorer_fn=cast(_MetricScorerForTest, MissingOutputMetric()), + ) + ) + + with pytest.raises(ValueError, match="Missing declared metric outputs"): + await env.verify("4", "4", category="math") + + +@pytest.mark.parametrize( + ("score_names", "score_values", "expected_reward"), + [ + (["reward", "correct"], {"reward": 0.2, "correct": 1.0}, 0.2), + (["quality", "correct"], {"quality": 0.2, "correct": 1.0}, 1.0), + (["quality", "format"], {"quality": 0.4, "format": 1.0}, 0.4), + ], +) +@pytest.mark.asyncio +async def test_typed_scorer_reward_selection( + score_names: list[str], score_values: dict[str, float], expected_reward: float +) -> None: + outputs = [MetricOutputSpec.continuous_score(name) for name in score_names] + + @scorer(metric_type=f"tests.reward_selection.{score_names[0]}", outputs=outputs) + def typed_scorer(sample: ScorerInput) -> dict[str, object]: + assert sample.response == "4" + assert sample.target == "4" + return {name: score_values[name] for name in score_names} + + env = ByobEnvironment( + BenchmarkDefinition( + name=f"typed_reward_{score_names[0]}", + dataset=_dataset, + prompt="{question}", + target_field="answer", + scorer_fn=typed_scorer, + ) + ) + + result = await env.verify("4", "4", category="math") + + assert result.reward == expected_reward diff --git a/tests/test_integration/test_eval_loop_integration.py b/tests/test_integration/test_eval_loop_integration.py index d96098c24..6f8da2588 100644 --- a/tests/test_integration/test_eval_loop_integration.py +++ b/tests/test_integration/test_eval_loop_integration.py @@ -15,11 +15,19 @@ """Integration tests: run_evaluation end-to-end with mock solver.""" import asyncio +from typing import Any from 
nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult +from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, benchmark, scorer +from nemo_evaluator.environments.registry import get_environment from nemo_evaluator.engine.eval_loop import run_evaluation +from nemo_evaluator.metrics import ExactMatchMetric +from nemo_evaluator.scorers import ExactMatchScorer from nemo_evaluator.observability.types import ModelResponse +from nemo_evaluator.scoring import ScorerInput +from nemo_evaluator.scoring.metric import MetricOutputSpec +from nemo_evaluator.sandbox.base import Sandbox from nemo_evaluator.solvers import SolveResult @@ -38,7 +46,9 @@ async def seed(self, idx): r = self._dataset[idx] return SeedResult(prompt=r["q"], expected_answer=r["a"], metadata={"idx": idx}) - async def verify(self, response, expected, **meta): + async def verify( + self, response: str, expected: str, sandbox: Sandbox | None = None, **meta: Any + ) -> VerifyResult: correct = response.strip() == expected.strip() return VerifyResult( reward=1.0 if correct else 0.0, extracted_answer=response.strip(), scoring_details={"method": "exact"} @@ -118,7 +128,7 @@ async def tracking_close(): closed.append(True) await original_close() - env.close = tracking_close + env.close = tracking_close # type: ignore[method-assign] # ty: ignore[invalid-assignment] solver = _MockSolver() asyncio.run(run_evaluation(env, solver, n_repeats=1)) assert closed, "env.close() was not called" @@ -191,6 +201,87 @@ def test_concurrent_execution(self): results = bundle["_results"] assert len(results) == 6 + def test_typed_byob_metric_result_preserved_in_results(self): + outputs = [ + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.continuous_score("format"), + MetricOutputSpec.label("judge_label"), + MetricOutputSpec.label("rationale"), + ] + + @scorer(metric_type="tests.eval_loop_typed", outputs=outputs) + def typed_scorer(sample: ScorerInput) -> dict[str, 
object]: + matched = sample.response == sample.target + return { + "reward": 0.8 if matched else 0.0, + "format": 1.0, + "judge_label": "pass", + "rationale": "answer matched", + } + + env = ByobEnvironment( + BenchmarkDefinition( + name="typed_eval_loop", + dataset=lambda: [{"question": "1+1", "answer": "2"}], + prompt="{question}", + target_field="answer", + scorer_fn=typed_scorer, + ) + ) + solver = _MockSolver() + + bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1)) + + result = bundle["_results"][0] + assert result["reward"] == 0.8 + assert result["scoring_details"]["reward"] == 0.8 + assert result["scoring_details"]["format"] == 1.0 + assert result["scoring_details"]["outputs"] == { + "reward": 0.8, + "format": 1.0, + "judge_label": "pass", + "rationale": "answer matched", + } + assert result["scoring_details"]["judge_label"] == "pass" + assert result["scoring_details"]["rationale"] == "answer matched" + assert result["scoring_details"]["metric_type"] == "tests.eval_loop_typed" + + def test_preannotated_metric_scorer_result_preserved_in_results(self): + benchmark( + name="exact_match_metric_eval_loop_adapter", + dataset=lambda: [{"question": "1+1", "answer": "2"}], + prompt="{question}", + target_field="answer", + )(ExactMatchScorer(reference="{{item.answer}}")) + + env = get_environment("exact_match_metric_eval_loop_adapter") + solver = _MockSolver() + + bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1)) + + result = bundle["_results"][0] + assert result["reward"] == 1.0 + assert result["scoring_details"]["metric_type"] == "exact-match" + assert result["scoring_details"]["outputs"] == {"correct": 1.0} + + def test_configured_metric_instance_result_preserved_in_results(self): + benchmark( + name="exact_match_metric_instance_eval_loop_adapter", + dataset=lambda: [{"question": "1+1", "answer": "2"}], + prompt="{question}", + target_field="answer", + )(scorer(ExactMatchMetric(reference="{{item.answer}}"))) + + env = 
get_environment("exact_match_metric_instance_eval_loop_adapter") + solver = _MockSolver() + + bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1)) + + result = bundle["_results"][0] + assert result["reward"] == 1.0 + assert result["scoring_details"]["metric_type"] == "exact-match" + assert result["scoring_details"]["outputs"] == {"correct": 1.0} + class _MockSolverWithTrajectory: """Always correct; returns a non-empty trajectory so we can assert it survives resume.""" diff --git a/tests/test_scoring/test_metric_contract.py b/tests/test_scoring/test_metric_contract.py new file mode 100644 index 000000000..3e094a487 --- /dev/null +++ b/tests/test_scoring/test_metric_contract.py @@ -0,0 +1,542 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for the shared MetricInput -> MetricResult contract.""" + +from __future__ import annotations + +from typing import Protocol, cast + +import pytest +from pydantic import BaseModel, ValidationError + +from nemo_evaluator.environments.custom import scorer +from nemo_evaluator.metrics import ExactMatchMetric +from nemo_evaluator.scorers import ExactMatchScorer +from nemo_evaluator.scoring import ScorerInput +from nemo_evaluator.scoring.metric import ( + CandidateOutput, + ContinuousScore, + DatasetRow, + Label, + Metric, + MetricDescriptor, + MetricInput, + MetricOutput, + MetricOutputSpec, + MetricResult, + MetricScorerFunction, + ScorerFunctionMetric, + score_names_from_output_spec, + validate_metric_result, +) +from nemo_evaluator.sandbox.base import Sandbox + + +class ThresholdConfig(BaseModel): + threshold: float + label: str = "pass" + + +class OtherThresholdConfig(BaseModel): + threshold: float + label: str = "other" + + +class JudgeDetails(BaseModel): + label: str + rationale: str + confidence: float + + +class _MetricScorerForTest(Protocol): + @property + def descriptor(self) -> MetricDescriptor: ... + + def to_metric(self) -> Metric: ... 
+ + +def test_metric_input_groups_row_and_candidate() -> None: + metric_input = MetricInput( + row=DatasetRow(row_index=7, data={"answer": "Paris", "category": "geography"}), + candidate=CandidateOutput(output_text="Paris", metadata={"model": "mock"}), + ) + + assert metric_input.row.row_index == 7 + assert metric_input.candidate.output_text == "Paris" + assert metric_input.row.data["answer"] == "Paris" + assert not hasattr(metric_input, "sandbox") + assert not hasattr(metric_input, "config") + + +def test_metric_output_spec_convenience_constructors_and_json_schema() -> None: + score = MetricOutputSpec.continuous_score("reward", "Reward score") + label = MetricOutputSpec.label("judge_label") + details = MetricOutputSpec.model("judge_details", JudgeDetails) + + assert score.name == "reward" + assert score.description == "Reward score" + assert score.value_schema is ContinuousScore + assert score.value_json_schema()["type"] == "number" + assert label.value_schema is Label + assert details.value_schema is JudgeDetails + schema_properties = cast(dict[str, object], details.value_json_schema()["properties"]) + confidence_schema = cast(dict[str, object], schema_properties["confidence"]) + assert confidence_schema["type"] == "number" + + +def test_metric_output_spec_coerces_values_to_declared_schema() -> None: + reward = MetricOutputSpec.continuous_score("reward") + details = MetricOutputSpec.model("judge_details", JudgeDetails) + + coerced_reward = reward.coerce_output(MetricOutput(name="reward", value=1)) + coerced_details = details.coerce_value( + {"label": "pass", "rationale": "all checks passed", "confidence": 0.9} + ) + + assert isinstance(coerced_reward, ContinuousScore) + assert coerced_reward.root == 1.0 + assert isinstance(coerced_details, JudgeDetails) + assert coerced_details.label == "pass" + + with pytest.raises(ValueError, match="Expected metric output"): + reward.coerce_output(MetricOutput(name="other", value=1)) + + +@pytest.mark.asyncio +async def 
test_scorer_function_metric_adapts_dict_return_to_metric_outputs() -> None: + outputs = [ + MetricOutputSpec.boolean("correct"), + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.discrete_score("attempts"), + MetricOutputSpec.label("extracted"), + MetricOutputSpec.model("judge", JudgeDetails), + ] + descriptor = MetricDescriptor(type="tests.dict_adapter", outputs=outputs) + + def sync_scorer(sample: ScorerInput) -> dict[str, object]: + return { + "correct": True, + "reward": 0.25, + "attempts": 2, + "extracted": "A", + "judge": {"label": "partial", "rationale": "close", "confidence": 0.5}, + } + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer) + + result = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate")) + ) + + assert {output.name: output.value for output in result.outputs} == { + "correct": True, + "reward": 0.25, + "attempts": 2, + "extracted": "A", + "judge": {"label": "partial", "rationale": "close", "confidence": 0.5}, + } + + +def test_validate_metric_result_accepts_declared_outputs() -> None: + outputs = [ + MetricOutputSpec.continuous_score("reward"), + MetricOutputSpec.boolean("correct"), + MetricOutputSpec.label("label"), + ] + result = MetricResult( + outputs=[ + MetricOutput(name="reward", value=True), + MetricOutput(name="correct", value=True), + MetricOutput(name="label", value="yes"), + ] + ) + + validated = validate_metric_result(result, outputs) + + assert validated.outputs == [ + MetricOutput(name="reward", value=True), + MetricOutput(name="correct", value=True), + MetricOutput(name="label", value="yes"), + ] + + +def test_typed_scorer_decorator_exposes_config_schema() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + + @scorer(metric_type="tests.threshold", outputs=outputs, config_schema=ThresholdConfig) + def threshold_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + return {"reward": 
sample.config.threshold >= 0.5} + + typed_scorer: MetricScorerFunction[ThresholdConfig] = threshold_scorer + metric: ScorerFunctionMetric[ThresholdConfig] = typed_scorer.to_metric() + + assert isinstance(metric, BaseModel) + assert threshold_scorer.descriptor.config_schema is ThresholdConfig + assert metric.type == "tests.threshold" + + +def test_typed_scorer_decorator_requires_metric_type_and_outputs_for_metric_contract_options() -> None: + with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"): + scorer(metric_type="tests.missing_outputs") # type: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] + + with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"): + scorer(config_schema=ThresholdConfig) # type: ignore[reportArgumentType] # ty: ignore[invalid-argument-type] + + with pytest.raises(ValueError, match="no metric_type was declared"): + scorer(outputs=[MetricOutputSpec.continuous_score("reward")]) # type: ignore[call-overload] # ty: ignore[invalid-argument-type] + + +@pytest.mark.asyncio +async def test_scorer_function_metric_prefers_bound_target_over_row_field() -> None: + outputs = [MetricOutputSpec.boolean("reward")] + descriptor = MetricDescriptor(type="tests.bound_target", outputs=outputs) + + def sync_scorer(sample: ScorerInput) -> dict[str, object]: + assert sample.target == "expected-from-verify" + assert sample.metadata["answer"] == "answer-from-row" + return {"reward": True} + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind( + target="expected-from-verify", + target_field="answer", + ) + + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"answer": "answer-from-row"}), + candidate=CandidateOutput(output_text="candidate"), + ) + ) + + assert result.outputs == [MetricOutput(name="reward", value=True)] + + +@pytest.mark.asyncio +async def test_scorer_function_metric_accepts_typed_config_model() -> None: + outputs = 
[MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.typed_config", outputs=outputs, config_schema=ThresholdConfig) + + def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + assert isinstance(sample.config, ThresholdConfig) + assert sample.config.threshold == 0.5 + assert sample.config.label == "typed" + return {"reward": sample.config.threshold} + + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind( + config=ThresholdConfig(threshold=0.5, label="typed") + ) + + result = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate")) + ) + + assert result.outputs == [MetricOutput(name="reward", value=0.5)] + + +@pytest.mark.asyncio +async def test_scorer_function_metric_validates_raw_config_against_typed_schema() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.raw_typed_config", outputs=outputs, config_schema=ThresholdConfig) + + def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]: + assert isinstance(sample.config, ThresholdConfig) + assert sample.config.threshold == 0.75 + assert sample.config.label == "pass" + return {"reward": sample.config.threshold} + + metric = ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=sync_scorer, + ).bind_raw_config(config={"threshold": "0.75"}) + + result = await metric.compute_scores( + MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate")) + ) + + assert result.outputs == [MetricOutput(name="reward", value=0.75)] + + +def test_scorer_function_metric_rejects_invalid_typed_config() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.invalid_config", outputs=outputs, config_schema=ThresholdConfig) + metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=lambda sample: {"reward": True}) + + with 
pytest.raises(ValidationError): + metric.bind_raw_config(config={"threshold": "not-a-number"}) + + +def test_scorer_function_metric_bind_rejects_wrong_config_model_subtype() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.wrong_config_subtype", outputs=outputs, config_schema=ThresholdConfig) + metric = ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold}, + ) + + with pytest.raises(TypeError, match="ThresholdConfig"): + metric.bind(config=OtherThresholdConfig(threshold=0.75)) + + +def test_scorer_function_metric_bind_rejects_raw_mapping_for_typed_config() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + descriptor = MetricDescriptor(type="tests.raw_config_on_typed_bind", outputs=outputs, config_schema=ThresholdConfig) + metric = ScorerFunctionMetric( + descriptor=descriptor, + scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold}, + ) + + with pytest.raises(TypeError, match="bind_raw_config"): + metric.bind(config={"threshold": 0.75}) + + +def test_validate_metric_result_rejects_duplicate_output_names() -> None: + outputs = [MetricOutputSpec.continuous_score("reward")] + result = MetricResult( + outputs=[ + MetricOutput(name="reward", value=1.0), + MetricOutput(name="reward", value=0.0), + ] + ) + + with pytest.raises(ValueError, match="Duplicate metric output"): + validate_metric_result(result, outputs) + + +def test_validate_metric_result_rejects_missing_declared_outputs() -> None: + outputs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.continuous_score("format")] + result = MetricResult(outputs=[MetricOutput(name="reward", value=1.0)]) + + with pytest.raises(ValueError, match="Missing declared metric outputs"): + validate_metric_result(result, outputs) + + +def test_validate_metric_result_rejects_undeclared_outputs() -> None: + outputs = 
[MetricOutputSpec.continuous_score("reward")] + result = MetricResult( + outputs=[ + MetricOutput(name="reward", value=1.0), + MetricOutput(name="format", value=1.0), + ] + ) + + with pytest.raises(ValueError, match="Undeclared metric outputs"): + validate_metric_result(result, outputs) + + +def test_validate_metric_result_rejects_value_that_does_not_match_schema() -> None: + outputs = [MetricOutputSpec.model("judge_details", JudgeDetails)] + result = MetricResult(outputs=[MetricOutput(name="judge_details", value={"label": "pass"})]) + + with pytest.raises(ValidationError): + validate_metric_result(result, outputs) + + +def test_scorer_decorator_returns_subclass_without_mutating_original_metric_class() -> None: + scorer_cls = scorer(ExactMatchMetric) + + assert scorer_cls is not ExactMatchMetric + assert issubclass(scorer_cls, ExactMatchMetric) + assert not hasattr(ExactMatchMetric(reference="{{item.answer}}"), "descriptor") + assert not hasattr(ExactMatchMetric(reference="{{item.answer}}"), "to_metric") + + scorer_metric = cast(_MetricScorerForTest, scorer_cls(reference="{{item.answer}}")) + assert scorer_metric.descriptor == MetricDescriptor( + type="exact-match", + outputs=[MetricOutputSpec.continuous_score("correct")], + ) + assert scorer_metric.to_metric() is scorer_metric + + +@pytest.mark.asyncio +async def test_exact_match_metric_is_undecorated_reusable_metric() -> None: + metric = ExactMatchMetric(reference="{{item.answer}}") + + assert not hasattr(metric, "descriptor") + assert not hasattr(metric, "to_metric") + assert isinstance(metric, BaseModel) + assert metric.type == "exact-match" + assert metric.model_dump()["type"] == "exact-match" + assert metric.output_spec() == [MetricOutputSpec.continuous_score("correct")] + + result = await metric.compute_scores( + MetricInput( + row=DatasetRow(data={"answer": "Paris"}), + candidate=CandidateOutput(output_text="Paris"), + ) + ) + + assert result.outputs == [MetricOutput(name="correct", value=1.0)] + + 
@pytest.mark.asyncio
async def test_exact_match_scorer_exposes_descriptor_and_to_metric() -> None:
    """Check ``ExactMatchScorer``: expected descriptor, and a metric from ``to_metric`` that scores a match 1.0."""
    scorer_wrapper = cast(_MetricScorerForTest, ExactMatchScorer(reference="{{item.answer}}"))
    expected_descriptor = MetricDescriptor(
        type="exact-match",
        outputs=[MetricOutputSpec.continuous_score("correct")],
    )

    assert scorer_wrapper.descriptor == expected_descriptor

    runnable_metric = scorer_wrapper.to_metric()
    paris_input = MetricInput(
        row=DatasetRow(data={"answer": "Paris"}),
        candidate=CandidateOutput(output_text="Paris"),
    )
    scored = await runnable_metric.compute_scores(paris_input)

    assert runnable_metric.type == "exact-match"
    assert scored.outputs == [MetricOutput(name="correct", value=1.0)]


@pytest.mark.asyncio
async def test_scorer_decorator_adapts_exact_match_metric_instances() -> None:
    """Check ``scorer(<ExactMatchMetric instance>)``: expected descriptor and a working ``to_metric`` result."""
    adapted_scorer = scorer(ExactMatchMetric(reference="{{item.answer}}"))
    expected_descriptor = MetricDescriptor(
        type="exact-match",
        outputs=[MetricOutputSpec.continuous_score("correct")],
    )

    assert adapted_scorer.descriptor == expected_descriptor

    runnable_metric = adapted_scorer.to_metric()
    paris_input = MetricInput(
        row=DatasetRow(data={"answer": "Paris"}),
        candidate=CandidateOutput(output_text="Paris"),
    )
    scored = await runnable_metric.compute_scores(paris_input)

    assert runnable_metric.type == "exact-match"
    assert scored.outputs == [MetricOutput(name="correct", value=1.0)]


@pytest.mark.asyncio
async def test_exact_match_metric_supports_top_level_and_sample_template_aliases() -> None:
    """Check the ``{{answer}}`` and ``{{sample.prediction}}`` templates.

    The row answer is "New York" and the candidate metadata prediction is
    "new york"; the test expects the "correct" output to be 1.0.
    """
    aliased_metric = ExactMatchMetric(reference="{{answer}}", candidate="{{sample.prediction}}")
    dataset_row = DatasetRow(data={"answer": "New York"})
    candidate_output = CandidateOutput(metadata={"prediction": "new york"})

    scored = await aliased_metric.compute_scores(MetricInput(row=dataset_row, candidate=candidate_output))

    assert scored.outputs == [MetricOutput(name="correct", value=1.0)]


@pytest.mark.asyncio
async def test_class_based_scorer_to_metric_returns_configured_instance() -> None:
    """Check that ``to_metric`` on a ``@scorer`` class instance returns that same instance.

    The empty result produced by the metric must come back from
    ``compute_scores`` unchanged.
    """

    # The nested class is kept as written (no docstring added) so the object given to ``scorer`` is unchanged.
    @scorer
    class MissingOutputMetric:
        type = "tests.missing_output"

        def output_spec(self) -> list[MetricOutputSpec]:
            return [MetricOutputSpec.continuous_score("correct")]

        async def compute_scores(self, input: MetricInput) -> MetricResult:
            # Returns no outputs even though ``output_spec`` declares "correct".
            return MetricResult(outputs=[])

    configured_instance = cast(_MetricScorerForTest, MissingOutputMetric())
    adapted_metric = configured_instance.to_metric()
    empty_row_input = MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
    scored = await adapted_metric.compute_scores(empty_row_input)

    assert adapted_metric is configured_instance
    assert scored == MetricResult(outputs=[])


def test_scorer_decorator_can_adapt_metric_instances() -> None:
    """Check ``scorer(<instance>)`` for a local metric class: expected descriptor, and ``to_metric`` returns itself."""

    # The nested class is kept as written (no docstring added) so the object given to ``scorer`` is unchanged.
    class UndecoratedMetric:
        type = "tests.undecorated"

        def output_spec(self) -> list[MetricOutputSpec]:
            return [MetricOutputSpec.continuous_score("correct")]

        async def compute_scores(self, input: MetricInput) -> MetricResult:
            return MetricResult(outputs=[MetricOutput(name="correct", value=1.0)])

    adapted_scorer = scorer(UndecoratedMetric())
    expected_descriptor = MetricDescriptor(
        type="tests.undecorated",
        outputs=[MetricOutputSpec.continuous_score("correct")],
    )

    assert adapted_scorer.descriptor == expected_descriptor
    assert adapted_scorer.to_metric() is adapted_scorer


@pytest.mark.asyncio
async def test_scorer_function_metric_executes_sync_scorers() -> None:
    """Run a synchronous scorer through a bound ``ScorerFunctionMetric``.

    The scorer itself asserts the response, target, metadata, config and
    sandbox it receives; the test then checks the metric type, the score names
    and the returned outputs.
    """
    declared_specs = [MetricOutputSpec.boolean("reward"), MetricOutputSpec.label("label")]
    sync_descriptor = MetricDescriptor(type="tests.sync_metric", outputs=declared_specs)
    # A bare object stands in for the sandbox; the scorer only checks it by identity.
    placeholder_sandbox = cast(Sandbox, object())

    def sync_scorer(sample: ScorerInput) -> dict[str, object]:
        assert sample.response == "yes"
        assert sample.target == "yes"
        assert sample.metadata["category"] == "boolean"
        assert sample.config == {"mode": "strict"}
        assert sample.sandbox is placeholder_sandbox
        return {"reward": True, "label": "matched"}

    unbound_metric = ScorerFunctionMetric(descriptor=sync_descriptor, scorer_fn=sync_scorer)
    bound_metric = unbound_metric.bind(
        config={"mode": "strict"},
        sandbox=placeholder_sandbox,
        target_field="answer",
    )

    boolean_input = MetricInput(
        row=DatasetRow(data={"answer": "yes", "category": "boolean"}),
        candidate=CandidateOutput(output_text="yes"),
    )
    scored = await bound_metric.compute_scores(boolean_input)

    assert bound_metric.type == "tests.sync_metric"
    assert score_names_from_output_spec(bound_metric.output_spec()) == ["reward"]
    assert scored.outputs == [MetricOutput(name="reward", value=True), MetricOutput(name="label", value="matched")]


@pytest.mark.asyncio
async def test_scorer_function_metric_executes_async_scorers() -> None:
    """Run a coroutine scorer through an unbound ``ScorerFunctionMetric`` and check type, score names and outputs."""
    declared_specs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.label("seen")]
    async_descriptor = MetricDescriptor(type="tests.async_metric", outputs=declared_specs)

    async def async_scorer(sample: ScorerInput) -> dict[str, object]:
        return {"reward": 0.5, "seen": sample.response}

    async_metric = ScorerFunctionMetric(descriptor=async_descriptor, scorer_fn=async_scorer)
    maybe_input = MetricInput(
        row=DatasetRow(data={"answer": "yes"}),
        candidate=CandidateOutput(output_text="maybe"),
    )

    scored = await async_metric.compute_scores(maybe_input)

    assert async_metric.type == "tests.async_metric"
    assert score_names_from_output_spec(async_metric.output_spec()) == ["reward"]
    assert scored.outputs == [MetricOutput(name="reward", value=0.5), MetricOutput(name="seen", value="maybe")]


def test_typed_scorer_decorator_exposes_descriptor_and_to_metric() -> None:
    outputs = [MetricOutputSpec.boolean("truthful"), MetricOutputSpec.label("judge_grade")]

    @scorer(metric_type="truthfulqa", outputs=outputs)
    def truthfulqa_scorer(sample: ScorerInput) -> dict[str, object]:
        return {"truthful": bool(sample.response), "judge_grade": "C"}

    metric = truthfulqa_scorer.to_metric()

    assert truthfulqa_scorer.descriptor == MetricDescriptor(type="truthfulqa", outputs=outputs)
    assert metric.type == "truthfulqa"
    assert score_names_from_output_spec(metric.output_spec()) == ["truthful"]