diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce0d635b0..da4c83056 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,15 @@
 
 ## 0.13.0 (unreleased)
 
+### Shared Metric Contract
+
+- Added public `MetricInput -> MetricResult` scorer/metric runtime types and `ScorerFunctionMetric`.
+- Extended BYOB `@scorer` with typed scorer metadata and `to_metric()` while preserving current dict scorer behavior.
+- Added optional `config_schema` support for typed scorer configs while keeping raw dict configs as the default.
+- Split typed scorer config binding into strict `bind(config=ConfigModel(...))` and coercive `bind_raw_config(config={...})` paths.
+- Added `@scorer` support for class-based `Metric` objects.
+- Added a reusable undecorated `ExactMatchMetric` and an `ExactMatchScorer` BYOB wrapper.
+
 ### Adapter Proxy (Breaking — replaces LiteLLM)
 
 - **LiteLLM removed**: The `litellm` dependency, `proxy` and `proxy-full` extras, and `litellm_settings` config field are all removed. The adapter proxy is now built-in with zero external proxy dependencies.
diff --git a/examples/benchmarks/exact_match_metric_poc.py b/examples/benchmarks/exact_match_metric_poc.py
new file mode 100644
index 000000000..4c31b9dc3
--- /dev/null
+++ b/examples/benchmarks/exact_match_metric_poc.py
@@ -0,0 +1,59 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Example BYOB benchmark using a class-based Metric as its scorer."""
+
+from nemo_evaluator.environments.custom import benchmark, scorer
+from nemo_evaluator.metrics import ExactMatchMetric
+from nemo_evaluator.scorers import ExactMatchScorer
+from nemo_evaluator.scoring import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
+
+
+def _dataset() -> list[dict[str, str]]:
+    """Return the two in-memory question/answer rows shared by the example benchmarks."""
+    pairs = (("What is the capital of France?", "Paris"), ("What is 2 + 2?", "4"))
+    rows = [{"question": question, "answer": answer} for question, answer in pairs]
+    return rows
+
+
+# Mode 1: use a preannotated scorer wrapper exported by the OSS scorer layer.
+benchmark(
+    name="exact-match-preannotated-scorer-poc",
+    dataset=_dataset,
+    prompt="{question}",
+    target_field="answer",
+)(ExactMatchScorer(reference="{{item.answer}}"))
+
+
+class InlineExactMatchMetric:
+    """Minimal hand-written metric: scores 1.0 when the candidate equals the reference, else 0.0."""
+    type = "inline-exact-match"
+
+    def __init__(self, *, reference: str, candidate: str | None = None) -> None:
+        self.reference, self.candidate = reference, candidate
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score("correct")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        # Only the literal "{{item.answer}}" placeholder is resolved; any other reference is used verbatim.
+        expected = self.reference if self.reference != "{{item.answer}}" else input.row.data.get("answer")
+        actual = self.candidate if self.candidate is not None else input.candidate.output_text
+        return MetricResult(outputs=[MetricOutput(name="correct", value=float(actual == expected))])
+
+
+# Mode 2: annotate a local class at the benchmark call site, then configure it.
+benchmark(
+    name="exact-match-inline-class-scorer-poc",
+    dataset=_dataset,
+    prompt="{question}",
+    target_field="answer",
+)(scorer(InlineExactMatchMetric)(reference="{{item.answer}}"))
+
+
+# Mode 3: adapt an already-configured reusable metric instance at the call site.
+benchmark(
+    name="exact-match-metric-instance-poc",
+    dataset=_dataset,
+    prompt="{question}",
+    target_field="answer",
+)(scorer(ExactMatchMetric(reference="{{item.answer}}")))
diff --git a/src/nemo_evaluator/__init__.py b/src/nemo_evaluator/__init__.py
index ffba7adcb..5835cd8ac 100644
--- a/src/nemo_evaluator/__init__.py
+++ b/src/nemo_evaluator/__init__.py
@@ -16,22 +16,32 @@
 
 __version__ = "0.12.0"
 
+from nemo_evaluator.engine.eval_loop import run_evaluation
+from nemo_evaluator.engine.model_client import ModelClient
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
 from nemo_evaluator.environments.custom import benchmark, scorer
 from nemo_evaluator.environments.registry import get_environment, list_environments, load_benchmark_file, register
-from nemo_evaluator.engine.eval_loop import run_evaluation
-from nemo_evaluator.engine.model_client import ModelClient
-from nemo_evaluator.solvers import (
-    ChatSolver,
-    CompletionSolver,
-    NatSolver,
-    OpenClawSolver,
-    Solver,
-    SolveResult,
-    VLMSolver,
-)
+from nemo_evaluator.metrics import ExactMatchMetric
+from nemo_evaluator.scorers import ExactMatchScorer
 from nemo_evaluator.scoring import (
+    BooleanValue,
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    DiscreteScore,
+    Label,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerCallable,
+    ScorerConfig,
+    ScorerFunctionMetric,
     ScorerInput,
+    ScorerReturn,
     answer_line,
     code_sandbox,
     code_sandbox_async,
@@ -40,6 +50,16 @@
     multichoice_regex,
     needs_judge,
     numeric_match,
+    score_names_from_output_spec,
+)
+from nemo_evaluator.solvers import (
+    ChatSolver,
+    CompletionSolver,
+    NatSolver,
+    OpenClawSolver,
+    Solver,
+    SolveResult,
+    VLMSolver,
 )
 
 __all__ = [
@@ -65,6 +85,26 @@
     "benchmark",
     "scorer",
     "ScorerInput",
+    "ExactMatchMetric",
+    "ExactMatchScorer",
+    "Metric",
+    "BooleanValue",
+    "DatasetRow",
+    "CandidateOutput",
+    "ContinuousScore",
+    "DiscreteScore",
+    "Label",
+    "MetricInput",
+    "MetricOutput",
+    "MetricOutputSpec",
+    "MetricDescriptor",
+    "MetricResult",
+    "MetricScorerFunction",
+    "ScorerCallable",
+    "ScorerConfig",
+    "ScorerFunctionMetric",
+    "ScorerReturn",
+    "score_names_from_output_spec",
     # Scoring primitives
     "exact_match",
     "multichoice_regex",
diff --git a/src/nemo_evaluator/environments/custom.py b/src/nemo_evaluator/environments/custom.py
index 5d0e8cdd2..0f8d8dac0 100644
--- a/src/nemo_evaluator/environments/custom.py
+++ b/src/nemo_evaluator/environments/custom.py
@@ -41,16 +41,35 @@ def score(sample: ScorerInput) -> dict:
 import json
 import logging
 import random
+from collections.abc import Mapping
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Protocol, TypeVar, cast, overload, runtime_checkable
+
+from pydantic import BaseModel
 
 if TYPE_CHECKING:
     from nemo_evaluator.sandbox.base import Sandbox
 
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
-from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec
 from nemo_evaluator.environments.registry import register
+from nemo_evaluator.sandbox.base import ImageBuildRequest, SandboxSpec
+from nemo_evaluator.scoring.metric import (
+    CandidateOutput,
+    DatasetRow,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerCallable,
+    ScorerConfig,
+    ScorerFunctionMetric,
+    ScorerReturn,
+    score_names_from_output_spec,
+    validate_metric_result,
+)
 from nemo_evaluator.scoring.types import ScorerInput
 
 logger = logging.getLogger(__name__)
@@ -59,10 +78,23 @@ def score(sample: ScorerInput) -> dict:
 # ── Data types ────────────────────────────────────────────────────────────
 
 
+ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel)
+ConfigModelT = TypeVar("ConfigModelT", bound=BaseModel)
+MetricClassT = TypeVar("MetricClassT", bound=type[Metric])
+
+
+@runtime_checkable  # lets verify() detect metric-style scorers with isinstance()
+class _MetricScorer(Protocol):  # structural type: a scorer exposing the Metric contract
+    @property
+    def descriptor(self) -> MetricDescriptor: ...  # declared metric type and outputs
+
+    def to_metric(self) -> Metric: ...  # the Metric that verify() runs via compute_scores()
+
+
 @dataclass
 class BenchmarkDefinition:
     name: str
-    dataset: str | Callable[[], list[dict]]
+    dataset: str | Callable[..., list[dict[str, Any]]]
     prompt: str
     target_field: str = "target"
     endpoint_type: str = "chat"
@@ -70,20 +102,50 @@ class BenchmarkDefinition:
     field_mapping: dict[str, str] | None = None
     extra: dict[str, Any] = field(default_factory=dict)
     requirements: list[str] | None = None
-    scorer_fn: Callable[[ScorerInput], dict] | None = None
-    prepare_row: Callable[[dict, int, random.Random], dict] | None = None
-    seed_fn: Callable[[dict, int], SeedResult] | None = None
-    image_builder_fn: Callable[[list[dict]], ImageBuildRequest] | None = None
+    scorer_fn: Callable[..., ScorerReturn] | _MetricScorer | None = None
+    prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None
+    seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None
+    image_builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest] | None = None
 
 
 _BYOB_REGISTRY: dict[str, BenchmarkDefinition] = {}
 
 
+def _attach_metric_scorer_instance(instance: Metric) -> Metric:  # adds the _MetricScorer attributes in place
+    object.__setattr__(instance, "descriptor", MetricDescriptor(type=instance.type, outputs=instance.output_spec()))  # built once, at attach time; object.__setattr__ skips any class-level __setattr__
+    object.__setattr__(instance, "to_metric", lambda: instance)  # the instance acts as its own metric
+    return instance  # same object that was passed in
+
+
+def _decorate_metric_scorer_class(cls: MetricClassT) -> MetricClassT:
+    """Return a same-named subclass of ``cls`` whose instances work as BYOB scorers.
+
+    The subclass runs the original ``__init__`` and then calls
+    ``_attach_metric_scorer_instance`` on the new instance; ``__doc__``,
+    ``__module__`` and ``__qualname__`` are copied from ``cls``.
+    """
+    inherited_init = cast(Callable[..., None], cls.__init__)
+
+    def __init__(self: Metric, *args: object, **kwargs: object) -> None:
+        inherited_init(self, *args, **kwargs)
+        _attach_metric_scorer_instance(self)
+
+    namespace: dict[str, object] = {"__doc__": cls.__doc__, "__init__": __init__}
+    namespace.update(__module__=cls.__module__, __qualname__=cls.__qualname__)
+
+    # Three-argument type() creates the subclass dynamically under the original name.
+    subclass = type(cls.__name__, (cls,), namespace)
+    return cast(MetricClassT, subclass)
+
+
 # ── Dataset loading ───────────────────────────────────────────────────────
 
 
-def _load_dataset_from_spec(spec: str | Callable, num_examples: int | None = None) -> list[dict[str, Any]]:
-    if callable(spec):
+def _load_dataset_from_spec(
+    spec: str | Callable[..., list[dict[str, Any]]],
+    num_examples: int | None = None,
+) -> list[dict[str, Any]]:
+    if not isinstance(spec, str):
         import inspect
 
         sig = inspect.signature(spec)
@@ -148,7 +210,7 @@ def _load_hf(spec: str, num_examples: int | None = None) -> list[dict[str, Any]]
     return [dict(row) for row in ds]
 
 
-def _format_prompt(template: str, row: dict, field_mapping: dict | None = None) -> str:
+def _format_prompt(template: str, row: dict[str, Any], field_mapping: dict[str, str] | None = None) -> str:
     data = dict(row)
     if field_mapping:
         for src, dst in field_mapping.items():
@@ -235,27 +297,108 @@ async def verify(self, response: str, expected: str, sandbox: Sandbox | None = N
 
         import asyncio
 
+        if isinstance(self._defn.scorer_fn, _MetricScorer):
+            metric = self._defn.scorer_fn.to_metric()
+            if isinstance(metric, ScorerFunctionMetric):
+                metric = metric.bind_raw_config(
+                    config=self._defn.extra,
+                    sandbox=sandbox,
+                    target=expected,
+                )
+            metric_input = _metric_input_from_verify(
+                response=response,
+                metadata=meta,
+            )
+            result = validate_metric_result(await metric.compute_scores(metric_input), metric.output_spec())
+            return _metric_result_to_verify_result(
+                metric=metric,
+                result=result,
+                benchmark_name=self._defn.name,
+                response=response,
+            )
+
         sample = ScorerInput(
             response=response, target=expected, metadata=meta, config=self._defn.extra, sandbox=sandbox
         )
-        scores = self._defn.scorer_fn(sample)
-        if asyncio.iscoroutine(scores):
-            scores = await scores
-
-        reward = float(scores.get("correct", scores.get("reward", next(iter(scores.values()), 0))))
+        scores_result = self._defn.scorer_fn(sample)
+        if asyncio.iscoroutine(scores_result):
+            scores_result = await scores_result
+        scores = cast(Mapping[str, object], scores_result)
+
+        reward_value = scores.get("correct", scores.get("reward", next(iter(scores.values()), 0)))
+        reward = float(reward_value) if isinstance(reward_value, bool | int | float) else 0.0
+        extracted = scores.get("extracted")
         return VerifyResult(
             reward=reward,
-            extracted_answer=scores.get("extracted", response.strip()[:200]),
-            scoring_details={"method": f"byob_{self._defn.name}", **scores},
+            extracted_answer=extracted if isinstance(extracted, str) else response.strip()[:200],
+            scoring_details={"method": f"byob_{self._defn.name}", **dict(scores)},
         )
 
 
+def _metric_input_from_verify(
+    *,
+    response: str,
+    metadata: dict[str, Any],
+) -> MetricInput:
+    """Wrap a ``verify()`` call's response text and row metadata as a ``MetricInput``."""
+    # A shallow copy of the metadata dict becomes the row data.
+    row = DatasetRow(data={**metadata})
+    candidate = CandidateOutput(output_text=response)
+    return MetricInput(row=row, candidate=candidate)
+
+
+def _metric_result_to_verify_result(
+    *,
+    metric: Metric,
+    result: MetricResult,
+    benchmark_name: str,
+    response: str,
+) -> VerifyResult:
+    """Translate a ``MetricResult`` into the ``VerifyResult`` returned by BYOB.
+
+    The reward is the score picked by ``_select_reward_score_name`` (0.0 when
+    none qualifies); ``extracted_answer`` is the ``"extracted"`` output if it is
+    a string, otherwise the first 200 characters of the stripped response.
+    """
+    values_by_name = {entry.name: entry.value for entry in result.outputs}
+    declared = score_names_from_output_spec(metric.output_spec())
+    numeric = {name: _score_value(values_by_name[name]) for name in declared if name in values_by_name}
+    chosen = _select_reward_score_name(scores=numeric, declared=declared)
+
+    details: dict[str, Any] = {"method": f"byob_{benchmark_name}", "metric_type": metric.type, "outputs": values_by_name}
+    # Flatten outputs beside the fixed keys; a fixed key is never overwritten.
+    details.update((name, value) for name, value in values_by_name.items() if name not in details)
+
+    answer = values_by_name.get("extracted")
+    if not isinstance(answer, str):
+        answer = response.strip()[:200]
+    return VerifyResult(reward=0.0 if chosen is None else numeric[chosen], extracted_answer=answer, scoring_details=details)
+
+
+def _select_reward_score_name(*, scores: dict[str, float], declared: list[str]) -> str | None:
+    """Choose which entry of ``scores`` serves as the reward.
+
+    Priority: ``"reward"``, then ``"correct"``, then the first declared name
+    present in ``scores``, then the first key of ``scores``; ``None`` if empty.
+    """
+    ranked = ("reward", "correct", *declared, *scores)
+    return next((name for name in ranked if name in scores), None)
+
+
+def _score_value(value: object) -> float:
+    """Coerce a bool/int/float metric output to ``float``; raise ``TypeError`` for anything else."""
+    # bool is a subclass of int, so float() maps True -> 1.0 and False -> 0.0.
+    if not isinstance(value, (bool, int, float)):
+        raise TypeError(f"Metric score output must be bool, int, or float, got {type(value).__name__}")
+    return float(value)
+
+
 # ── Decorators ────────────────────────────────────────────────────────────
 
 
 def benchmark(
     name: str,
-    dataset: str | Callable,
+    dataset: str | Callable[..., list[dict[str, Any]]],
     prompt: str = "",
     target_field: str = "target",
     endpoint_type: str = "chat",
@@ -263,9 +406,9 @@ def benchmark(
     field_mapping: dict[str, str] | None = None,
     extra: dict[str, Any] | None = None,
     requirements: list[str] | None = None,
-    prepare_row: Callable | None = None,
-    seed_fn: Callable | None = None,
-    **kwargs,
+    prepare_row: Callable[[dict[str, Any], int, random.Random], dict[str, Any]] | None = None,
+    seed_fn: Callable[[dict[str, Any], int], SeedResult] | None = None,
+    **kwargs: Any,
 ):
     """Register a benchmark. Decorate a scorer function."""
     defn = BenchmarkDefinition(
@@ -300,13 +443,156 @@ def __init__(self, num_examples: int | None = None):
     return decorator
 
 
-def scorer(fn: Callable[[ScorerInput], dict]) -> Callable[[ScorerInput], dict]:
-    """Marks a function as a scorer."""
-    fn._is_scorer = True  # type: ignore[attr-defined]
-    return fn
+@overload
+def scorer(
+    fn: None = None,
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: type[ConfigModelT],
+) -> Callable[[ScorerCallable[ConfigModelT]], MetricScorerFunction[ConfigModelT]]: ...
+
+
+@overload
+def scorer(
+    fn: ScorerCallable[ConfigModelT],
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: type[ConfigModelT],
+) -> MetricScorerFunction[ConfigModelT]: ...
 
 
-def image_builder(builder_fn: Callable[[list[dict]], ImageBuildRequest]):
+@overload
+def scorer(
+    fn: None = None,
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: None = None,
+) -> Callable[[ScorerCallable[ConfigT]], MetricScorerFunction[ConfigT]]: ...
+
+
+@overload
+def scorer(
+    fn: ScorerCallable[ConfigT],
+    *,
+    metric_type: str,
+    outputs: list[MetricOutputSpec],
+    config_schema: None = None,
+) -> MetricScorerFunction[ConfigT]: ...
+
+
+@overload
+def scorer(fn: ScorerCallable[ConfigT]) -> ScorerCallable[ConfigT]: ...
+
+
+@overload
+def scorer(fn: MetricClassT) -> MetricClassT: ...
+
+
+@overload
+def scorer(fn: Metric) -> _MetricScorer: ...
+
+
+@overload
+def scorer(
+    fn: None = None,
+    *,
+    metric_type: None = None,
+    outputs: None = None,
+    config_schema: None = None,
+) -> Callable[[ScorerCallable[ConfigT]], ScorerCallable[ConfigT]]: ...
+
+
+def scorer(
+    fn: Callable[..., ScorerReturn] | Metric | type[Metric] | None = None,
+    *,
+    metric_type: str | None = None,
+    outputs: list[MetricOutputSpec] | None = None,
+    config_schema: type[BaseModel] | None = None,
+) -> object:
+    """Mark a function, ``Metric`` class, or ``Metric`` instance as a BYOB scorer.
+
+    Plain ``@scorer`` keeps the ``ScorerInput -> dict`` behavior. With
+    ``metric_type`` and ``outputs`` the function also gains ``descriptor`` and
+    ``to_metric()``. A class yields a scorer subclass; an instance is annotated in
+    place. Raises ``ValueError`` for contract options on a class/instance or an
+    incomplete ``metric_type``/``outputs`` pair; ``TypeError`` if non-callable.
+    """
+    if isinstance(fn, type):  # @scorer applied to a class
+        if any(option is not None for option in (metric_type, outputs, config_schema)):
+            raise ValueError("class-based Metric scorers do not accept @scorer metric contract options")
+        return _decorate_metric_scorer_class(cast(type[Metric], fn))
+
+    if fn is not None and isinstance(fn, Metric):  # scorer(...) applied to a constructed Metric instance
+        if any(option is not None for option in (metric_type, outputs, config_schema)):
+            raise ValueError("class-based Metric scorer instances do not accept @scorer metric contract options")
+        return _attach_metric_scorer_instance(fn)
+
+    if fn is not None and not callable(fn):  # neither a class, a Metric, nor a function
+        raise TypeError("class-based Metric scorer instances must implement the Metric protocol")
+
+    if outputs is None and (metric_type is not None or config_schema is not None):
+        metric_options = [  # names of the options that were passed, for the error message
+            option
+            for option, value in (
+                ("metric_type=...", metric_type),
+                ("config_schema=...", config_schema),
+            )
+            if value is not None
+        ]
+        raise ValueError(
+            f"@scorer({', '.join(metric_options)}) opts into the Metric contract, but no outputs were declared. "
+            "Pass outputs=[MetricOutputSpec(...)] so the metric descriptor can declare and validate outputs."
+        )
+    if outputs is not None and metric_type is None:
+        raise ValueError(
+            "@scorer(outputs=...) opts into the Metric contract, but no metric_type was declared. "
+            "Pass metric_type='...' so the metric has a stable identity across refactors."
+        )
+
+    def decorate(fn: Callable[..., ScorerReturn]) -> object:
+        return _decorate_scorer(
+            cast(ScorerCallable[ScorerConfig], fn),
+            metric_type=metric_type,
+            outputs=outputs,
+            config_schema=config_schema,
+        )
+
+    return decorate(cast(Callable[..., ScorerReturn], fn)) if fn is not None else decorate  # bare @scorer vs. @scorer(...) factory
+
+
+def _decorate_scorer(
+    fn: ScorerCallable[ConfigT],
+    *,
+    metric_type: str | None = None,
+    outputs: list[MetricOutputSpec] | None = None,
+    config_schema: type[BaseModel] | None = None,
+):
+    """Flag ``fn`` as a scorer and, when ``outputs`` is given, attach the Metric contract.
+
+    Always sets ``fn._is_scorer``. With ``outputs`` it also sets ``fn.descriptor``
+    and ``fn.to_metric``. Returns ``fn`` itself in every case.
+
+    Raises:
+        ValueError: ``outputs`` is declared but ``metric_type`` is ``None``.
+    """
+    fn._is_scorer = True  # type: ignore[attr-defined]
+    if outputs is not None:
+        if metric_type is None:
+            raise ValueError("metric_type is required when outputs are declared")
+        contract = MetricDescriptor(type=metric_type, outputs=outputs, config_schema=config_schema)
+
+        # Each call builds a new ScorerFunctionMetric around the same descriptor.
+        def to_metric() -> ScorerFunctionMetric[ConfigT]:
+            return ScorerFunctionMetric(descriptor=contract, scorer_fn=fn)
+
+        fn.descriptor = contract  # type: ignore[attr-defined]
+        fn.to_metric = to_metric  # type: ignore[attr-defined]
+    return fn
+
+def image_builder(builder_fn: Callable[[list[dict[str, Any]]], ImageBuildRequest]):
     """Declare images that need building, stacked with ``@benchmark``.
 
     ``builder_fn`` receives the dataset rows and returns an
diff --git a/src/nemo_evaluator/metrics/__init__.py b/src/nemo_evaluator/metrics/__init__.py
index a40ca42fd..d13b6fb50 100644
--- a/src/nemo_evaluator/metrics/__init__.py
+++ b/src/nemo_evaluator/metrics/__init__.py
@@ -27,8 +27,10 @@
     permutation_test,
     sign_test,
 )
+from nemo_evaluator.metrics.exact_match import ExactMatchMetric
 
 __all__ = [
+    "ExactMatchMetric",
     "McNemarResult",
     "POWER_80_FACTOR",
     "PermutationResult",
diff --git a/src/nemo_evaluator/metrics/exact_match.py b/src/nemo_evaluator/metrics/exact_match.py
new file mode 100644
index 000000000..8cd4beb1d
--- /dev/null
+++ b/src/nemo_evaluator/metrics/exact_match.py
@@ -0,0 +1,85 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Class-based exact-match metric implementation."""
+
+from __future__ import annotations
+
+import re
+import string
+from typing import Literal
+
+from jinja2 import Environment, StrictUndefined
+from pydantic import BaseModel, ConfigDict, Field
+
+from nemo_evaluator.scoring import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
+
+_JINJA_ENV = Environment(undefined=StrictUndefined, autoescape=False)
+
+__all__ = ["ExactMatchMetric"]
+
+
+class ExactMatchMetric(BaseModel):
+    """Exact-match metric using the shared MetricInput -> MetricResult contract."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    reference: str = Field(description="Jinja template for the expected reference answer.")
+    candidate: str | None = Field(
+        default=None,
+        description="Optional Jinja template for the candidate. Defaults to sample.output_text.",
+    )
+    type: Literal["exact-match"] = Field(default="exact-match", description="Stable metric type identifier.")
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return [MetricOutputSpec.continuous_score("correct")]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        """Score 1.0 when the normalized candidate equals the normalized reference, else 0.0."""
+        item, sample = _template_payload_from_metric_input(input)
+        context = _build_template_context(item, sample)
+        expected = _render_template(self.reference, context)
+        actual = sample.get("output_text") if self.candidate is None else _render_template(self.candidate, context)
+        if not isinstance(expected, str):
+            raise TypeError("ExactMatchMetric reference must render to a string.")
+        if not isinstance(actual, str):
+            raise TypeError("ExactMatchMetric candidate must render to a string.")
+        return MetricResult(outputs=[MetricOutput(name="correct", value=float(_normalize(actual) == _normalize(expected)))])
+
+
+def _template_payload_from_metric_input(input: MetricInput) -> tuple[dict[str, object], dict[str, object]]:
+    """Return ``(item, sample)`` dict copies of the row data and candidate for templating."""
+    item, candidate = dict(input.row.data), input.candidate
+    sample = dict(candidate.metadata)
+    # Candidate fields that are set override same-named metadata keys; None fields are skipped.
+    for key in ("output_text", "response", "trajectory"):
+        field_value = getattr(candidate, key)
+        if field_value is not None:
+            sample[key] = field_value
+    return item, sample
+
+
+def _build_template_context(item: dict[str, object], sample: dict[str, object]) -> dict[str, object]:
+    return item | sample | {"item": item, "sample": sample}  # later operands win: sample over item, then the two namespaces
+
+
+def _render_template(template: str, context: dict[str, object]) -> str:  # compiles the template on every call
+    return _JINJA_ENV.from_string(template).render(context)  # env is built with StrictUndefined and autoescape off
+
+
+def _normalize(value: str) -> str:
+    """Lowercase, drop the articles a/an/the and ASCII punctuation, then collapse whitespace."""
+    value = re.sub(r"\b(a|an|the)\b", " ", value.lower())
+    value = value.translate(str.maketrans("", "", string.punctuation))  # one pass; no per-character set rebuild
+    return " ".join(value.split())
diff --git a/src/nemo_evaluator/scorers.py b/src/nemo_evaluator/scorers.py
new file mode 100644
index 000000000..3d60739c7
--- /dev/null
+++ b/src/nemo_evaluator/scorers.py
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""BYOB scorer adapters for reusable Metric implementations."""
+
+from __future__ import annotations
+
+from nemo_evaluator.environments.custom import scorer
+from nemo_evaluator.metrics import ExactMatchMetric
+
+__all__ = ["ExactMatchScorer"]
+
+
+ExactMatchScorer = scorer(ExactMatchMetric)
diff --git a/src/nemo_evaluator/scoring/__init__.py b/src/nemo_evaluator/scoring/__init__.py
index 3ca193cf7..7aea45602 100644
--- a/src/nemo_evaluator/scoring/__init__.py
+++ b/src/nemo_evaluator/scoring/__init__.py
@@ -22,6 +22,8 @@
 
 from typing import Callable
 
+from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async
+from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema
 from nemo_evaluator.scoring.judge import (
     JudgeScoringConfig,
     build_judge_prompt,
@@ -29,9 +31,27 @@
     needs_judge,
     parse_judge_response,
 )
-from nemo_evaluator.scoring.json_schema import extract_json, validate_json_schema
+from nemo_evaluator.scoring.metric import (
+    BooleanValue,
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    DiscreteScore,
+    Label,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerCallable,
+    ScorerConfig,
+    ScorerFunctionMetric,
+    ScorerReturn,
+    score_names_from_output_spec,
+)
 from nemo_evaluator.scoring.pattern import answer_line, multichoice_regex, numeric_match
-from nemo_evaluator.scoring.code_execution import code_sandbox, code_sandbox_async
 from nemo_evaluator.scoring.text import exact_match, extract_mcq_letter, fuzzy_match
 from nemo_evaluator.scoring.types import ScorerInput
 
@@ -65,6 +85,24 @@ def list_scorers() -> list[str]:
 
 __all__ = [
     "ScorerInput",
+    "Metric",
+    "BooleanValue",
+    "DatasetRow",
+    "ContinuousScore",
+    "CandidateOutput",
+    "DiscreteScore",
+    "Label",
+    "MetricInput",
+    "MetricOutput",
+    "MetricOutputSpec",
+    "MetricDescriptor",
+    "MetricResult",
+    "MetricScorerFunction",
+    "ScorerCallable",
+    "ScorerConfig",
+    "ScorerFunctionMetric",
+    "ScorerReturn",
+    "score_names_from_output_spec",
     "get_scorer",
     "list_scorers",
     # Text
diff --git a/src/nemo_evaluator/scoring/metric.py b/src/nemo_evaluator/scoring/metric.py
new file mode 100644
index 000000000..c4bf72d20
--- /dev/null
+++ b/src/nemo_evaluator/scoring/metric.py
@@ -0,0 +1,368 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared MetricInput -> MetricResult runtime contract."""
+
+from __future__ import annotations
+
+import inspect
+from collections.abc import Awaitable, Callable, Mapping
+from typing import Generic, Protocol, TypeVar, cast, runtime_checkable
+
+from pydantic import BaseModel, ConfigDict, Field, RootModel, SkipValidation, field_validator, model_validator
+
+from nemo_evaluator.sandbox.base import Sandbox
+from nemo_evaluator.scoring.types import ScorerInput
+
+ConfigT = TypeVar("ConfigT", bound=Mapping[str, object] | BaseModel)
+SchemaT = TypeVar("SchemaT", bound=BaseModel)
+
+
+class DatasetRow(BaseModel):
+    """Original benchmark dataset row (``data``) plus an optional stable row identity (``row_index``)."""
+
+    model_config = ConfigDict(extra="forbid")  # reject unknown fields instead of ignoring them
+
+    row_index: int | None = None  # optional row identity; None when the caller supplies none
+    data: dict[str, object]  # required: the row's fields keyed by name
+
+
+class CandidateOutput(BaseModel):
+    """Candidate output being scored for one dataset row; every field is optional."""
+
+    model_config = ConfigDict(extra="forbid")  # reject unknown fields
+
+    output_text: str | None = None  # text form of the candidate output
+    response: object | None = None  # untyped payload — presumably the raw model response; TODO confirm
+    trajectory: object | None = None  # untyped payload; its shape is not constrained here
+    metadata: dict[str, object] = Field(default_factory=dict)  # fresh empty dict per instance by default
+
+
+class MetricInput(BaseModel):
+    """Complete per-row scoring input passed to a metric: the dataset row and the candidate to score."""
+
+    model_config = ConfigDict(extra="forbid")  # reject unknown fields
+
+    row: DatasetRow  # the dataset row the candidate is scored against
+    candidate: CandidateOutput  # the output being scored
+
+
+class ContinuousScore(RootModel[float]):
+    """Continuous numeric metric value, modeled as a pydantic root model over ``float``."""
+
+
+class DiscreteScore(RootModel[int]):
+    """Discrete numeric metric value, modeled as a pydantic root model over ``int``."""
+
+
+class Label(RootModel[str]):
+    """String label metric value, modeled as a pydantic root model over ``str``."""
+
+
+class BooleanValue(RootModel[bool]):
+    """Boolean metric value, modeled as a pydantic root model over ``bool``."""
+
+
+class MetricOutputSpec(BaseModel, Generic[SchemaT]):
+    """Declaration of one named metric output together with the pydantic schema its value must satisfy."""
+
+    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
+
+    name: str
+    description: str | None = None
+    value_schema: type[SchemaT]
+
+    @field_validator("name")
+    @classmethod
+    def _name_must_not_be_empty(cls, value: str) -> str:
+        """Accept any non-empty name; raise ``ValueError`` for the empty string."""
+        if value:
+            return value
+        raise ValueError("metric output name must not be empty")
+
+    # Convenience constructors, one per built-in value schema.
+    @staticmethod
+    def continuous_score(name: str, description: str | None = None) -> MetricOutputSpec[ContinuousScore]:
+        return MetricOutputSpec[ContinuousScore](value_schema=ContinuousScore, name=name, description=description)
+
+    @staticmethod
+    def discrete_score(name: str, description: str | None = None) -> MetricOutputSpec[DiscreteScore]:
+        return MetricOutputSpec[DiscreteScore](value_schema=DiscreteScore, name=name, description=description)
+
+    @staticmethod
+    def label(name: str, description: str | None = None) -> MetricOutputSpec[Label]:
+        return MetricOutputSpec[Label](value_schema=Label, name=name, description=description)
+
+    @staticmethod
+    def boolean(name: str, description: str | None = None) -> MetricOutputSpec[BooleanValue]:
+        return MetricOutputSpec[BooleanValue](value_schema=BooleanValue, name=name, description=description)
+
+    @staticmethod
+    def model(name: str, value_schema: type[SchemaT], description: str | None = None) -> MetricOutputSpec[SchemaT]:
+        """Build a spec for a caller-supplied ``value_schema`` class."""
+        return MetricOutputSpec[SchemaT](value_schema=value_schema, name=name, description=description)
+
+    def coerce_value(self, value: object) -> SchemaT:
+        """Run ``value`` through ``value_schema.model_validate`` and return the validated instance."""
+        return self.value_schema.model_validate(value)
+
+    def coerce_output(self, output: MetricOutput) -> SchemaT:
+        """Coerce ``output.value`` with this spec; raise ``ValueError`` when the output name differs."""
+        if output.name == self.name:
+            return self.coerce_value(output.value)
+        raise ValueError(f"Expected metric output {self.name!r}, got {output.name!r}")
+
+    def value_json_schema(self) -> dict[str, object]:
+        """Return the JSON schema produced by ``value_schema.model_json_schema()``."""
+        return self.value_schema.model_json_schema()
+
+
+class MetricDescriptor(BaseModel):
+    """Metadata needed to materialize a decorated scorer as a Metric."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    type: str  # metric type identifier; must be non-empty
+    outputs: list[MetricOutputSpec] = Field(min_length=1)  # at least one output; names must be unique
+    config_schema: type[BaseModel] | None = None  # optional pydantic schema for typed scorer config
+
+    @field_validator("type")
+    @classmethod
+    def _type_must_not_be_empty(cls, value: str) -> str:
+        """Reject the empty string as a metric type."""
+        if not value:
+            raise ValueError("metric type must not be empty")
+        return value
+
+    @field_validator("outputs")
+    @classmethod
+    def _output_names_must_be_unique(cls, value: list[MetricOutputSpec]) -> list[MetricOutputSpec]:
+        """Reject output lists that declare the same name more than once."""
+        seen: set[str] = set()
+        # One pass: set.add() returns None, so a name is selected only from its second occurrence onward.
+        duplicates = {spec.name for spec in value if spec.name in seen or seen.add(spec.name)}
+        if duplicates:
+            raise ValueError(f"duplicate metric output names: {sorted(duplicates)}")
+        return value
+
+
+class MetricOutput(BaseModel):
+    """One named value emitted by a metric."""
+
+    model_config = ConfigDict(extra="forbid")  # reject unknown fields
+
+    name: str  # output name; compared with the declaring spec's name during validation
+    value: object  # raw, untyped value; checked against a spec's value_schema separately
+
+
+class MetricResult(BaseModel):
+    """Structured row-level metric result: the list of named outputs produced for one input."""
+
+    model_config = ConfigDict(extra="forbid")  # reject unknown fields
+
+    outputs: list[MetricOutput]  # may be empty here; completeness is checked by validate_metric_result
+
+
+@runtime_checkable
+class Metric(Protocol):
+    """Shared row-scoring primitive (structural protocol; ``isinstance`` only checks member presence)."""
+
+    @property
+    def type(self) -> str: ...  # metric type identifier
+
+    def output_spec(self) -> list[MetricOutputSpec]: ...  # outputs this metric declares
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult: ...  # score one row/candidate pair
+
+
+ScorerReturn = Mapping[str, object] | Awaitable[Mapping[str, object]]  # sync mapping or an awaitable of one
+ScorerCallable = Callable[[ScorerInput[ConfigT]], ScorerReturn]  # signature of a scorer function
+ScorerConfig = Mapping[str, object] | BaseModel  # raw mapping config or typed pydantic config
+
+
+class MetricScorerFunction(Protocol[ConfigT]):
+    """Decorated scorer function that can be materialized as a metric."""
+
+    @property
+    def descriptor(self) -> MetricDescriptor: ...  # metric metadata attached to the scorer
+
+    def __call__(self, sample: ScorerInput[ConfigT]) -> ScorerReturn: ...  # still callable as a plain scorer
+
+    def to_metric(self) -> ScorerFunctionMetric[ConfigT]: ...  # adapter exposing the scorer as a metric
+
+
+class ScorerFunctionMetric(BaseModel, Generic[ConfigT]):
+    """Metric adapter for decorator-authored OSS ScorerInput -> dict scorers."""
+
+    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
+
+    descriptor: MetricDescriptor
+    scorer_fn: ScorerCallable[ConfigT]
+    config: ConfigT | None = None
+    sandbox: SkipValidation[Sandbox] | None = None
+    target: object | None = None
+    target_field: str = "target"
+
+    @model_validator(mode="after")
+    def _validate_bound_config(self) -> ScorerFunctionMetric[ConfigT]:
+        """Normalize a config given to the constructor the same way ``bind`` does (strict, no coercion)."""
+        if self.config is not None:
+            self.config = self._validate_config(self.config)
+        return self
+
+    @property
+    def type(self) -> str:
+        return self.descriptor.type
+
+    def bind(
+        self,
+        *,
+        config: ConfigT | None = None,
+        sandbox: Sandbox | None = None,
+        target: object | None = None,
+        target_field: str | None = None,
+    ) -> ScorerFunctionMetric[ConfigT]:
+        """Return a re-bound copy; when ``config_schema`` is set, ``config`` must already be an instance of it."""
+        validated_config = self.config if config is None else self._validate_config(config)
+        return self._rebind(validated_config, sandbox, target, target_field)
+
+    def bind_raw_config(
+        self,
+        *,
+        config: ScorerConfig | None = None,
+        sandbox: Sandbox | None = None,
+        target: object | None = None,
+        target_field: str | None = None,
+    ) -> ScorerFunctionMetric[ConfigT]:
+        """Bind dict-like runtime config, validating it against ``config_schema`` when present."""
+        validated_config = self.config if config is None else self._validate_config(config, coerce=True)
+        return self._rebind(validated_config, sandbox, target, target_field)
+
+    def _rebind(
+        self,
+        config: ConfigT | None,
+        sandbox: Sandbox | None,
+        target: object | None,
+        target_field: str | None,
+    ) -> ScorerFunctionMetric[ConfigT]:
+        """Copy this metric; ``config`` is applied as given, the other arguments override only when not None."""
+        return self.model_copy(
+            update={
+                "config": config,
+                "sandbox": self.sandbox if sandbox is None else sandbox,
+                "target": self.target if target is None else target,
+                "target_field": self.target_field if target_field is None else target_field,
+            }
+        )
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        return self.descriptor.outputs
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        """Run the wrapped scorer on one row and validate its mapping result against the declared outputs."""
+        sample: ScorerInput[ConfigT] = ScorerInput(
+            response=input.candidate.output_text or "",
+            target=self.target if self.target is not None else input.row.data.get(self.target_field),
+            metadata=dict(input.row.data),
+            config=cast(ConfigT, self._resolve_config()),
+            sandbox=self.sandbox,
+        )
+        result = self.scorer_fn(sample)
+        if inspect.isawaitable(result):
+            result = await result
+        if not isinstance(result, Mapping):
+            raise TypeError(f"scorer_fn must return a mapping, got {type(result).__name__}")
+        outputs = [MetricOutput(name=name, value=value) for name, value in result.items()]
+        return validate_metric_result(MetricResult(outputs=outputs), self.descriptor.outputs)
+
+    def _validate_config(self, config: ConfigT | ScorerConfig, *, coerce: bool = False) -> ConfigT:
+        """Normalize ``config`` to a dict (no schema) or a schema instance; non-instances need ``coerce=True``."""
+        schema = self.descriptor.config_schema
+        if schema is None:
+            if isinstance(config, BaseModel):
+                return cast(ConfigT, config.model_dump())
+            return cast(ConfigT, dict(cast(Mapping[str, object], config)))
+        if isinstance(config, schema):
+            return cast(ConfigT, config)
+        if not coerce:
+            raise TypeError(
+                f"config must be an instance of {schema.__name__}; "
+                "use bind_raw_config(...) to validate dict-like runtime config"
+            )
+        payload = config.model_dump() if isinstance(config, BaseModel) else dict(cast(Mapping[str, object], config))
+        return cast(ConfigT, schema.model_validate(payload))
+
+    def _resolve_config(self) -> ConfigT | Mapping[str, object]:
+        """Return the bound config, or a default: ``{}`` without a schema, else the schema validated from ``{}``."""
+        if self.config is not None:
+            return self.config
+        schema = self.descriptor.config_schema
+        if schema is None:
+            return {}
+        return cast(ConfigT, schema.model_validate({}))
+
+
+def validate_metric_result(result: MetricResult, outputs: list[MetricOutputSpec]) -> MetricResult:
+    """Validate a metric result against its declared outputs and return it unchanged.
+
+    Raises ``ValueError`` for duplicate, missing, or undeclared names (in that order); coerced values are dropped.
+    """
+    returned: set[str] = set()
+    # Single pass instead of list.count() per name: set.add() returns None, so only repeats are selected.
+    duplicates = {out.name for out in result.outputs if out.name in returned or returned.add(out.name)}
+    if duplicates:
+        raise ValueError(f"Duplicate metric output names: {sorted(duplicates)}")
+
+    outputs_by_name = {spec.name: spec for spec in outputs}
+    missing = [spec.name for spec in outputs if spec.name not in returned]
+    if missing:
+        raise ValueError(f"Missing declared metric outputs: {missing}")
+    undeclared = [out.name for out in result.outputs if out.name not in outputs_by_name]
+    if undeclared:
+        raise ValueError(f"Undeclared metric outputs: {undeclared}")
+    for out in result.outputs:
+        outputs_by_name[out.name].coerce_output(out)
+    return result
+
+
+def score_names_from_output_spec(outputs: list[MetricOutputSpec]) -> list[str]:
+    """Return the declared score names, in declaration order.
+
+    An output counts as a score when its schema subclasses ContinuousScore, DiscreteScore, or BooleanValue.
+    """
+    score_schemas = (ContinuousScore, DiscreteScore, BooleanValue)
+    return [spec.name for spec in outputs if issubclass(spec.value_schema, score_schemas)]
+
+
+__all__ = [
+    "BooleanValue",
+    "CandidateOutput",
+    "ContinuousScore",
+    "DatasetRow",
+    "DiscreteScore",
+    "Label",
+    "Metric",
+    "MetricDescriptor",
+    "MetricInput",
+    "MetricOutput",
+    "MetricOutputSpec",
+    "MetricResult",
+    "MetricScorerFunction",
+    "ScorerCallable",
+    "ScorerConfig",
+    "ScorerFunctionMetric",
+    "ScorerReturn",
+    "score_names_from_output_spec",
+    "validate_metric_result",
+]
diff --git a/src/nemo_evaluator/scoring/types.py b/src/nemo_evaluator/scoring/types.py
index fa2426a5a..fe3a7fa43 100644
--- a/src/nemo_evaluator/scoring/types.py
+++ b/src/nemo_evaluator/scoring/types.py
@@ -17,14 +17,17 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
 
 if TYPE_CHECKING:
     from nemo_evaluator.sandbox.base import Sandbox
 
 
+ConfigT = TypeVar("ConfigT")
+
+
 @dataclass
-class ScorerInput:
+class ScorerInput(Generic[ConfigT]):
     """Input passed to scorer functions.
 
     The ``sandbox`` field is available for scorers that need to inspect or
@@ -36,5 +39,5 @@ class ScorerInput:
     response: str
     target: Any
     metadata: dict[str, Any] = field(default_factory=dict)
-    config: dict[str, Any] = field(default_factory=dict)
+    config: ConfigT = field(default_factory=dict)
     sandbox: Sandbox | None = None
diff --git a/tests/test_environments/test_custom_metric_contract.py b/tests/test_environments/test_custom_metric_contract.py
new file mode 100644
index 000000000..27386194c
--- /dev/null
+++ b/tests/test_environments/test_custom_metric_contract.py
@@ -0,0 +1,230 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for BYOB scorer compatibility with the shared metric contract."""
+
+from __future__ import annotations
+
+from typing import Protocol, cast
+
+import pytest
+from pydantic import BaseModel
+
+from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, scorer
+from nemo_evaluator.metrics import ExactMatchMetric
+from nemo_evaluator.scorers import ExactMatchScorer
+from nemo_evaluator.scoring import ScorerInput
+from nemo_evaluator.scoring.metric import Metric, MetricDescriptor, MetricInput, MetricOutputSpec, MetricResult
+from nemo_evaluator.sandbox.base import Sandbox
+
+
+def _dataset() -> list[dict[str, object]]:
+    return [{"question": "2+2", "answer": "4", "category": "math"}]  # single-row fixture shared by the tests
+
+
+class ThresholdConfig(BaseModel):
+    threshold: float  # required; this model is passed as config_schema to the typed scorer test
+
+
+class _MetricScorerForTest(Protocol):  # minimal structural type used as a cast() target for scorer_fn
+    @property
+    def descriptor(self) -> MetricDescriptor: ...
+
+    def to_metric(self) -> Metric: ...
+
+
+@pytest.mark.asyncio
+async def test_plain_scorer_decorator_keeps_current_dict_path() -> None:
+    @scorer  # bare decorator: no metric_type/outputs metadata
+    def plain_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.response == "4"
+        assert sample.target == "4"
+        assert sample.metadata["category"] == "math"
+        assert sample.config["tolerance"] == "exact"  # presumably populated from ``extra`` below
+        return {"correct": True, "extracted": "4", "label": "exact"}
+
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name="plain_contract",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            extra={"tolerance": "exact"},
+            scorer_fn=plain_scorer,
+        )
+    )
+
+    result = await env.verify("4", "4", category="math")
+
+    assert result.reward == 1.0
+    assert result.extracted_answer == "4"
+    assert result.scoring_details == {  # exact match: no "metric_type"/"outputs" keys on the plain path
+        "method": "byob_plain_contract",
+        "correct": True,
+        "extracted": "4",
+        "label": "exact",
+    }
+
+
+@pytest.mark.asyncio
+async def test_typed_scorer_runs_as_metric_through_byob_verify() -> None:
+    outputs = [
+        MetricOutputSpec.continuous_score("reward"),
+        MetricOutputSpec.continuous_score("format"),
+        MetricOutputSpec.label("judge_label"),
+        MetricOutputSpec.label("extracted"),
+    ]
+    sandbox = cast(Sandbox, object())  # opaque sentinel; only its identity is asserted
+
+    @scorer(metric_type="tests.typed_byob", outputs=outputs, config_schema=ThresholdConfig)
+    async def typed_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        assert sample.response == "4"
+        assert sample.target == "4"
+        assert "answer" not in sample.metadata  # the target field must not leak into metadata
+        assert sample.metadata["category"] == "math"
+        assert isinstance(sample.config, ThresholdConfig)
+        assert sample.config.threshold == 0.75
+        assert sample.sandbox is sandbox
+        return {"reward": sample.config.threshold, "format": 1.0, "judge_label": "partial", "extracted": "4"}
+
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name="typed_contract",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            extra={"threshold": "0.75"},  # a string: the scorer asserts it arrives as float 0.75
+            scorer_fn=typed_scorer,
+        )
+    )
+
+    result = await env.verify("4", "4", sandbox=sandbox, category="math")
+
+    assert result.reward == 0.75
+    assert result.extracted_answer == "4"
+    assert result.scoring_details == {  # outputs appear both nested under "outputs" and flattened
+        "method": "byob_typed_contract",
+        "metric_type": "tests.typed_byob",
+        "outputs": {"reward": 0.75, "format": 1.0, "judge_label": "partial", "extracted": "4"},
+        "reward": 0.75,
+        "format": 1.0,
+        "judge_label": "partial",
+        "extracted": "4",
+    }
+
+
+@pytest.mark.asyncio
+async def test_preannotated_metric_scorer_runs_through_byob_verify() -> None:
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name="exact_match_metric",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            scorer_fn=cast(_MetricScorerForTest, ExactMatchScorer(reference="{{item.answer}}")),
+        )
+    )
+
+    matched = await env.verify("4", "4", category="math", answer="4")  # response equals the reference
+    mismatched = await env.verify("5", "4", category="math", answer="4")  # response differs
+
+    assert matched.reward == 1.0
+    assert matched.scoring_details == {
+        "method": "byob_exact_match_metric",
+        "metric_type": "exact-match",
+        "outputs": {"correct": 1.0},
+        "correct": 1.0,
+    }
+    assert mismatched.reward == 0.0
+    assert mismatched.scoring_details["outputs"] == {"correct": 0.0}
+
+
+@pytest.mark.asyncio
+async def test_configured_metric_instance_runs_through_byob_verify() -> None:
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name="exact_match_metric_instance",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            scorer_fn=scorer(ExactMatchMetric(reference="{{item.answer}}")),  # scorer() applied to an instance
+        )
+    )
+
+    result = await env.verify("4", "4", category="math", answer="4")
+
+    assert result.reward == 1.0
+    assert result.scoring_details["metric_type"] == "exact-match"
+    assert result.scoring_details["outputs"] == {"correct": 1.0}
+
+
+@pytest.mark.asyncio
+async def test_class_based_metric_result_is_validated_by_byob_verify() -> None:
+    @scorer  # scorer applied to a class rather than a function
+    class MissingOutputMetric:
+        type = "tests.missing_output"
+
+        def output_spec(self) -> list[MetricOutputSpec]:
+            return [MetricOutputSpec.continuous_score("correct")]
+
+        async def compute_scores(self, input: MetricInput) -> MetricResult:
+            return MetricResult(outputs=[])  # deliberately omits the declared "correct" output
+
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name="missing_output_metric",
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            scorer_fn=cast(_MetricScorerForTest, MissingOutputMetric()),
+        )
+    )
+
+    with pytest.raises(ValueError, match="Missing declared metric outputs"):
+        await env.verify("4", "4", category="math")
+
+
+@pytest.mark.parametrize(
+    ("score_names", "score_values", "expected_reward"),
+    [
+        (["reward", "correct"], {"reward": 0.2, "correct": 1.0}, 0.2),  # expects "reward" to win over "correct"
+        (["quality", "correct"], {"quality": 0.2, "correct": 1.0}, 1.0),  # no "reward": expects "correct"
+        (["quality", "format"], {"quality": 0.4, "format": 1.0}, 0.4),  # neither: expects the first declared
+    ],
+)
+@pytest.mark.asyncio
+async def test_typed_scorer_reward_selection(
+    score_names: list[str], score_values: dict[str, float], expected_reward: float
+) -> None:
+    outputs = [MetricOutputSpec.continuous_score(name) for name in score_names]
+
+    @scorer(metric_type=f"tests.reward_selection.{score_names[0]}", outputs=outputs)
+    def typed_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.response == "4"
+        assert sample.target == "4"
+        return {name: score_values[name] for name in score_names}
+
+    env = ByobEnvironment(
+        BenchmarkDefinition(
+            name=f"typed_reward_{score_names[0]}",  # name varies with the first score of each case
+            dataset=_dataset,
+            prompt="{question}",
+            target_field="answer",
+            scorer_fn=typed_scorer,
+        )
+    )
+
+    result = await env.verify("4", "4", category="math")
+
+    assert result.reward == expected_reward
diff --git a/tests/test_integration/test_eval_loop_integration.py b/tests/test_integration/test_eval_loop_integration.py
index d96098c24..6f8da2588 100644
--- a/tests/test_integration/test_eval_loop_integration.py
+++ b/tests/test_integration/test_eval_loop_integration.py
@@ -15,11 +15,19 @@
 """Integration tests: run_evaluation end-to-end with mock solver."""
 
 import asyncio
+from typing import Any
 
 
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
+from nemo_evaluator.environments.custom import BenchmarkDefinition, ByobEnvironment, benchmark, scorer
+from nemo_evaluator.environments.registry import get_environment
 from nemo_evaluator.engine.eval_loop import run_evaluation
+from nemo_evaluator.metrics import ExactMatchMetric
+from nemo_evaluator.scorers import ExactMatchScorer
 from nemo_evaluator.observability.types import ModelResponse
+from nemo_evaluator.scoring import ScorerInput
+from nemo_evaluator.scoring.metric import MetricOutputSpec
+from nemo_evaluator.sandbox.base import Sandbox
 from nemo_evaluator.solvers import SolveResult
 
 
@@ -38,7 +46,9 @@ async def seed(self, idx):
         r = self._dataset[idx]
         return SeedResult(prompt=r["q"], expected_answer=r["a"], metadata={"idx": idx})
 
-    async def verify(self, response, expected, **meta):
+    async def verify(
+        self, response: str, expected: str, sandbox: Sandbox | None = None, **meta: Any
+    ) -> VerifyResult:
         correct = response.strip() == expected.strip()
         return VerifyResult(
             reward=1.0 if correct else 0.0, extracted_answer=response.strip(), scoring_details={"method": "exact"}
@@ -118,7 +128,7 @@ async def tracking_close():
             closed.append(True)
             await original_close()
 
-        env.close = tracking_close
+        env.close = tracking_close  # type: ignore[method-assign]  # ty: ignore[invalid-assignment]
         solver = _MockSolver()
         asyncio.run(run_evaluation(env, solver, n_repeats=1))
         assert closed, "env.close() was not called"
@@ -191,6 +201,87 @@ def test_concurrent_execution(self):
         results = bundle["_results"]
         assert len(results) == 6
 
+    def test_typed_byob_metric_result_preserved_in_results(self):
+        outputs = [
+            MetricOutputSpec.continuous_score("reward"),
+            MetricOutputSpec.continuous_score("format"),
+            MetricOutputSpec.label("judge_label"),
+            MetricOutputSpec.label("rationale"),
+        ]
+
+        @scorer(metric_type="tests.eval_loop_typed", outputs=outputs)
+        def typed_scorer(sample: ScorerInput) -> dict[str, object]:
+            matched = sample.response == sample.target
+            return {
+                "reward": 0.8 if matched else 0.0,  # presumably 0.8, not 1.0, to prove the scorer's value is kept
+                "format": 1.0,
+                "judge_label": "pass",
+                "rationale": "answer matched",
+            }
+
+        env = ByobEnvironment(
+            BenchmarkDefinition(
+                name="typed_eval_loop",
+                dataset=lambda: [{"question": "1+1", "answer": "2"}],
+                prompt="{question}",
+                target_field="answer",
+                scorer_fn=typed_scorer,
+            )
+        )
+        solver = _MockSolver()
+
+        bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1))
+
+        result = bundle["_results"][0]  # one dataset row with n_repeats=1; inspect the first result
+        assert result["reward"] == 0.8
+        assert result["scoring_details"]["reward"] == 0.8
+        assert result["scoring_details"]["format"] == 1.0
+        assert result["scoring_details"]["outputs"] == {  # nested copy of every declared output
+            "reward": 0.8,
+            "format": 1.0,
+            "judge_label": "pass",
+            "rationale": "answer matched",
+        }
+        assert result["scoring_details"]["judge_label"] == "pass"
+        assert result["scoring_details"]["rationale"] == "answer matched"
+        assert result["scoring_details"]["metric_type"] == "tests.eval_loop_typed"
+
+    def test_preannotated_metric_scorer_result_preserved_in_results(self):
+        benchmark(  # decorator called by hand; the environment is then looked up by this name below
+            name="exact_match_metric_eval_loop_adapter",
+            dataset=lambda: [{"question": "1+1", "answer": "2"}],
+            prompt="{question}",
+            target_field="answer",
+        )(ExactMatchScorer(reference="{{item.answer}}"))
+
+        env = get_environment("exact_match_metric_eval_loop_adapter")
+        solver = _MockSolver()
+
+        bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1))
+
+        result = bundle["_results"][0]
+        assert result["reward"] == 1.0
+        assert result["scoring_details"]["metric_type"] == "exact-match"
+        assert result["scoring_details"]["outputs"] == {"correct": 1.0}
+
+    def test_configured_metric_instance_result_preserved_in_results(self):
+        benchmark(  # decorator called by hand on a scorer()-wrapped ExactMatchMetric instance
+            name="exact_match_metric_instance_eval_loop_adapter",
+            dataset=lambda: [{"question": "1+1", "answer": "2"}],
+            prompt="{question}",
+            target_field="answer",
+        )(scorer(ExactMatchMetric(reference="{{item.answer}}")))
+
+        env = get_environment("exact_match_metric_instance_eval_loop_adapter")
+        solver = _MockSolver()
+
+        bundle = asyncio.run(run_evaluation(env, solver, n_repeats=1))
+
+        result = bundle["_results"][0]
+        assert result["reward"] == 1.0
+        assert result["scoring_details"]["metric_type"] == "exact-match"
+        assert result["scoring_details"]["outputs"] == {"correct": 1.0}
+
 
 class _MockSolverWithTrajectory:
     """Always correct; returns a non-empty trajectory so we can assert it survives resume."""
diff --git a/tests/test_scoring/test_metric_contract.py b/tests/test_scoring/test_metric_contract.py
new file mode 100644
index 000000000..3e094a487
--- /dev/null
+++ b/tests/test_scoring/test_metric_contract.py
@@ -0,0 +1,542 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for the shared MetricInput -> MetricResult contract."""
+
+from __future__ import annotations
+
+from typing import Protocol, cast
+
+import pytest
+from pydantic import BaseModel, ValidationError
+
+from nemo_evaluator.environments.custom import scorer
+from nemo_evaluator.metrics import ExactMatchMetric
+from nemo_evaluator.scorers import ExactMatchScorer
+from nemo_evaluator.scoring import ScorerInput
+from nemo_evaluator.scoring.metric import (
+    CandidateOutput,
+    ContinuousScore,
+    DatasetRow,
+    Label,
+    Metric,
+    MetricDescriptor,
+    MetricInput,
+    MetricOutput,
+    MetricOutputSpec,
+    MetricResult,
+    MetricScorerFunction,
+    ScorerFunctionMetric,
+    score_names_from_output_spec,
+    validate_metric_result,
+)
+from nemo_evaluator.sandbox.base import Sandbox
+
+
+class ThresholdConfig(BaseModel):  # Typed scorer config passed as `config_schema` in the tests below.
+    threshold: float  # required: no default is declared
+    label: str = "pass"
+
+
+class OtherThresholdConfig(BaseModel):  # Same field names as ThresholdConfig, but a distinct model class.
+    threshold: float
+    label: str = "other"  # default differs from ThresholdConfig's "pass"
+
+
+class JudgeDetails(BaseModel):  # Structured value handed to MetricOutputSpec.model(...) in these tests.
+    label: str
+    rationale: str
+    confidence: float  # all three fields are required (no defaults)
+
+
+class _MetricScorerForTest(Protocol):  # Structural type used with cast() on scorer-wrapped metric objects.
+    @property
+    def descriptor(self) -> MetricDescriptor: ...
+    # Tests below assert that to_metric() hands back the wrapped object itself for class-based metrics.
+    def to_metric(self) -> Metric: ...
+
+
+def test_metric_input_groups_row_and_candidate() -> None:
+    metric_input = MetricInput(
+        row=DatasetRow(row_index=7, data={"answer": "Paris", "category": "geography"}),
+        candidate=CandidateOutput(output_text="Paris", metadata={"model": "mock"}),
+    )
+    # Row and candidate values are read back through the input; no sandbox/config attribute is expected.
+    assert metric_input.row.row_index == 7
+    assert metric_input.candidate.output_text == "Paris"
+    assert metric_input.row.data["answer"] == "Paris"
+    assert not hasattr(metric_input, "sandbox")
+    assert not hasattr(metric_input, "config")
+
+
+def test_metric_output_spec_convenience_constructors_and_json_schema() -> None:
+    score = MetricOutputSpec.continuous_score("reward", "Reward score")
+    label = MetricOutputSpec.label("judge_label")
+    details = MetricOutputSpec.model("judge_details", JudgeDetails)
+    # Each constructor should record its name/description and expose the matching value schema type.
+    assert score.name == "reward"
+    assert score.description == "Reward score"
+    assert score.value_schema is ContinuousScore
+    assert score.value_json_schema()["type"] == "number"
+    assert label.value_schema is Label
+    assert details.value_schema is JudgeDetails
+    schema_properties = cast(dict[str, object], details.value_json_schema()["properties"])  # cast for typing only
+    confidence_schema = cast(dict[str, object], schema_properties["confidence"])
+    assert confidence_schema["type"] == "number"
+
+
+def test_metric_output_spec_coerces_values_to_declared_schema() -> None:
+    reward = MetricOutputSpec.continuous_score("reward")
+    details = MetricOutputSpec.model("judge_details", JudgeDetails)
+    # coerce_output is given a whole MetricOutput; coerce_value is given the bare value.
+    coerced_reward = reward.coerce_output(MetricOutput(name="reward", value=1))
+    coerced_details = details.coerce_value(
+        {"label": "pass", "rationale": "all checks passed", "confidence": 0.9}
+    )
+
+    assert isinstance(coerced_reward, ContinuousScore)
+    assert coerced_reward.root == 1.0
+    assert isinstance(coerced_details, JudgeDetails)
+    assert coerced_details.label == "pass"
+    # An output whose name differs from the spec's name is expected to raise ValueError.
+    with pytest.raises(ValueError, match="Expected metric output"):
+        reward.coerce_output(MetricOutput(name="other", value=1))
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_adapts_dict_return_to_metric_outputs() -> None:
+    outputs = [
+        MetricOutputSpec.boolean("correct"),
+        MetricOutputSpec.continuous_score("reward"),
+        MetricOutputSpec.discrete_score("attempts"),
+        MetricOutputSpec.label("extracted"),
+        MetricOutputSpec.model("judge", JudgeDetails),
+    ]
+    descriptor = MetricDescriptor(type="tests.dict_adapter", outputs=outputs)
+    # Plain sync scorer returning one dict entry per declared output name.
+    def sync_scorer(sample: ScorerInput) -> dict[str, object]:
+        return {
+            "correct": True,
+            "reward": 0.25,
+            "attempts": 2,
+            "extracted": "A",
+            "judge": {"label": "partial", "rationale": "close", "confidence": 0.5},
+        }
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer)
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
+    )
+    # Result outputs are expected to carry the same names and values that the scorer dict contained.
+    assert {output.name: output.value for output in result.outputs} == {
+        "correct": True,
+        "reward": 0.25,
+        "attempts": 2,
+        "extracted": "A",
+        "judge": {"label": "partial", "rationale": "close", "confidence": 0.5},
+    }
+
+
+def test_validate_metric_result_accepts_declared_outputs() -> None:
+    outputs = [
+        MetricOutputSpec.continuous_score("reward"),
+        MetricOutputSpec.boolean("correct"),
+        MetricOutputSpec.label("label"),
+    ]
+    result = MetricResult(
+        outputs=[
+            MetricOutput(name="reward", value=True),
+            MetricOutput(name="correct", value=True),
+            MetricOutput(name="label", value="yes"),
+        ]
+    )
+
+    validated = validate_metric_result(result, outputs)
+    # Outputs are expected back unchanged, including the bool True supplied for the continuous "reward".
+    assert validated.outputs == [
+        MetricOutput(name="reward", value=True),
+        MetricOutput(name="correct", value=True),
+        MetricOutput(name="label", value="yes"),
+    ]
+
+
+def test_typed_scorer_decorator_exposes_config_schema() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+
+    @scorer(metric_type="tests.threshold", outputs=outputs, config_schema=ThresholdConfig)
+    def threshold_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        return {"reward": sample.config.threshold >= 0.5}
+    # NOTE(review): the annotated assignments presumably exist so a static type checker verifies the generics.
+    typed_scorer: MetricScorerFunction[ThresholdConfig] = threshold_scorer
+    metric: ScorerFunctionMetric[ThresholdConfig] = typed_scorer.to_metric()
+
+    assert isinstance(metric, BaseModel)
+    assert threshold_scorer.descriptor.config_schema is ThresholdConfig
+    assert metric.type == "tests.threshold"
+
+
+def test_typed_scorer_decorator_requires_metric_type_and_outputs_for_metric_contract_options() -> None:
+    with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"):
+        scorer(metric_type="tests.missing_outputs")  # type: ignore[reportArgumentType]  # ty: ignore[invalid-argument-type]
+    # config_schema alone (no outputs) is expected to hit the same "outputs" error as metric_type alone.
+    with pytest.raises(ValueError, match=r"Metric contract.*outputs=\[MetricOutputSpec"):
+        scorer(config_schema=ThresholdConfig)  # type: ignore[reportArgumentType]  # ty: ignore[invalid-argument-type]
+    # outputs without metric_type is expected to raise the "no metric_type was declared" error instead.
+    with pytest.raises(ValueError, match="no metric_type was declared"):
+        scorer(outputs=[MetricOutputSpec.continuous_score("reward")])  # type: ignore[call-overload]  # ty: ignore[invalid-argument-type]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_prefers_bound_target_over_row_field() -> None:
+    outputs = [MetricOutputSpec.boolean("reward")]
+    descriptor = MetricDescriptor(type="tests.bound_target", outputs=outputs)
+    # The scorer checks that it sees the bound target while the row's "answer" stays available in metadata.
+    def sync_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.target == "expected-from-verify"
+        assert sample.metadata["answer"] == "answer-from-row"
+        return {"reward": True}
+    # Both an explicit target and a target_field are bound; the row's "answer" holds a different value.
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind(
+        target="expected-from-verify",
+        target_field="answer",
+    )
+
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "answer-from-row"}),
+            candidate=CandidateOutput(output_text="candidate"),
+        )
+    )
+
+    assert result.outputs == [MetricOutput(name="reward", value=True)]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_accepts_typed_config_model() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.typed_config", outputs=outputs, config_schema=ThresholdConfig)
+    # The scorer itself asserts that it receives the exact ThresholdConfig values bound below.
+    def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        assert isinstance(sample.config, ThresholdConfig)
+        assert sample.config.threshold == 0.5
+        assert sample.config.label == "typed"
+        return {"reward": sample.config.threshold}
+    # bind() is given an already-constructed ThresholdConfig instance.
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind(
+        config=ThresholdConfig(threshold=0.5, label="typed")
+    )
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
+    )
+
+    assert result.outputs == [MetricOutput(name="reward", value=0.5)]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_validates_raw_config_against_typed_schema() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.raw_typed_config", outputs=outputs, config_schema=ThresholdConfig)
+    # The scorer expects a ThresholdConfig holding float 0.75 and the default label "pass".
+    def sync_scorer(sample: ScorerInput[ThresholdConfig]) -> dict[str, object]:
+        assert isinstance(sample.config, ThresholdConfig)
+        assert sample.config.threshold == 0.75
+        assert sample.config.label == "pass"
+        return {"reward": sample.config.threshold}
+    # bind_raw_config is given a plain dict whose threshold is the string "0.75".
+    metric = ScorerFunctionMetric(
+        descriptor=descriptor,
+        scorer_fn=sync_scorer,
+    ).bind_raw_config(config={"threshold": "0.75"})
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
+    )
+
+    assert result.outputs == [MetricOutput(name="reward", value=0.75)]
+
+
+def test_scorer_function_metric_rejects_invalid_typed_config() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.invalid_config", outputs=outputs, config_schema=ThresholdConfig)
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=lambda sample: {"reward": True})
+    # A non-numeric threshold string is expected to fail in bind_raw_config itself; no scoring call is made.
+    with pytest.raises(ValidationError):
+        metric.bind_raw_config(config={"threshold": "not-a-number"})
+
+
+def test_scorer_function_metric_bind_rejects_wrong_config_model_subtype() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.wrong_config_subtype", outputs=outputs, config_schema=ThresholdConfig)
+    metric = ScorerFunctionMetric(
+        descriptor=descriptor,
+        scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold},
+    )
+    # OtherThresholdConfig is not the declared config_schema; the TypeError message should name ThresholdConfig.
+    with pytest.raises(TypeError, match="ThresholdConfig"):
+        metric.bind(config=OtherThresholdConfig(threshold=0.75))
+
+
+def test_scorer_function_metric_bind_rejects_raw_mapping_for_typed_config() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    descriptor = MetricDescriptor(type="tests.raw_config_on_typed_bind", outputs=outputs, config_schema=ThresholdConfig)
+    metric = ScorerFunctionMetric(
+        descriptor=descriptor,
+        scorer_fn=lambda sample: {"reward": cast(ThresholdConfig, sample.config).threshold},
+    )
+    # A plain dict passed to bind() is expected to raise a TypeError whose message mentions bind_raw_config.
+    with pytest.raises(TypeError, match="bind_raw_config"):
+        metric.bind(config={"threshold": 0.75})
+
+
+def test_validate_metric_result_rejects_duplicate_output_names() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    result = MetricResult(
+        outputs=[
+            MetricOutput(name="reward", value=1.0),
+            MetricOutput(name="reward", value=0.0),
+        ]
+    )
+    # Two outputs share the name "reward", so validation is expected to raise.
+    with pytest.raises(ValueError, match="Duplicate metric output"):
+        validate_metric_result(result, outputs)
+
+
+def test_validate_metric_result_rejects_missing_declared_outputs() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.continuous_score("format")]
+    result = MetricResult(outputs=[MetricOutput(name="reward", value=1.0)])
+    # "format" is declared but absent from the result.
+    with pytest.raises(ValueError, match="Missing declared metric outputs"):
+        validate_metric_result(result, outputs)
+
+
+def test_validate_metric_result_rejects_undeclared_outputs() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward")]
+    result = MetricResult(
+        outputs=[
+            MetricOutput(name="reward", value=1.0),
+            MetricOutput(name="format", value=1.0),
+        ]
+    )
+    # "format" is present in the result but was never declared in `outputs`.
+    with pytest.raises(ValueError, match="Undeclared metric outputs"):
+        validate_metric_result(result, outputs)
+
+
+def test_validate_metric_result_rejects_value_that_does_not_match_schema() -> None:
+    outputs = [MetricOutputSpec.model("judge_details", JudgeDetails)]
+    result = MetricResult(outputs=[MetricOutput(name="judge_details", value={"label": "pass"})])
+    # The dict lacks JudgeDetails' required "rationale" and "confidence" fields.
+    with pytest.raises(ValidationError):
+        validate_metric_result(result, outputs)
+
+
+def test_scorer_decorator_returns_subclass_without_mutating_original_metric_class() -> None:
+    scorer_cls = scorer(ExactMatchMetric)
+    # scorer(...) on the class should yield a new subclass; plain ExactMatchMetric instances stay unwrapped.
+    assert scorer_cls is not ExactMatchMetric
+    assert issubclass(scorer_cls, ExactMatchMetric)
+    assert not hasattr(ExactMatchMetric(reference="{{item.answer}}"), "descriptor")
+    assert not hasattr(ExactMatchMetric(reference="{{item.answer}}"), "to_metric")
+    # Instances of the returned subclass are expected to carry a descriptor and return themselves from to_metric().
+    scorer_metric = cast(_MetricScorerForTest, scorer_cls(reference="{{item.answer}}"))
+    assert scorer_metric.descriptor == MetricDescriptor(
+        type="exact-match",
+        outputs=[MetricOutputSpec.continuous_score("correct")],
+    )
+    assert scorer_metric.to_metric() is scorer_metric
+
+
+@pytest.mark.asyncio
+async def test_exact_match_metric_is_undecorated_reusable_metric() -> None:
+    metric = ExactMatchMetric(reference="{{item.answer}}")
+    # A bare ExactMatchMetric should have no scorer attributes but still report its type and output spec.
+    assert not hasattr(metric, "descriptor")
+    assert not hasattr(metric, "to_metric")
+    assert isinstance(metric, BaseModel)
+    assert metric.type == "exact-match"
+    assert metric.model_dump()["type"] == "exact-match"
+    assert metric.output_spec() == [MetricOutputSpec.continuous_score("correct")]
+    # Row answer and candidate text are both "Paris", so "correct" is expected to be 1.0.
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "Paris"}),
+            candidate=CandidateOutput(output_text="Paris"),
+        )
+    )
+
+    assert result.outputs == [MetricOutput(name="correct", value=1.0)]
+
+
+@pytest.mark.asyncio
+async def test_exact_match_scorer_exposes_descriptor_and_to_metric() -> None:
+    scorer_metric = cast(_MetricScorerForTest, ExactMatchScorer(reference="{{item.answer}}"))
+    # ExactMatchScorer instances are expected to expose the exact-match descriptor directly.
+    assert scorer_metric.descriptor == MetricDescriptor(
+        type="exact-match",
+        outputs=[MetricOutputSpec.continuous_score("correct")],
+    )
+    # The object returned by to_metric() is then scored against a matching row/candidate pair.
+    metric = scorer_metric.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "Paris"}),
+            candidate=CandidateOutput(output_text="Paris"),
+        )
+    )
+
+    assert metric.type == "exact-match"
+    assert result.outputs == [MetricOutput(name="correct", value=1.0)]
+
+
+@pytest.mark.asyncio
+async def test_scorer_decorator_adapts_exact_match_metric_instances() -> None:
+    scorer_metric = scorer(ExactMatchMetric(reference="{{item.answer}}"))
+    # Wrapping an instance (rather than the class) should still expose the exact-match descriptor.
+    assert scorer_metric.descriptor == MetricDescriptor(
+        type="exact-match",
+        outputs=[MetricOutputSpec.continuous_score("correct")],
+    )
+    # The object returned by to_metric() is then scored against a matching row/candidate pair.
+    metric = scorer_metric.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "Paris"}),
+            candidate=CandidateOutput(output_text="Paris"),
+        )
+    )
+
+    assert metric.type == "exact-match"
+    assert result.outputs == [MetricOutput(name="correct", value=1.0)]
+
+
+@pytest.mark.asyncio
+async def test_exact_match_metric_supports_top_level_and_sample_template_aliases() -> None:
+    metric = ExactMatchMetric(reference="{{answer}}", candidate="{{sample.prediction}}")
+    # Presumably "{{answer}}" reads row data and "{{sample.prediction}}" reads candidate metadata - confirm.
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "New York"}),
+            candidate=CandidateOutput(metadata={"prediction": "new york"}),
+        )
+    )
+    # "New York" vs "new york" is expected to score 1.0, i.e. the case difference must not break the match.
+    assert result.outputs == [MetricOutput(name="correct", value=1.0)]
+
+
+@pytest.mark.asyncio
+async def test_class_based_scorer_to_metric_returns_configured_instance() -> None:
+    @scorer
+    class MissingOutputMetric:
+        type = "tests.missing_output"
+
+        def output_spec(self) -> list[MetricOutputSpec]:
+            return [MetricOutputSpec.continuous_score("correct")]
+
+        async def compute_scores(self, input: MetricInput) -> MetricResult:
+            return MetricResult(outputs=[])
+    # compute_scores returns no outputs although "correct" is declared; the result is expected back as-is.
+    instance = cast(_MetricScorerForTest, MissingOutputMetric())
+    metric = instance.to_metric()
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={}), candidate=CandidateOutput(output_text="candidate"))
+    )
+    # to_metric() should hand back the very same instance rather than a copy or wrapper.
+    assert metric is instance
+    assert result == MetricResult(outputs=[])
+
+
+def test_scorer_decorator_can_adapt_metric_instances() -> None:
+    class UndecoratedMetric:
+        type = "tests.undecorated"
+
+        def output_spec(self) -> list[MetricOutputSpec]:
+            return [MetricOutputSpec.continuous_score("correct")]
+
+        async def compute_scores(self, input: MetricInput) -> MetricResult:
+            return MetricResult(outputs=[MetricOutput(name="correct", value=1.0)])
+    # scorer() is applied to an already-built instance of a plain class that does not subclass anything.
+    metric = scorer(UndecoratedMetric())
+    # The descriptor is expected to mirror the instance's `type` attribute and its output_spec().
+    assert metric.descriptor == MetricDescriptor(
+        type="tests.undecorated",
+        outputs=[MetricOutputSpec.continuous_score("correct")],
+    )
+    assert metric.to_metric() is metric
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_executes_sync_scorers() -> None:
+    outputs = [MetricOutputSpec.boolean("reward"), MetricOutputSpec.label("label")]
+    descriptor = MetricDescriptor(type="tests.sync_metric", outputs=outputs)
+    sandbox = cast(Sandbox, object())  # placeholder object; the scorer only checks its identity
+    # The scorer asserts on every ScorerInput field it is expected to receive from the bound metric.
+    def sync_scorer(sample: ScorerInput) -> dict[str, object]:
+        assert sample.response == "yes"
+        assert sample.target == "yes"
+        assert sample.metadata["category"] == "boolean"
+        assert sample.config == {"mode": "strict"}
+        assert sample.sandbox is sandbox
+        return {"reward": True, "label": "matched"}
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=sync_scorer).bind(
+        config={"mode": "strict"},
+        sandbox=sandbox,
+        target_field="answer",
+    )
+
+    result = await metric.compute_scores(
+        MetricInput(
+            row=DatasetRow(data={"answer": "yes", "category": "boolean"}),
+            candidate=CandidateOutput(output_text="yes"),
+        )
+    )
+    # score_names_from_output_spec is expected to list only "reward", leaving out the label output.
+    assert metric.type == "tests.sync_metric"
+    assert score_names_from_output_spec(metric.output_spec()) == ["reward"]
+    assert result.outputs == [MetricOutput(name="reward", value=True), MetricOutput(name="label", value="matched")]
+
+
+@pytest.mark.asyncio
+async def test_scorer_function_metric_executes_async_scorers() -> None:
+    outputs = [MetricOutputSpec.continuous_score("reward"), MetricOutputSpec.label("seen")]
+    descriptor = MetricDescriptor(type="tests.async_metric", outputs=outputs)
+    # Coroutine scorer that echoes the received response back as the "seen" label.
+    async def async_scorer(sample: ScorerInput) -> dict[str, object]:
+        return {"reward": 0.5, "seen": sample.response}
+
+    metric = ScorerFunctionMetric(descriptor=descriptor, scorer_fn=async_scorer)
+
+    result = await metric.compute_scores(
+        MetricInput(row=DatasetRow(data={"answer": "yes"}), candidate=CandidateOutput(output_text="maybe"))
+    )
+    # "seen" is expected to equal the candidate's output_text, and only "reward" counts as a score name.
+    assert metric.type == "tests.async_metric"
+    assert score_names_from_output_spec(metric.output_spec()) == ["reward"]
+    assert result.outputs == [MetricOutput(name="reward", value=0.5), MetricOutput(name="seen", value="maybe")]
+
+
+def test_typed_scorer_decorator_exposes_descriptor_and_to_metric() -> None:
+    outputs = [MetricOutputSpec.boolean("truthful"), MetricOutputSpec.label("judge_grade")]
+
+    @scorer(metric_type="truthfulqa", outputs=outputs)
+    def truthfulqa_scorer(sample: ScorerInput) -> dict[str, object]:
+        return {"truthful": bool(sample.response), "judge_grade": "C"}
+
+    metric = truthfulqa_scorer.to_metric()
+    # Only the boolean "truthful" output is expected among the score names; "judge_grade" is a label.
+    assert truthfulqa_scorer.descriptor == MetricDescriptor(type="truthfulqa", outputs=outputs)
+    assert metric.type == "truthfulqa"
+    assert score_names_from_output_spec(metric.output_spec()) == ["truthful"]