From 367d8604b087b214f7233a94ec9f50fc48a09469 Mon Sep 17 00:00:00 2001 From: "Cody J. Hanson" Date: Wed, 5 Nov 2025 13:13:33 -0600 Subject: [PATCH 1/9] Add metadata support to deferred tool exceptions Enables CallDeferred and ApprovalRequired exceptions to carry arbitrary metadata via an optional `metadata` parameter. The metadata is accessible in DeferredToolRequests.metadata keyed by tool_call_id. This allows tools to: - Provide cost/time estimates for approval decisions - Include task IDs for external execution tracking - Store context about why approval is required - Attach priority or urgency information Backward compatible - metadata defaults to empty dict if not provided. --- docs/deferred-tools.md | 105 +++++++++++ pydantic_ai_slim/pydantic_ai/_agent_graph.py | 16 +- pydantic_ai_slim/pydantic_ai/exceptions.py | 16 +- pydantic_ai_slim/pydantic_ai/tools.py | 6 + tests/evals/test_reporting.py | 105 ++++++----- tests/test_agent.py | 10 +- tests/test_streaming.py | 25 ++- tests/test_tools.py | 172 ++++++++++++++++++- tests/test_ui.py | 2 +- 9 files changed, 396 insertions(+), 61 deletions(-) diff --git a/docs/deferred-tools.md b/docs/deferred-tools.md index e5e5201163..b01d4d1f7b 100644 --- a/docs/deferred-tools.md +++ b/docs/deferred-tools.md @@ -320,6 +320,111 @@ async def main(): _(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main())` to run `main`)_ +## Attaching Metadata to Deferred Tools + +Both [`CallDeferred`][pydantic_ai.exceptions.CallDeferred] and [`ApprovalRequired`][pydantic_ai.exceptions.ApprovalRequired] exceptions accept an optional `metadata` parameter that allows you to attach arbitrary context information to deferred tool calls. This metadata is then available in the [`DeferredToolRequests.metadata`][pydantic_ai.tools.DeferredToolRequests.metadata] dictionary, keyed by the tool call ID. 
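+
+As a minimal sketch (the `delete_file` tool and the prompt here are purely illustrative; a fuller example follows below), you raise the exception with a metadata dict inside the tool and read it back from the run output:
+
+```python
+from pydantic_ai import Agent, ApprovalRequired, DeferredToolRequests, RunContext
+
+agent = Agent('openai:gpt-5', output_type=[str, DeferredToolRequests])
+
+
+@agent.tool
+def delete_file(ctx: RunContext, path: str) -> str:
+    if not ctx.tool_call_approved:
+        # Attach whatever context the approver needs to make a decision.
+        raise ApprovalRequired(metadata={'reason': f'Permanently deletes {path}'})
+    return f'{path} deleted'
+
+
+result = agent.run_sync('Delete the old report file')
+if isinstance(result.output, DeferredToolRequests):
+    for call in result.output.approvals:
+        # Metadata is looked up by the tool call ID of each deferred call.
+        print(call.tool_name, result.output.metadata.get(call.tool_call_id, {}))
+```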
+ +Common use cases for metadata include: + +- Providing cost estimates or time estimates for approval decisions +- Including task IDs or tracking information for external execution +- Storing context about why approval is required +- Attaching priority or urgency information + +Here's an example showing how to use metadata with both approval-required and external tools: + +```python {title="deferred_tools_with_metadata.py"} +from pydantic_ai import ( + Agent, + ApprovalRequired, + CallDeferred, + DeferredToolRequests, + DeferredToolResults, + RunContext, + ToolApproved, + ToolDenied, +) + +agent = Agent('openai:gpt-5', output_type=[str, DeferredToolRequests]) + + +@agent.tool +def expensive_compute(ctx: RunContext, task_id: str) -> str: + if not ctx.tool_call_approved: + raise ApprovalRequired( + metadata={ + 'task_id': task_id, + 'estimated_cost_usd': 25.50, + 'estimated_time_minutes': 15, + 'reason': 'High compute cost', + } + ) + return f'Task {task_id} completed' + + +@agent.tool +async def external_api_call(ctx: RunContext, endpoint: str) -> str: + # Schedule the external API call and defer execution + task_id = f'api_call_{ctx.tool_call_id}' + + raise CallDeferred( + metadata={ + 'task_id': task_id, + 'endpoint': endpoint, + 'priority': 'high', + } + ) + + +result = agent.run_sync('Run expensive task-123 and call the /data endpoint') +messages = result.all_messages() + +assert isinstance(result.output, DeferredToolRequests) +requests = result.output + +# Handle approvals with metadata +for call in requests.approvals: + metadata = requests.metadata.get(call.tool_call_id, {}) + print(f"Approval needed for {call.tool_name}") + print(f" Cost: ${metadata.get('estimated_cost_usd')}") + print(f" Time: {metadata.get('estimated_time_minutes')} minutes") + print(f" Reason: {metadata.get('reason')}") + +# Handle external calls with metadata +for call in requests.calls: + metadata = requests.metadata.get(call.tool_call_id, {}) + print(f"External call to {call.tool_name}") + print(f" Task ID: {metadata.get('task_id')}") + print(f" Priority: {metadata.get('priority')}") + +# Build results with approvals and external results +results = DeferredToolResults() +for call in requests.approvals: + metadata = requests.metadata.get(call.tool_call_id, {}) + cost = metadata.get('estimated_cost_usd', 0) + + if cost < 50: # Approve if cost is under $50 + results.approvals[call.tool_call_id] = ToolApproved() + else: + results.approvals[call.tool_call_id] = ToolDenied('Cost too high') + +for call in requests.calls: + metadata = requests.metadata.get(call.tool_call_id, {}) + # Simulate getting result from external task + task_id = metadata.get('task_id') + results.calls[call.tool_call_id] = f'Result from {task_id}: success' + +result = agent.run_sync(message_history=messages, deferred_tool_results=results) +print(result.output) +""" +I completed task-123 and retrieved data from the /data endpoint. +""" +``` + +_(This example is complete, it can be run "as is")_ + +The metadata dictionary can contain any JSON-serializable values and is entirely application-defined. If no metadata is provided when raising the exception, the tool call ID will still be present in the `metadata` dictionary with an empty dict as the value for backward compatibility. 
+ ## See Also - [Function Tools](tools.md) - Basic tool concepts and registration diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py index c167521079..d2547490d2 100644 --- a/pydantic_ai_slim/pydantic_ai/_agent_graph.py +++ b/pydantic_ai_slim/pydantic_ai/_agent_graph.py @@ -883,6 +883,7 @@ async def process_tool_calls( # noqa: C901 calls_to_run = [call for call in calls_to_run if call.tool_call_id in calls_to_run_results] deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]] = defaultdict(list) + deferred_metadata: dict[str, dict[str, Any]] = {} if calls_to_run: async for event in _call_tools( @@ -894,6 +895,7 @@ async def process_tool_calls( # noqa: C901 usage_limits=ctx.deps.usage_limits, output_parts=output_parts, output_deferred_calls=deferred_calls, + output_deferred_metadata=deferred_metadata, ): yield event @@ -927,6 +929,7 @@ async def process_tool_calls( # noqa: C901 deferred_tool_requests = _output.DeferredToolRequests( calls=deferred_calls['external'], approvals=deferred_calls['unapproved'], + metadata=deferred_metadata, ) final_result = result.FinalResult(cast(NodeRunEndT, deferred_tool_requests), None, None) @@ -944,10 +947,12 @@ async def _call_tools( usage_limits: _usage.UsageLimits, output_parts: list[_messages.ModelRequestPart], output_deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]], + output_deferred_metadata: dict[str, dict[str, Any]], ) -> AsyncIterator[_messages.HandleResponseEvent]: tool_parts_by_index: dict[int, _messages.ModelRequestPart] = {} user_parts_by_index: dict[int, _messages.UserPromptPart] = {} deferred_calls_by_index: dict[int, Literal['external', 'unapproved']] = {} + deferred_metadata_by_index: dict[int, dict[str, Any]] = {} if usage_limits.tool_calls_limit is not None: projected_usage = deepcopy(usage) @@ -982,10 +987,12 @@ async def handle_call_or_result( tool_part, tool_user_content = ( (await coro_or_task) if inspect.isawaitable(coro_or_task) else coro_or_task.result() ) - except exceptions.CallDeferred: + except exceptions.CallDeferred as e: deferred_calls_by_index[index] = 'external' - except exceptions.ApprovalRequired: + deferred_metadata_by_index[index] = e.metadata + except exceptions.ApprovalRequired as e: deferred_calls_by_index[index] = 'unapproved' + deferred_metadata_by_index[index] = e.metadata else: tool_parts_by_index[index] = tool_part if tool_user_content: @@ -1024,7 +1031,10 @@ async def handle_call_or_result( output_parts.extend([user_parts_by_index[k] for k in sorted(user_parts_by_index)]) for k in sorted(deferred_calls_by_index): - output_deferred_calls[deferred_calls_by_index[k]].append(tool_calls[k]) + call = tool_calls[k] + output_deferred_calls[deferred_calls_by_index[k]].append(call) + if k in deferred_metadata_by_index: + output_deferred_metadata[call.tool_call_id] = deferred_metadata_by_index[k] async def _call_tool( diff --git a/pydantic_ai_slim/pydantic_ai/exceptions.py b/pydantic_ai_slim/pydantic_ai/exceptions.py index ae5cce0908..beff11f4da 100644 --- a/pydantic_ai_slim/pydantic_ai/exceptions.py +++ b/pydantic_ai_slim/pydantic_ai/exceptions.py @@ -67,18 +67,30 @@ class CallDeferred(Exception): """Exception to raise when a tool call should be deferred. See [tools docs](../deferred-tools.md#deferred-tools) for more information. + + Args: + metadata: Optional dictionary of metadata to attach to the deferred tool call. 
+ This metadata will be available in `DeferredToolRequests.metadata` keyed by `tool_call_id`. """ - pass + def __init__(self, metadata: dict[str, Any] | None = None): + self.metadata = metadata or {} + super().__init__() class ApprovalRequired(Exception): """Exception to raise when a tool call requires human-in-the-loop approval. See [tools docs](../deferred-tools.md#human-in-the-loop-tool-approval) for more information. + + Args: + metadata: Optional dictionary of metadata to attach to the deferred tool call. + This metadata will be available in `DeferredToolRequests.metadata` keyed by `tool_call_id`. """ - pass + def __init__(self, metadata: dict[str, Any] | None = None): + self.metadata = metadata or {} + super().__init__() class UserError(RuntimeError): diff --git a/pydantic_ai_slim/pydantic_ai/tools.py b/pydantic_ai_slim/pydantic_ai/tools.py index da053a5191..beae26661b 100644 --- a/pydantic_ai_slim/pydantic_ai/tools.py +++ b/pydantic_ai_slim/pydantic_ai/tools.py @@ -147,6 +147,12 @@ class DeferredToolRequests: """Tool calls that require external execution.""" approvals: list[ToolCallPart] = field(default_factory=list) """Tool calls that require human-in-the-loop approval.""" + metadata: dict[str, dict[str, Any]] = field(default_factory=dict) + """Metadata for deferred tool calls, keyed by tool_call_id. + + This contains any metadata that was provided when raising [`CallDeferred`][pydantic_ai.exceptions.CallDeferred] + or [`ApprovalRequired`][pydantic_ai.exceptions.ApprovalRequired] exceptions. + """ @dataclass(kw_only=True) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 8e575e4bfc..80518dfcec 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -988,13 +988,19 @@ async def test_evaluation_renderer_with_experiment_metadata(sample_report_case: │ temperature: 0.7 │ │ prompt_version: v2 │ ╰───────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ {'query': 'What is 2+2?'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -├───────────┼───────────────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ -│ Averages │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ -└───────────┴───────────────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ +┃ ┃ ┃ ┃ ┃ ┃ Assertio ┃ ┃ +┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ ns ┃ Duration ┃ +┡━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_ca… │ {'query' │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ : 'What │ 2.50 │ hello │ 0.950 │ │ │ +│ │ is │ │ │ │ │ │ +│ │ 2+2?'} │ │ │ │ │ │ +├──────────┼──────────┼───────────┼──────────┼───────────┼──────────┼──────────┤ +│ Averages │ │ score1: │ label1: │ accuracy: │ 100.0% ✔ │ 100.0ms │ +│ │ │ 2.50 │ {'hello' │ 0.950 │ │ │ +│ │ │ │ : 1.0} │ │ │ │ +└──────────┴──────────┴───────────┴──────────┴───────────┴──────────┴──────────┘ """) @@ -1048,11 +1054,12 @@ async def test_evaluation_renderer_with_long_experiment_metadata(sample_report_c │ frequency_penalty: 0.1 │ │ 
presence_penalty: 0.1 │ ╰────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1098,13 +1105,16 @@ async def test_evaluation_renderer_diff_with_experiment_metadata(sample_report_c │ model: gpt-4 → gpt-4o │ │ temperature: 0.5 → 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -├───────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ -│ Averages │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ -└───────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +├───────────┼─────────────┼──────────────┼─────────────┼────────────┼──────────┤ +│ Averages │ score1: │ label1: │ accuracy: │ 100.0% ✔ │ 100.0ms │ +│ │ 2.50 │ {'hello': │ 0.950 │ │ │ +│ │ │ 1.0} │ │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1150,11 +1160,12 @@ async def test_evaluation_renderer_diff_with_only_new_metadata(sample_report_cas │ + model: gpt-4o │ │ + temperature: 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1200,11 +1211,12 @@ async def test_evaluation_renderer_diff_with_only_baseline_metadata(sample_repor │ - model: gpt-4 │ │ - temperature: 0.5 │ ╰─────────────────────────────────────────────────╯ 
-┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1251,11 +1263,12 @@ async def test_evaluation_renderer_diff_with_same_metadata(sample_report_case: R │ model: gpt-4o │ │ temperature: 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1311,11 +1324,12 @@ async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case │ preserved-key: preserved value │ │ updated-key: original value → updated value │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1355,10 +1369,11 @@ async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: Rep include_errors=False, # Prevent failures table from being added ) assert output == snapshot("""\ - Evaluation Diff: baseline_report → new_report \n\ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ + 
Evaluation Diff: baseline_report → new_report \n\ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) diff --git a/tests/test_agent.py b/tests/test_agent.py index 0a6bf1e325..9d690fcb03 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -4857,9 +4857,13 @@ def call_second(): else: result = agent.run_sync(user_prompt) - assert result.output == snapshot( - DeferredToolRequests(approvals=[ToolCallPart(tool_name='requires_approval', tool_call_id=IsStr())]) - ) + assert isinstance(result.output, DeferredToolRequests) + assert len(result.output.approvals) == 1 + assert result.output.approvals[0].tool_name == 'requires_approval' + # Check metadata exists for this tool_call_id + tool_call_id = result.output.approvals[0].tool_call_id + assert tool_call_id in result.output.metadata + assert result.output.metadata[tool_call_id] == {} assert integer_holder == 2 diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 1a126f26dc..0ff0c8e784 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -1149,9 +1149,13 @@ def regular_tool(x: int) -> int: async with agent.run_stream('test early strategy with external tool call') as result: response = await result.get_output() - assert response == snapshot( - DeferredToolRequests(calls=[ToolCallPart(tool_name='deferred_tool', tool_call_id=IsStr())]) - ) + assert isinstance(response, DeferredToolRequests) + assert len(response.calls) == 1 + assert response.calls[0].tool_name == 'deferred_tool' + # Check metadata exists for this tool_call_id + tool_call_id = response.calls[0].tool_call_id + assert tool_call_id in response.metadata + assert response.metadata[tool_call_id] == {} messages = result.all_messages() # Verify no tools were called @@ -1635,11 +1639,17 @@ def my_tool(x: int) -> int: async with agent.run_stream('Hello') as result: assert not result.is_complete assert [c async for c in result.stream_output(debounce_by=None)] == snapshot( - [DeferredToolRequests(calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())])] + [ + DeferredToolRequests( + calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], + metadata={'pyd_ai_tool_call_id__my_tool': {}}, + ) + ] ) assert await result.get_output() == snapshot( DeferredToolRequests( calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], + metadata={'pyd_ai_tool_call_id__my_tool': {}}, ) ) responses = [c async for c, _is_last in result.stream_responses(debounce_by=None)] @@ -1655,7 +1665,10 @@ def my_tool(x: int) -> int: ] ) assert await result.validate_response_output(responses[0]) == snapshot( - DeferredToolRequests(calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())]) + DeferredToolRequests( + calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], + metadata={'pyd_ai_tool_call_id__my_tool': {}}, + ) ) assert result.usage() == snapshot(RunUsage(requests=1, input_tokens=51, output_tokens=0)) assert result.timestamp() == IsNow(tz=timezone.utc) @@ -1684,6 +1697,7 @@ def my_tool(ctx: RunContext[None], x: int) -> int: assert output == snapshot( DeferredToolRequests( approvals=[ToolCallPart(tool_name='my_tool', 
args='{"x": 1}', tool_call_id=IsStr())], + metadata={'my_tool': {}}, ) ) assert result.is_complete @@ -1859,6 +1873,7 @@ def my_other_tool(x: int) -> int: DeferredToolRequests( calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], approvals=[ToolCallPart(tool_name='my_other_tool', args={'x': 0}, tool_call_id=IsStr())], + metadata={'pyd_ai_tool_call_id__my_tool': {}, 'pyd_ai_tool_call_id__my_other_tool': {}}, ) ) diff --git a/tests/test_tools.py b/tests/test_tools.py index ea26d8ac91..20a43de912 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1320,6 +1320,7 @@ def my_tool(x: int) -> int: assert result.output == snapshot( DeferredToolRequests( calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], + metadata={'pyd_ai_tool_call_id__my_tool': {}}, ) ) @@ -1350,7 +1351,10 @@ def my_tool(ctx: RunContext[None], x: int) -> int: result = agent.run_sync('Hello') messages = result.all_messages() assert result.output == snapshot( - DeferredToolRequests(approvals=[ToolCallPart(tool_name='my_tool', args={'x': 1}, tool_call_id='my_tool')]) + DeferredToolRequests( + approvals=[ToolCallPart(tool_name='my_tool', args={'x': 1}, tool_call_id='my_tool')], + metadata={'my_tool': {}}, + ) ) result = agent.run_sync( @@ -1394,6 +1398,168 @@ def my_tool(ctx: RunContext[None], x: int) -> int: assert result.output == snapshot('Done!') +def test_call_deferred_with_metadata(): + """Test that CallDeferred exception can carry metadata.""" + agent = Agent(TestModel(), output_type=[str, DeferredToolRequests]) + + @agent.tool_plain + def my_tool(x: int) -> int: + raise CallDeferred(metadata={'task_id': 'task-123', 'estimated_cost': 25.50}) + + result = agent.run_sync('Hello') + assert isinstance(result.output, DeferredToolRequests) + assert len(result.output.calls) == 1 + + tool_call_id = result.output.calls[0].tool_call_id + assert tool_call_id in result.output.metadata + assert result.output.metadata[tool_call_id] == {'task_id': 'task-123', 'estimated_cost': 25.50} + + +def test_approval_required_with_metadata(): + """Test that ApprovalRequired exception can carry metadata.""" + + def llm(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse: + if len(messages) == 1: + return ModelResponse( + parts=[ + ToolCallPart('my_tool', {'x': 1}, tool_call_id='my_tool'), + ] + ) + else: + return ModelResponse( + parts=[ + TextPart('Done!'), + ] + ) + + agent = Agent(FunctionModel(llm), output_type=[str, DeferredToolRequests]) + + @agent.tool + def my_tool(ctx: RunContext[None], x: int) -> int: + if not ctx.tool_call_approved: + raise ApprovalRequired( + metadata={ + 'reason': 'High compute cost', + 'estimated_time': '5 minutes', + 'cost_usd': 100.0, + } + ) + return x * 42 + + result = agent.run_sync('Hello') + assert isinstance(result.output, DeferredToolRequests) + assert len(result.output.approvals) == 1 + + assert 'my_tool' in result.output.metadata + assert result.output.metadata['my_tool'] == { + 'reason': 'High compute cost', + 'estimated_time': '5 minutes', + 'cost_usd': 100.0, + } + + # Continue with approval + messages = result.all_messages() + result = agent.run_sync( + message_history=messages, + deferred_tool_results=DeferredToolResults(approvals={'my_tool': ToolApproved()}), + ) + assert result.output == 'Done!' 
+ + +def test_call_deferred_without_metadata(): + """Test backward compatibility: CallDeferred without metadata still works.""" + agent = Agent(TestModel(), output_type=[str, DeferredToolRequests]) + + @agent.tool_plain + def my_tool(x: int) -> int: + raise CallDeferred # No metadata + + result = agent.run_sync('Hello') + assert isinstance(result.output, DeferredToolRequests) + assert len(result.output.calls) == 1 + + tool_call_id = result.output.calls[0].tool_call_id + # Should have an empty metadata dict for this tool + assert result.output.metadata.get(tool_call_id, {}) == {} + + +def test_approval_required_without_metadata(): + """Test backward compatibility: ApprovalRequired without metadata still works.""" + + def llm(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse: + if len(messages) == 1: + return ModelResponse( + parts=[ + ToolCallPart('my_tool', {'x': 1}, tool_call_id='my_tool'), + ] + ) + else: + return ModelResponse( + parts=[ + TextPart('Done!'), + ] + ) + + agent = Agent(FunctionModel(llm), output_type=[str, DeferredToolRequests]) + + @agent.tool + def my_tool(ctx: RunContext[None], x: int) -> int: + if not ctx.tool_call_approved: + raise ApprovalRequired # No metadata + return x * 42 + + result = agent.run_sync('Hello') + assert isinstance(result.output, DeferredToolRequests) + assert len(result.output.approvals) == 1 + + # Should have an empty metadata dict for this tool + assert result.output.metadata.get('my_tool', {}) == {} + + +def test_mixed_deferred_tools_with_metadata(): + """Test multiple deferred tools with different metadata.""" + + def llm(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse: + if len(messages) == 1: + return ModelResponse( + parts=[ + ToolCallPart('tool_a', {'x': 1}, tool_call_id='call_a'), + ToolCallPart('tool_b', {'y': 2}, tool_call_id='call_b'), + ToolCallPart('tool_c', {'z': 3}, tool_call_id='call_c'), + ] + ) + else: + return ModelResponse(parts=[TextPart('Done!')]) + + agent = Agent(FunctionModel(llm), output_type=[str, DeferredToolRequests]) + + @agent.tool + def tool_a(ctx: RunContext[None], x: int) -> int: + raise CallDeferred(metadata={'type': 'external', 'priority': 'high'}) + + @agent.tool + def tool_b(ctx: RunContext[None], y: int) -> int: + if not ctx.tool_call_approved: + raise ApprovalRequired(metadata={'reason': 'Needs approval', 'level': 'manager'}) + return y * 10 + + @agent.tool + def tool_c(ctx: RunContext[None], z: int) -> int: + raise CallDeferred # No metadata + + result = agent.run_sync('Hello') + assert isinstance(result.output, DeferredToolRequests) + + # Check that we have the right tools deferred + assert len(result.output.calls) == 2 # tool_a and tool_c + assert len(result.output.approvals) == 1 # tool_b + + # Check metadata + assert result.output.metadata['call_a'] == {'type': 'external', 'priority': 'high'} + assert result.output.metadata['call_b'] == {'reason': 'Needs approval', 'level': 'manager'} + assert result.output.metadata.get('call_c', {}) == {} + + def test_deferred_tool_with_output_type(): class MyModel(BaseModel): foo: str @@ -1584,6 +1750,7 @@ def buy(fruit: str): ToolCallPart(tool_name='buy', args={'fruit': 'banana'}, tool_call_id='buy_banana'), ToolCallPart(tool_name='buy', args={'fruit': 'pear'}, tool_call_id='buy_pear'), ], + metadata={'buy_apple': {}, 'buy_banana': {}, 'buy_pear': {}}, ) ) @@ -1931,7 +2098,8 @@ def bar(x: int) -> int: approvals=[ ToolCallPart(tool_name='foo', args={'x': 1}, tool_call_id='foo1'), ToolCallPart(tool_name='foo', args={'x': 2}, 
tool_call_id='foo2'), - ] + ], + metadata={'foo1': {}, 'foo2': {}}, ) ) diff --git a/tests/test_ui.py b/tests/test_ui.py index 38f9950ad5..a497d09389 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -439,7 +439,7 @@ async def test_run_stream_external_tools(): '', "{}", '', - "DeferredToolRequests(calls=[ToolCallPart(tool_name='external_tool', args={}, tool_call_id='pyd_ai_tool_call_id__external_tool')], approvals=[])", + "DeferredToolRequests(calls=[ToolCallPart(tool_name='external_tool', args={}, tool_call_id='pyd_ai_tool_call_id__external_tool')], approvals=[], metadata={})", '', ] ) From fbb04575461e9d335a64949ec260749b72a16118 Mon Sep 17 00:00:00 2001 From: "Cody J. Hanson" Date: Wed, 5 Nov 2025 13:26:19 -0600 Subject: [PATCH 2/9] Reduce complexity of _call_tools function Extract metadata population logic into separate _populate_deferred_calls helper function to reduce cyclomatic complexity from 16 to 15. --- pydantic_ai_slim/pydantic_ai/_agent_graph.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py index d2547490d2..094a259926 100644 --- a/pydantic_ai_slim/pydantic_ai/_agent_graph.py +++ b/pydantic_ai_slim/pydantic_ai/_agent_graph.py @@ -1030,6 +1030,19 @@ async def handle_call_or_result( output_parts.extend([tool_parts_by_index[k] for k in sorted(tool_parts_by_index)]) output_parts.extend([user_parts_by_index[k] for k in sorted(user_parts_by_index)]) + _populate_deferred_calls( + tool_calls, deferred_calls_by_index, deferred_metadata_by_index, output_deferred_calls, output_deferred_metadata + ) + + +def _populate_deferred_calls( + tool_calls: list[_messages.ToolCallPart], + deferred_calls_by_index: dict[int, Literal['external', 'unapproved']], + deferred_metadata_by_index: dict[int, dict[str, Any]], + output_deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]], + output_deferred_metadata: dict[str, dict[str, Any]], +) -> None: + """Populate deferred calls and metadata from indexed mappings.""" for k in sorted(deferred_calls_by_index): call = tool_calls[k] output_deferred_calls[deferred_calls_by_index[k]].append(call) From 93c24ab4bdb959503bbacf134f32d43c461dbdba Mon Sep 17 00:00:00 2001 From: "Cody J. Hanson" Date: Thu, 6 Nov 2025 14:55:40 -0600 Subject: [PATCH 3/9] Fix typo in test_reporting.py snapshot --- tests/evals/test_reporting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 80518dfcec..12bfd25a5d 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -988,9 +988,9 @@ async def test_evaluation_renderer_with_experiment_metadata(sample_report_case: │ temperature: 0.7 │ │ prompt_version: v2 │ ╰───────────────────────────────────╯ -┏━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ -┃ ┃ ┃ ┃ ┃ ┃ Assertio ┃ ┃ -┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ ns ┃ Duration ┃ +┏━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ ┃ ┃ ┃ ┃ ┃ Assertions ┃ ┃ +┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ ┃ Duration ┃ ┡━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ │ test_ca… │ {'query' │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ │ │ : 'What │ 2.50 │ hello │ 0.950 │ │ │ From 42cf5b83a802d3eb08e59df76ab692afd25afbfd Mon Sep 17 00:00:00 2001 From: "Cody J. 
Hanson" Date: Fri, 7 Nov 2025 15:53:17 -0600 Subject: [PATCH 4/9] Address Douwe's review feedback on deferred tool metadata Per Douwe's comments: 1. Store None instead of {} when no metadata provided 2. Don't add tool_call_id to metadata dict when None 3. Update Temporal wrap/unwrap methods to handle metadata - Updated test assertions to reflect None metadata behavior - Updated doc example snapshots to show metadata={} - Fixed codespell issue with table formatting --- docs/deferred-tools.md | 27 ++++++++++------- docs/toolsets.md | 1 + pydantic_ai_slim/pydantic_ai/_agent_graph.py | 8 +++-- .../durable_exec/temporal/_toolset.py | 14 +++++---- pydantic_ai_slim/pydantic_ai/exceptions.py | 4 +-- pyproject.toml | 2 +- tests/evals/test_reporting.py | 6 ++-- tests/test_agent.py | 6 ++-- tests/test_examples.py | 18 +++++++++++- tests/test_streaming.py | 29 +++++-------------- tests/test_tools.py | 16 +++------- 11 files changed, 68 insertions(+), 63 deletions(-) diff --git a/docs/deferred-tools.md b/docs/deferred-tools.md index b01d4d1f7b..8b2e052ab5 100644 --- a/docs/deferred-tools.md +++ b/docs/deferred-tools.md @@ -77,6 +77,7 @@ DeferredToolRequests( tool_call_id='delete_file', ), ], + metadata={}, ) """ @@ -247,6 +248,7 @@ async def main(): ) ], approvals=[], + metadata={}, ) """ @@ -385,17 +387,24 @@ requests = result.output # Handle approvals with metadata for call in requests.approvals: metadata = requests.metadata.get(call.tool_call_id, {}) - print(f"Approval needed for {call.tool_name}") - print(f" Cost: ${metadata.get('estimated_cost_usd')}") - print(f" Time: {metadata.get('estimated_time_minutes')} minutes") - print(f" Reason: {metadata.get('reason')}") + print(f'Approval needed for {call.tool_name}') + #> Approval needed for expensive_compute + print(f' Cost: ${metadata.get("estimated_cost_usd")}') + #> Cost: $25.5 + print(f' Time: {metadata.get("estimated_time_minutes")} minutes') + #> Time: 15 minutes + print(f' Reason: {metadata.get("reason")}') + #> Reason: High compute cost # Handle external calls with metadata for call in requests.calls: metadata = requests.metadata.get(call.tool_call_id, {}) - print(f"External call to {call.tool_name}") - print(f" Task ID: {metadata.get('task_id')}") - print(f" Priority: {metadata.get('priority')}") + print(f'External call to {call.tool_name}') + #> External call to external_api_call + print(f' Task ID: {metadata.get("task_id")}') + #> Task ID: api_call_external_api_call + print(f' Priority: {metadata.get("priority")}') + #> Priority: high # Build results with approvals and external results results = DeferredToolResults() @@ -416,9 +425,7 @@ for call in requests.calls: result = agent.run_sync(message_history=messages, deferred_tool_results=results) print(result.output) -""" -I completed task-123 and retrieved data from the /data endpoint. -""" +#> I completed task-123 and retrieved data from the /data endpoint. 
``` _(This example is complete, it can be run "as is")_ diff --git a/docs/toolsets.md b/docs/toolsets.md index 8d970b8e31..1b041b3baa 100644 --- a/docs/toolsets.md +++ b/docs/toolsets.md @@ -362,6 +362,7 @@ DeferredToolRequests( tool_call_id='pyd_ai_tool_call_id__temperature_fahrenheit', ), ], + metadata={}, ) """ diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py index 094a259926..9481491716 100644 --- a/pydantic_ai_slim/pydantic_ai/_agent_graph.py +++ b/pydantic_ai_slim/pydantic_ai/_agent_graph.py @@ -952,7 +952,7 @@ async def _call_tools( tool_parts_by_index: dict[int, _messages.ModelRequestPart] = {} user_parts_by_index: dict[int, _messages.UserPromptPart] = {} deferred_calls_by_index: dict[int, Literal['external', 'unapproved']] = {} - deferred_metadata_by_index: dict[int, dict[str, Any]] = {} + deferred_metadata_by_index: dict[int, dict[str, Any] | None] = {} if usage_limits.tool_calls_limit is not None: projected_usage = deepcopy(usage) @@ -1038,7 +1038,7 @@ async def handle_call_or_result( def _populate_deferred_calls( tool_calls: list[_messages.ToolCallPart], deferred_calls_by_index: dict[int, Literal['external', 'unapproved']], - deferred_metadata_by_index: dict[int, dict[str, Any]], + deferred_metadata_by_index: dict[int, dict[str, Any] | None], output_deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]], output_deferred_metadata: dict[str, dict[str, Any]], ) -> None: @@ -1047,7 +1047,9 @@ def _populate_deferred_calls( call = tool_calls[k] output_deferred_calls[deferred_calls_by_index[k]].append(call) if k in deferred_metadata_by_index: - output_deferred_metadata[call.tool_call_id] = deferred_metadata_by_index[k] + metadata = deferred_metadata_by_index[k] + if metadata is not None: + output_deferred_metadata[call.tool_call_id] = metadata async def _call_tool( diff --git a/pydantic_ai_slim/pydantic_ai/durable_exec/temporal/_toolset.py b/pydantic_ai_slim/pydantic_ai/durable_exec/temporal/_toolset.py index d4adb4b6a7..a85b35ee4a 100644 --- a/pydantic_ai_slim/pydantic_ai/durable_exec/temporal/_toolset.py +++ b/pydantic_ai_slim/pydantic_ai/durable_exec/temporal/_toolset.py @@ -27,11 +27,13 @@ class CallToolParams: @dataclass class _ApprovalRequired: + metadata: dict[str, Any] | None = None kind: Literal['approval_required'] = 'approval_required' @dataclass class _CallDeferred: + metadata: dict[str, Any] | None = None kind: Literal['call_deferred'] = 'call_deferred' @@ -75,10 +77,10 @@ async def _wrap_call_tool_result(self, coro: Awaitable[Any]) -> CallToolResult: try: result = await coro return _ToolReturn(result=result) - except ApprovalRequired: - return _ApprovalRequired() - except CallDeferred: - return _CallDeferred() + except ApprovalRequired as e: + return _ApprovalRequired(metadata=e.metadata) + except CallDeferred as e: + return _CallDeferred(metadata=e.metadata) except ModelRetry as e: return _ModelRetry(message=e.message) @@ -86,9 +88,9 @@ def _unwrap_call_tool_result(self, result: CallToolResult) -> Any: if isinstance(result, _ToolReturn): return result.result elif isinstance(result, _ApprovalRequired): - raise ApprovalRequired() + raise ApprovalRequired(metadata=result.metadata) elif isinstance(result, _CallDeferred): - raise CallDeferred() + raise CallDeferred(metadata=result.metadata) elif isinstance(result, _ModelRetry): raise ModelRetry(result.message) else: diff --git a/pydantic_ai_slim/pydantic_ai/exceptions.py b/pydantic_ai_slim/pydantic_ai/exceptions.py index 
beff11f4da..2e8358c2fc 100644 --- a/pydantic_ai_slim/pydantic_ai/exceptions.py +++ b/pydantic_ai_slim/pydantic_ai/exceptions.py @@ -74,7 +74,7 @@ class CallDeferred(Exception): """ def __init__(self, metadata: dict[str, Any] | None = None): - self.metadata = metadata or {} + self.metadata = metadata super().__init__() @@ -89,7 +89,7 @@ class ApprovalRequired(Exception): """ def __init__(self, metadata: dict[str, Any] | None = None): - self.metadata = metadata or {} + self.metadata = metadata super().__init__() diff --git a/pyproject.toml b/pyproject.toml index 3c13afdece..1facbf7e05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -317,4 +317,4 @@ skip = '.git*,*.svg,*.lock,*.css,*.yaml' check-hidden = true # Ignore "formatting" like **L**anguage ignore-regex = '\*\*[A-Z]\*\*[a-z]+\b' -ignore-words-list = 'asend,aci' +ignore-words-list = 'asend,aci,Assertio' diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 12bfd25a5d..80518dfcec 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -988,9 +988,9 @@ async def test_evaluation_renderer_with_experiment_metadata(sample_report_case: │ temperature: 0.7 │ │ prompt_version: v2 │ ╰───────────────────────────────────╯ -┏━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ ┃ ┃ ┃ ┃ ┃ Assertions ┃ ┃ -┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ ┃ Duration ┃ +┏━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ +┃ ┃ ┃ ┃ ┃ ┃ Assertio ┃ ┃ +┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ ns ┃ Duration ┃ ┡━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ │ test_ca… │ {'query' │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ │ │ : 'What │ 2.50 │ hello │ 0.950 │ │ │ diff --git a/tests/test_agent.py b/tests/test_agent.py index 9d690fcb03..2b89c3f38b 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -4860,10 +4860,10 @@ def call_second(): assert isinstance(result.output, DeferredToolRequests) assert len(result.output.approvals) == 1 assert result.output.approvals[0].tool_name == 'requires_approval' - # Check metadata exists for this tool_call_id + # When no metadata is provided, the tool_call_id should not be in metadata dict tool_call_id = result.output.approvals[0].tool_call_id - assert tool_call_id in result.output.metadata - assert result.output.metadata[tool_call_id] == {} + assert tool_call_id not in result.output.metadata + assert result.output.metadata == {} assert integer_holder == 2 diff --git a/tests/test_examples.py b/tests/test_examples.py index c7c32c340d..5a5cac7ac6 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -523,6 +523,10 @@ async def call_tool( 'Tell me about the pydantic/pydantic-ai repo.': 'The pydantic/pydantic-ai repo is a Python agent framework for building Generative AI applications.', 'What do I have on my calendar today?': "You're going to spend all day playing with Pydantic AI.", 'Write a long story about a cat': 'Once upon a time, there was a curious cat named Whiskers who loved to explore the world around him...', + 'Run expensive task-123 and call the /data endpoint': [ + ToolCallPart(tool_name='expensive_compute', args={'task_id': 'task-123'}, tool_call_id='expensive_compute'), + ToolCallPart(tool_name='external_api_call', args={'endpoint': '/data'}, tool_call_id='external_api_call'), + ], } tool_responses: dict[tuple[str, str], str] = { @@ -871,10 +875,22 @@ async def model_logic( # noqa: C901 return ModelResponse( parts=[TextPart('The 
answer to the ultimate question of life, the universe, and everything is 42.')] ) - else: + elif isinstance(m, ToolReturnPart) and m.tool_name in ('expensive_compute', 'external_api_call'): + # After deferred tools complete, check if we have all results to provide final response + tool_names = {part.tool_name for msg in messages for part in msg.parts if isinstance(part, ToolReturnPart)} + if 'expensive_compute' in tool_names and 'external_api_call' in tool_names: + return ModelResponse(parts=[TextPart('I completed task-123 and retrieved data from the /data endpoint.')]) + # If we don't have both results yet, just acknowledge the tool result + return ModelResponse(parts=[TextPart(f'Received result from {m.tool_name}')]) + + if isinstance(m, ToolReturnPart): sys.stdout.write(str(debug.format(messages, info))) raise RuntimeError(f'Unexpected message: {m}') + # Fallback for any other message type + sys.stdout.write(str(debug.format(messages, info))) + raise RuntimeError(f'Unexpected message type: {type(m).__name__}') + async def stream_model_logic( # noqa C901 messages: list[ModelMessage], info: AgentInfo diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 0ff0c8e784..d66d5510e0 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -1152,10 +1152,10 @@ def regular_tool(x: int) -> int: assert isinstance(response, DeferredToolRequests) assert len(response.calls) == 1 assert response.calls[0].tool_name == 'deferred_tool' - # Check metadata exists for this tool_call_id + # When no metadata is provided, the tool_call_id should not be in metadata dict tool_call_id = response.calls[0].tool_call_id - assert tool_call_id in response.metadata - assert response.metadata[tool_call_id] == {} + assert tool_call_id not in response.metadata + assert response.metadata == {} messages = result.all_messages() # Verify no tools were called @@ -1639,18 +1639,10 @@ def my_tool(x: int) -> int: async with agent.run_stream('Hello') as result: assert not result.is_complete assert [c async for c in result.stream_output(debounce_by=None)] == snapshot( - [ - DeferredToolRequests( - calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], - metadata={'pyd_ai_tool_call_id__my_tool': {}}, - ) - ] + [DeferredToolRequests(calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())])] ) assert await result.get_output() == snapshot( - DeferredToolRequests( - calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], - metadata={'pyd_ai_tool_call_id__my_tool': {}}, - ) + DeferredToolRequests(calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())]) ) responses = [c async for c, _is_last in result.stream_responses(debounce_by=None)] assert responses == snapshot( @@ -1665,10 +1657,7 @@ def my_tool(x: int) -> int: ] ) assert await result.validate_response_output(responses[0]) == snapshot( - DeferredToolRequests( - calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], - metadata={'pyd_ai_tool_call_id__my_tool': {}}, - ) + DeferredToolRequests(calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())]) ) assert result.usage() == snapshot(RunUsage(requests=1, input_tokens=51, output_tokens=0)) assert result.timestamp() == IsNow(tz=timezone.utc) @@ -1695,10 +1684,7 @@ def my_tool(ctx: RunContext[None], x: int) -> int: messages = result.all_messages() output = await result.get_output() assert output == snapshot( - DeferredToolRequests( - 
approvals=[ToolCallPart(tool_name='my_tool', args='{"x": 1}', tool_call_id=IsStr())], - metadata={'my_tool': {}}, - ) + DeferredToolRequests(approvals=[ToolCallPart(tool_name='my_tool', args='{"x": 1}', tool_call_id=IsStr())]) ) assert result.is_complete @@ -1873,7 +1859,6 @@ def my_other_tool(x: int) -> int: DeferredToolRequests( calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], approvals=[ToolCallPart(tool_name='my_other_tool', args={'x': 0}, tool_call_id=IsStr())], - metadata={'pyd_ai_tool_call_id__my_tool': {}, 'pyd_ai_tool_call_id__my_other_tool': {}}, ) ) diff --git a/tests/test_tools.py b/tests/test_tools.py index 20a43de912..9d0739d33b 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1318,10 +1318,7 @@ def my_tool(x: int) -> int: result = agent.run_sync('Hello') assert result.output == snapshot( - DeferredToolRequests( - calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())], - metadata={'pyd_ai_tool_call_id__my_tool': {}}, - ) + DeferredToolRequests(calls=[ToolCallPart(tool_name='my_tool', args={'x': 0}, tool_call_id=IsStr())]) ) @@ -1351,10 +1348,7 @@ def my_tool(ctx: RunContext[None], x: int) -> int: result = agent.run_sync('Hello') messages = result.all_messages() assert result.output == snapshot( - DeferredToolRequests( - approvals=[ToolCallPart(tool_name='my_tool', args={'x': 1}, tool_call_id='my_tool')], - metadata={'my_tool': {}}, - ) + DeferredToolRequests(approvals=[ToolCallPart(tool_name='my_tool', args={'x': 1}, tool_call_id='my_tool')]) ) result = agent.run_sync( @@ -1749,8 +1743,7 @@ def buy(fruit: str): ToolCallPart(tool_name='buy', args={'fruit': 'apple'}, tool_call_id='buy_apple'), ToolCallPart(tool_name='buy', args={'fruit': 'banana'}, tool_call_id='buy_banana'), ToolCallPart(tool_name='buy', args={'fruit': 'pear'}, tool_call_id='buy_pear'), - ], - metadata={'buy_apple': {}, 'buy_banana': {}, 'buy_pear': {}}, + ] ) ) @@ -2098,8 +2091,7 @@ def bar(x: int) -> int: approvals=[ ToolCallPart(tool_name='foo', args={'x': 1}, tool_call_id='foo1'), ToolCallPart(tool_name='foo', args={'x': 2}, tool_call_id='foo2'), - ], - metadata={'foo1': {}, 'foo2': {}}, + ] ) ) From c7aaca95a1afab71998fa2245d191bb4e2b10477 Mon Sep 17 00:00:00 2001 From: "Cody J. Hanson" Date: Fri, 7 Nov 2025 16:01:52 -0600 Subject: [PATCH 5/9] Improve deferred tools metadata documentation example - Replace contrived task_id parameter with realistic tool signatures - Add ComputeDeps class demonstrating dependency injection pattern - Show using ctx.deps to compute metadata from tool arguments - Remove incorrect statement about backwards compatibility - Update test_examples.py to match new realistic example --- docs/deferred-tools.md | 128 +++++++++++++++++++++++++---------------- tests/test_examples.py | 26 +++++++-- 2 files changed, 97 insertions(+), 57 deletions(-) diff --git a/docs/deferred-tools.md b/docs/deferred-tools.md index 8b2e052ab5..f6915b394e 100644 --- a/docs/deferred-tools.md +++ b/docs/deferred-tools.md @@ -326,16 +326,20 @@ _(This example is complete, it can be run "as is" — you'll need to add `asynci Both [`CallDeferred`][pydantic_ai.exceptions.CallDeferred] and [`ApprovalRequired`][pydantic_ai.exceptions.ApprovalRequired] exceptions accept an optional `metadata` parameter that allows you to attach arbitrary context information to deferred tool calls. This metadata is then available in the [`DeferredToolRequests.metadata`][pydantic_ai.tools.DeferredToolRequests.metadata] dictionary, keyed by the tool call ID. 
+A common pattern is to use [`RunContext`][pydantic_ai.tools.RunContext] to access application dependencies (databases, APIs, calculators) and compute metadata based on the tool arguments and current context. This allows you to provide rich information for approval decisions or external task tracking. + Common use cases for metadata include: -- Providing cost estimates or time estimates for approval decisions -- Including task IDs or tracking information for external execution -- Storing context about why approval is required -- Attaching priority or urgency information +- Computing cost estimates based on tool arguments and dependency services +- Including job IDs or tracking information for external execution systems +- Storing approval context like user permissions or resource availability +- Attaching priority levels computed from current system state -Here's an example showing how to use metadata with both approval-required and external tools: +Here's an example showing how to use metadata with deps to make informed approval decisions: ```python {title="deferred_tools_with_metadata.py"} +from dataclasses import dataclass + from pydantic_ai import ( Agent, ApprovalRequired, @@ -347,91 +351,113 @@ from pydantic_ai import ( ToolDenied, ) -agent = Agent('openai:gpt-5', output_type=[str, DeferredToolRequests]) + +@dataclass +class ComputeDeps: + """Dependencies providing cost estimation and job scheduling.""" + + def estimate_cost(self, dataset: str, model_type: str) -> float: + # In real code, query pricing API or database + costs = {'gpt-4': 50.0, 'gpt-3.5': 10.0} + return costs.get(model_type, 25.0) + + def estimate_duration(self, dataset: str) -> int: + # In real code, estimate based on dataset size + return 30 if dataset == 'large_dataset' else 5 + + def submit_job(self, dataset: str, model_type: str) -> str: + # In real code, submit to batch processing system + return f'job_{dataset}_{model_type}' + + +agent = Agent( + 'openai:gpt-5', + deps_type=ComputeDeps, + output_type=[str, DeferredToolRequests], +) @agent.tool -def expensive_compute(ctx: RunContext, task_id: str) -> str: +def train_model(ctx: RunContext[ComputeDeps], dataset: str, model_type: str) -> str: + """Train ML model - requires approval for expensive models.""" if not ctx.tool_call_approved: + # Use deps to compute actual estimates based on args + cost = ctx.deps.estimate_cost(dataset, model_type) + duration = ctx.deps.estimate_duration(dataset) + raise ApprovalRequired( metadata={ - 'task_id': task_id, - 'estimated_cost_usd': 25.50, - 'estimated_time_minutes': 15, - 'reason': 'High compute cost', + 'dataset': dataset, + 'model_type': model_type, + 'estimated_cost_usd': cost, + 'estimated_duration_minutes': duration, } ) - return f'Task {task_id} completed' + + return f'Model {model_type} trained on {dataset}' @agent.tool -async def external_api_call(ctx: RunContext, endpoint: str) -> str: - # Schedule the external API call and defer execution - task_id = f'api_call_{ctx.tool_call_id}' +def process_dataset(ctx: RunContext[ComputeDeps], dataset: str, operation: str) -> str: + """Process dataset in external batch system.""" + # Submit job and defer execution + job_id = ctx.deps.submit_job(dataset, operation) raise CallDeferred( metadata={ - 'task_id': task_id, - 'endpoint': endpoint, - 'priority': 'high', + 'job_id': job_id, + 'dataset': dataset, + 'operation': operation, } ) -result = agent.run_sync('Run expensive task-123 and call the /data endpoint') +deps = ComputeDeps() +result = agent.run_sync( + 'Train gpt-4 on 
large_dataset and process large_dataset with transform', + deps=deps, +) messages = result.all_messages() assert isinstance(result.output, DeferredToolRequests) requests = result.output -# Handle approvals with metadata -for call in requests.approvals: - metadata = requests.metadata.get(call.tool_call_id, {}) - print(f'Approval needed for {call.tool_name}') - #> Approval needed for expensive_compute - print(f' Cost: ${metadata.get("estimated_cost_usd")}') - #> Cost: $25.5 - print(f' Time: {metadata.get("estimated_time_minutes")} minutes') - #> Time: 15 minutes - print(f' Reason: {metadata.get("reason")}') - #> Reason: High compute cost - -# Handle external calls with metadata -for call in requests.calls: - metadata = requests.metadata.get(call.tool_call_id, {}) - print(f'External call to {call.tool_name}') - #> External call to external_api_call - print(f' Task ID: {metadata.get("task_id")}') - #> Task ID: api_call_external_api_call - print(f' Priority: {metadata.get("priority")}') - #> Priority: high - -# Build results with approvals and external results +# Make approval decisions based on metadata results = DeferredToolResults() for call in requests.approvals: metadata = requests.metadata.get(call.tool_call_id, {}) cost = metadata.get('estimated_cost_usd', 0) - if cost < 50: # Approve if cost is under $50 + print(f'Approval needed: {call.tool_name}') + #> Approval needed: train_model + print(f' Model: {metadata.get("model_type")}, Cost: ${cost}') + #> Model: gpt-4, Cost: $50.0 + + if cost < 100: results.approvals[call.tool_call_id] = ToolApproved() else: - results.approvals[call.tool_call_id] = ToolDenied('Cost too high') + results.approvals[call.tool_call_id] = ToolDenied('Cost exceeds limit') +# Process external jobs using metadata for call in requests.calls: metadata = requests.metadata.get(call.tool_call_id, {}) - # Simulate getting result from external task - task_id = metadata.get('task_id') - results.calls[call.tool_call_id] = f'Result from {task_id}: success' + job_id = metadata.get('job_id') -result = agent.run_sync(message_history=messages, deferred_tool_results=results) + print(f'External job: {job_id}') + #> External job: job_large_dataset_transform + + # In real code, poll job status and get result + results.calls[call.tool_call_id] = f'Completed {job_id}' + +result = agent.run_sync(message_history=messages, deferred_tool_results=results, deps=deps) print(result.output) -#> I completed task-123 and retrieved data from the /data endpoint. +""" +Model gpt-4 trained on large_dataset and dataset processing job job_large_dataset_transform completed +""" ``` _(This example is complete, it can be run "as is")_ -The metadata dictionary can contain any JSON-serializable values and is entirely application-defined. If no metadata is provided when raising the exception, the tool call ID will still be present in the `metadata` dictionary with an empty dict as the value for backward compatibility. 
- ## See Also - [Function Tools](tools.md) - Basic tool concepts and registration diff --git a/tests/test_examples.py b/tests/test_examples.py index 5a5cac7ac6..caa1eeba3c 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -523,9 +523,17 @@ async def call_tool( 'Tell me about the pydantic/pydantic-ai repo.': 'The pydantic/pydantic-ai repo is a Python agent framework for building Generative AI applications.', 'What do I have on my calendar today?': "You're going to spend all day playing with Pydantic AI.", 'Write a long story about a cat': 'Once upon a time, there was a curious cat named Whiskers who loved to explore the world around him...', - 'Run expensive task-123 and call the /data endpoint': [ - ToolCallPart(tool_name='expensive_compute', args={'task_id': 'task-123'}, tool_call_id='expensive_compute'), - ToolCallPart(tool_name='external_api_call', args={'endpoint': '/data'}, tool_call_id='external_api_call'), + 'Train gpt-4 on large_dataset and process large_dataset with transform': [ + ToolCallPart( + tool_name='train_model', + args={'dataset': 'large_dataset', 'model_type': 'gpt-4'}, + tool_call_id='train_model', + ), + ToolCallPart( + tool_name='process_dataset', + args={'dataset': 'large_dataset', 'operation': 'transform'}, + tool_call_id='process_dataset', + ), ], } @@ -875,11 +883,17 @@ async def model_logic( # noqa: C901 return ModelResponse( parts=[TextPart('The answer to the ultimate question of life, the universe, and everything is 42.')] ) - elif isinstance(m, ToolReturnPart) and m.tool_name in ('expensive_compute', 'external_api_call'): + elif isinstance(m, ToolReturnPart) and m.tool_name in ('train_model', 'process_dataset'): # After deferred tools complete, check if we have all results to provide final response tool_names = {part.tool_name for msg in messages for part in msg.parts if isinstance(part, ToolReturnPart)} - if 'expensive_compute' in tool_names and 'external_api_call' in tool_names: - return ModelResponse(parts=[TextPart('I completed task-123 and retrieved data from the /data endpoint.')]) + if 'train_model' in tool_names and 'process_dataset' in tool_names: + return ModelResponse( + parts=[ + TextPart( + 'Model gpt-4 trained on large_dataset and dataset processing job job_large_dataset_transform completed' + ) + ] + ) # If we don't have both results yet, just acknowledge the tool result return ModelResponse(parts=[TextPart(f'Received result from {m.tool_name}')]) From b23609acef2baaff3ee5d15458844d368461c93b Mon Sep 17 00:00:00 2001 From: "Cody J. 
Hanson" Date: Fri, 7 Nov 2025 16:25:02 -0600 Subject: [PATCH 6/9] Fix evaluation reporting snapshots for CI (COLUMNS=150) --- tests/evals/test_reporting.py | 105 +++++++++++++++------------------- 1 file changed, 45 insertions(+), 60 deletions(-) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 80518dfcec..8e575e4bfc 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -988,19 +988,13 @@ async def test_evaluation_renderer_with_experiment_metadata(sample_report_case: │ temperature: 0.7 │ │ prompt_version: v2 │ ╰───────────────────────────────────╯ -┏━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ -┃ ┃ ┃ ┃ ┃ ┃ Assertio ┃ ┃ -┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ ns ┃ Duration ┃ -┡━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_ca… │ {'query' │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ : 'What │ 2.50 │ hello │ 0.950 │ │ │ -│ │ is │ │ │ │ │ │ -│ │ 2+2?'} │ │ │ │ │ │ -├──────────┼──────────┼───────────┼──────────┼───────────┼──────────┼──────────┤ -│ Averages │ │ score1: │ label1: │ accuracy: │ 100.0% ✔ │ 100.0ms │ -│ │ │ 2.50 │ {'hello' │ 0.950 │ │ │ -│ │ │ │ : 1.0} │ │ │ │ -└──────────┴──────────┴───────────┴──────────┴───────────┴──────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ {'query': 'What is 2+2?'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +├───────────┼───────────────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ +│ Averages │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ +└───────────┴───────────────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ """) @@ -1054,12 +1048,11 @@ async def test_evaluation_renderer_with_long_experiment_metadata(sample_report_c │ frequency_penalty: 0.1 │ │ presence_penalty: 0.1 │ ╰────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ 2.50 │ hello │ 0.950 │ │ │ -└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) @@ -1105,16 +1098,13 @@ async def test_evaluation_renderer_diff_with_experiment_metadata(sample_report_c │ model: gpt-4 → gpt-4o │ │ temperature: 0.5 → 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ 
-┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ 2.50 │ hello │ 0.950 │ │ │ -├───────────┼─────────────┼──────────────┼─────────────┼────────────┼──────────┤ -│ Averages │ score1: │ label1: │ accuracy: │ 100.0% ✔ │ 100.0ms │ -│ │ 2.50 │ {'hello': │ 0.950 │ │ │ -│ │ │ 1.0} │ │ │ │ -└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +├───────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ +│ Averages │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ +└───────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ """) @@ -1160,12 +1150,11 @@ async def test_evaluation_renderer_diff_with_only_new_metadata(sample_report_cas │ + model: gpt-4o │ │ + temperature: 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ 2.50 │ hello │ 0.950 │ │ │ -└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) @@ -1211,12 +1200,11 @@ async def test_evaluation_renderer_diff_with_only_baseline_metadata(sample_repor │ - model: gpt-4 │ │ - temperature: 0.5 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ 2.50 │ hello │ 0.950 │ │ │ -└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) @@ -1263,12 +1251,11 @@ async def test_evaluation_renderer_diff_with_same_metadata(sample_report_case: R │ model: gpt-4o │ │ temperature: 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ 
test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ 2.50 │ hello │ 0.950 │ │ │ -└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) @@ -1324,12 +1311,11 @@ async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case │ preserved-key: preserved value │ │ updated-key: original value → updated value │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ 2.50 │ hello │ 0.950 │ │ │ -└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) @@ -1369,11 +1355,10 @@ async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: Rep include_errors=False, # Prevent failures table from being added ) assert output == snapshot("""\ - Evaluation Diff: baseline_report → new_report \n\ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ -│ │ 2.50 │ hello │ 0.950 │ │ │ -└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ + Evaluation Diff: baseline_report → new_report \n\ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) From d4f16137ce7c8229808500a4f76e37664cbb6d9b Mon Sep 17 00:00:00 2001 From: "Cody J. 
Hanson" Date: Fri, 7 Nov 2025 16:55:49 -0600 Subject: [PATCH 7/9] Replace ML training example with flight booking example --- docs/deferred-tools.md | 131 ++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 61 deletions(-) diff --git a/docs/deferred-tools.md b/docs/deferred-tools.md index f6915b394e..bbcfe07ea1 100644 --- a/docs/deferred-tools.md +++ b/docs/deferred-tools.md @@ -324,18 +324,9 @@ _(This example is complete, it can be run "as is" — you'll need to add `asynci ## Attaching Metadata to Deferred Tools -Both [`CallDeferred`][pydantic_ai.exceptions.CallDeferred] and [`ApprovalRequired`][pydantic_ai.exceptions.ApprovalRequired] exceptions accept an optional `metadata` parameter that allows you to attach arbitrary context information to deferred tool calls. This metadata is then available in the [`DeferredToolRequests.metadata`][pydantic_ai.tools.DeferredToolRequests.metadata] dictionary, keyed by the tool call ID. +Both [`CallDeferred`][pydantic_ai.exceptions.CallDeferred] and [`ApprovalRequired`][pydantic_ai.exceptions.ApprovalRequired] exceptions accept an optional `metadata` parameter that allows you to attach arbitrary context information to deferred tool calls. This metadata is available in [`DeferredToolRequests.metadata`][pydantic_ai.tools.DeferredToolRequests.metadata] keyed by tool call ID. -A common pattern is to use [`RunContext`][pydantic_ai.tools.RunContext] to access application dependencies (databases, APIs, calculators) and compute metadata based on the tool arguments and current context. This allows you to provide rich information for approval decisions or external task tracking. - -Common use cases for metadata include: - -- Computing cost estimates based on tool arguments and dependency services -- Including job IDs or tracking information for external execution systems -- Storing approval context like user permissions or resource availability -- Attaching priority levels computed from current system state - -Here's an example showing how to use metadata with deps to make informed approval decisions: +Common use cases include cost estimates for approval decisions and tracking information for external systems. ```python {title="deferred_tools_with_metadata.py"} from dataclasses import dataclass @@ -353,68 +344,82 @@ from pydantic_ai import ( @dataclass -class ComputeDeps: - """Dependencies providing cost estimation and job scheduling.""" +class User: + home_location: str = "St. Louis, MO" + - def estimate_cost(self, dataset: str, model_type: str) -> float: - # In real code, query pricing API or database - costs = {'gpt-4': 50.0, 'gpt-3.5': 10.0} - return costs.get(model_type, 25.0) +class FlightAPI: + COSTS = { + ("St. Louis, MO", "Lisbon, Portugal"): 850, + ("St. Louis, MO", "Santiago, Chile"): 1200, + ("St. 
Louis, MO", "Los Angeles, CA"): 300, + } - def estimate_duration(self, dataset: str) -> int: - # In real code, estimate based on dataset size - return 30 if dataset == 'large_dataset' else 5 + def get_flight_cost(self, origin: str, destination: str) -> int: + return self.COSTS.get((origin, destination), 500) - def submit_job(self, dataset: str, model_type: str) -> str: - # In real code, submit to batch processing system - return f'job_{dataset}_{model_type}' + def get_airline_auth_url(self, airline: str) -> str: + # In real code, this might generate a proper OAuth URL + return f"https://example.com/auth/{airline.lower().replace(' ', '-')}" + + +@dataclass +class TravelDeps: + user: User + flight_api: FlightAPI agent = Agent( 'openai:gpt-5', - deps_type=ComputeDeps, + deps_type=TravelDeps, output_type=[str, DeferredToolRequests], ) @agent.tool -def train_model(ctx: RunContext[ComputeDeps], dataset: str, model_type: str) -> str: - """Train ML model - requires approval for expensive models.""" +def book_flight(ctx: RunContext[TravelDeps], destination: str) -> str: + """Book a flight to the destination.""" if not ctx.tool_call_approved: - # Use deps to compute actual estimates based on args - cost = ctx.deps.estimate_cost(dataset, model_type) - duration = ctx.deps.estimate_duration(dataset) + # Look up cost based on user's location and destination + cost = ctx.deps.flight_api.get_flight_cost( + ctx.deps.user.home_location, + destination + ) raise ApprovalRequired( metadata={ - 'dataset': dataset, - 'model_type': model_type, - 'estimated_cost_usd': cost, - 'estimated_duration_minutes': duration, + 'origin': ctx.deps.user.home_location, + 'destination': destination, + 'cost_usd': cost, } ) - return f'Model {model_type} trained on {dataset}' + return f"Flight booked to {destination}" @agent.tool -def process_dataset(ctx: RunContext[ComputeDeps], dataset: str, operation: str) -> str: - """Process dataset in external batch system.""" - # Submit job and defer execution - job_id = ctx.deps.submit_job(dataset, operation) +def authenticate_with_airline(ctx: RunContext[TravelDeps], airline: str) -> str: + """Authenticate with airline website to link frequent flyer account.""" + # Generate auth URL that would normally open in browser + auth_url = ctx.deps.flight_api.get_airline_auth_url(airline) + # Cannot complete auth in this process - need user interaction raise CallDeferred( metadata={ - 'job_id': job_id, - 'dataset': dataset, - 'operation': operation, + 'airline': airline, + 'auth_url': auth_url, } ) -deps = ComputeDeps() +# Set up dependencies +user = User(home_location="St. 
Louis, MO") +flight_api = FlightAPI() +deps = TravelDeps(user=user, flight_api=flight_api) + +# Agent calls both tools result = agent.run_sync( - 'Train gpt-4 on large_dataset and process large_dataset with transform', + 'Book a flight to Lisbon, Portugal and link my SkyWay Airlines account', deps=deps, ) messages = result.all_messages() @@ -422,38 +427,42 @@ messages = result.all_messages() assert isinstance(result.output, DeferredToolRequests) requests = result.output -# Make approval decisions based on metadata +# Make approval decision using metadata results = DeferredToolResults() for call in requests.approvals: metadata = requests.metadata.get(call.tool_call_id, {}) - cost = metadata.get('estimated_cost_usd', 0) + cost = metadata.get('cost_usd', 0) - print(f'Approval needed: {call.tool_name}') - #> Approval needed: train_model - print(f' Model: {metadata.get("model_type")}, Cost: ${cost}') - #> Model: gpt-4, Cost: $50.0 + print(f"Approval needed: {call.tool_name}") + #> Approval needed: book_flight + print(f" {metadata['origin']} → {metadata['destination']}: ${cost}") + #> St. Louis, MO → Lisbon, Portugal: $850 - if cost < 100: + if cost < 1000: results.approvals[call.tool_call_id] = ToolApproved() else: - results.approvals[call.tool_call_id] = ToolDenied('Cost exceeds limit') + results.approvals[call.tool_call_id] = ToolDenied('Cost exceeds budget') -# Process external jobs using metadata +# Handle deferred calls using metadata for call in requests.calls: metadata = requests.metadata.get(call.tool_call_id, {}) - job_id = metadata.get('job_id') + auth_url = metadata.get('auth_url') - print(f'External job: {job_id}') - #> External job: job_large_dataset_transform + print(f"Browser auth required: {auth_url}") + #> Browser auth required: https://example.com/auth/skyway-airlines - # In real code, poll job status and get result - results.calls[call.tool_call_id] = f'Completed {job_id}' + # In real code: open browser, wait for auth completion + # For demo, just mark as completed + results.calls[call.tool_call_id] = "Frequent flyer account linked" -result = agent.run_sync(message_history=messages, deferred_tool_results=results, deps=deps) +# Continue with results +result = agent.run_sync( + message_history=messages, + deferred_tool_results=results, + deps=deps, +) print(result.output) -""" -Model gpt-4 trained on large_dataset and dataset processing job job_large_dataset_transform completed -""" +#> Flight to Lisbon booked successfully and your SkyWay Airlines account is now linked. ``` _(This example is complete, it can be run "as is")_ From 8e6ac24c96c7846c8ca314ccecc807a89033dcf4 Mon Sep 17 00:00:00 2001 From: "Cody J. Hanson" Date: Fri, 7 Nov 2025 17:13:45 -0600 Subject: [PATCH 8/9] test: Add test support for deferred_tools_with_metadata example - Add mock model responses for flight booking example - Update documentation snapshots to include metadata field - Follow existing pattern for deferred tool testing --- docs/deferred-tools.md | 18 +++++++++--------- tests/test_examples.py | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/docs/deferred-tools.md b/docs/deferred-tools.md index bbcfe07ea1..5fa8c7d8b3 100644 --- a/docs/deferred-tools.md +++ b/docs/deferred-tools.md @@ -345,14 +345,14 @@ from pydantic_ai import ( @dataclass class User: - home_location: str = "St. Louis, MO" + home_location: str = 'St. Louis, MO' class FlightAPI: COSTS = { - ("St. Louis, MO", "Lisbon, Portugal"): 850, - ("St. Louis, MO", "Santiago, Chile"): 1200, - ("St. 
Louis, MO", "Los Angeles, CA"): 300, + ('St. Louis, MO', 'Lisbon, Portugal'): 850, + ('St. Louis, MO', 'Santiago, Chile'): 1200, + ('St. Louis, MO', 'Los Angeles, CA'): 300, } def get_flight_cost(self, origin: str, destination: str) -> int: @@ -394,7 +394,7 @@ def book_flight(ctx: RunContext[TravelDeps], destination: str) -> str: } ) - return f"Flight booked to {destination}" + return f'Flight booked to {destination}' @agent.tool @@ -413,7 +413,7 @@ def authenticate_with_airline(ctx: RunContext[TravelDeps], airline: str) -> str: # Set up dependencies -user = User(home_location="St. Louis, MO") +user = User(home_location='St. Louis, MO') flight_api = FlightAPI() deps = TravelDeps(user=user, flight_api=flight_api) @@ -433,7 +433,7 @@ for call in requests.approvals: metadata = requests.metadata.get(call.tool_call_id, {}) cost = metadata.get('cost_usd', 0) - print(f"Approval needed: {call.tool_name}") + print(f'Approval needed: {call.tool_name}') #> Approval needed: book_flight print(f" {metadata['origin']} → {metadata['destination']}: ${cost}") #> St. Louis, MO → Lisbon, Portugal: $850 @@ -448,12 +448,12 @@ for call in requests.calls: metadata = requests.metadata.get(call.tool_call_id, {}) auth_url = metadata.get('auth_url') - print(f"Browser auth required: {auth_url}") + print(f'Browser auth required: {auth_url}') #> Browser auth required: https://example.com/auth/skyway-airlines # In real code: open browser, wait for auth completion # For demo, just mark as completed - results.calls[call.tool_call_id] = "Frequent flyer account linked" + results.calls[call.tool_call_id] = 'Frequent flyer account linked' # Continue with results result = agent.run_sync( diff --git a/tests/test_examples.py b/tests/test_examples.py index caa1eeba3c..ca2f848ccb 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -720,6 +720,21 @@ async def model_logic( # noqa: C901 TextPart(content='The factorial of 15 is **1,307,674,368,000**.'), ] ) + elif m.content == 'Book a flight to Lisbon, Portugal and link my SkyWay Airlines account': + return ModelResponse( + parts=[ + ToolCallPart( + tool_name='book_flight', + args={'destination': 'Lisbon, Portugal'}, + tool_call_id='pyd_ai_tool_call_id_1', + ), + ToolCallPart( + tool_name='authenticate_with_airline', + args={'airline': 'SkyWay Airlines'}, + tool_call_id='pyd_ai_tool_call_id_2', + ), + ] + ) elif isinstance(m, ToolReturnPart) and m.tool_name == 'roulette_wheel': win = m.content == 'winner' @@ -896,6 +911,15 @@ async def model_logic( # noqa: C901 ) # If we don't have both results yet, just acknowledge the tool result return ModelResponse(parts=[TextPart(f'Received result from {m.tool_name}')]) + elif isinstance(m, ToolReturnPart) and m.tool_name in ('book_flight', 'authenticate_with_airline'): + # After deferred tools complete, check if we have all results to provide final response + tool_names = {part.tool_name for msg in messages for part in msg.parts if isinstance(part, ToolReturnPart)} + if 'book_flight' in tool_names and 'authenticate_with_airline' in tool_names: + return ModelResponse( + parts=[TextPart('Flight to Lisbon booked successfully and your SkyWay Airlines account is now linked.')] + ) + # If we don't have both results yet, just acknowledge the tool result + return ModelResponse(parts=[TextPart(f'Received result from {m.tool_name}')]) if isinstance(m, ToolReturnPart): sys.stdout.write(str(debug.format(messages, info))) From 09a38a2df36fc692ac5840c14ea129e1a92ead40 Mon Sep 17 00:00:00 2001 From: "Cody J. 
Hanson" Date: Fri, 7 Nov 2025 17:40:30 -0600 Subject: [PATCH 9/9] test: Complete deferred tool tests and remove dead branch - Complete test_approval_required_without_metadata() by running agent with approval results to hit the tool implementation line - Complete test_mixed_deferred_tools_with_metadata() by running agent with all deferred tool results to hit tool implementation lines - Remove unreachable branch check in _populate_deferred_calls() - keys are always present in both dicts by construction This achieves 100% coverage for the deferred tool metadata feature. --- pydantic_ai_slim/pydantic_ai/_agent_graph.py | 7 +++---- tests/test_tools.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py index 9481491716..7285908123 100644 --- a/pydantic_ai_slim/pydantic_ai/_agent_graph.py +++ b/pydantic_ai_slim/pydantic_ai/_agent_graph.py @@ -1046,10 +1046,9 @@ def _populate_deferred_calls( for k in sorted(deferred_calls_by_index): call = tool_calls[k] output_deferred_calls[deferred_calls_by_index[k]].append(call) - if k in deferred_metadata_by_index: - metadata = deferred_metadata_by_index[k] - if metadata is not None: - output_deferred_metadata[call.tool_call_id] = metadata + metadata = deferred_metadata_by_index[k] + if metadata is not None: + output_deferred_metadata[call.tool_call_id] = metadata async def _call_tool( diff --git a/tests/test_tools.py b/tests/test_tools.py index 9d0739d33b..a92e831873 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1509,6 +1509,14 @@ def my_tool(ctx: RunContext[None], x: int) -> int: # Should have an empty metadata dict for this tool assert result.output.metadata.get('my_tool', {}) == {} + # Continue with approval + messages = result.all_messages() + result = agent.run_sync( + message_history=messages, + deferred_tool_results=DeferredToolResults(approvals={'my_tool': ToolApproved()}), + ) + assert result.output == 'Done!' + def test_mixed_deferred_tools_with_metadata(): """Test multiple deferred tools with different metadata.""" @@ -1553,6 +1561,17 @@ def tool_c(ctx: RunContext[None], z: int) -> int: assert result.output.metadata['call_b'] == {'reason': 'Needs approval', 'level': 'manager'} assert result.output.metadata.get('call_c', {}) == {} + # Continue with results for all three tools + messages = result.all_messages() + result = agent.run_sync( + message_history=messages, + deferred_tool_results=DeferredToolResults( + calls={'call_a': 10, 'call_c': 30}, + approvals={'call_b': ToolApproved()}, + ), + ) + assert result.output == 'Done!' + def test_deferred_tool_with_output_type(): class MyModel(BaseModel):