diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..d87dbe1f6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "python/evaluation/tau2-adk-harness/tau2-bench"] + path = python/evaluation/tau2-adk-harness/tau2-bench + url = https://github.com/sierra-research/tau2-bench.git diff --git a/python/evaluation/tau2-adk-harness/.gitignore b/python/evaluation/tau2-adk-harness/.gitignore new file mode 100644 index 000000000..ae657ce18 --- /dev/null +++ b/python/evaluation/tau2-adk-harness/.gitignore @@ -0,0 +1,209 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock +#poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +#pdm.lock +#pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +#pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Cursor +# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to +# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data +# refer to https://docs.cursor.com/context/ignore-files +.cursorignore +.cursorindexingignore + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +evaluation_logs/ diff --git a/python/evaluation/tau2-adk-harness/README.md b/python/evaluation/tau2-adk-harness/README.md new file mode 100644 index 000000000..ff0ae7716 --- /dev/null +++ b/python/evaluation/tau2-adk-harness/README.md @@ -0,0 +1,242 @@ +# The Tau2-ADK Evaluation Harness: From "It Works" to "It's Proven" + +**The must-have evaluation framework for building production-ready ADK agents.** + +## The Challenge: How Do You *Really* Know Your Agent Is Ready for Production? + +Building an AI agent with Google's Agent Development Kit (ADK) is straightforward. Proving that it is reliable, safe, and effective is not. + +Manual, ad-hoc testing is often: +- **Subjective**: Based on a few "happy path" conversations. +- **Incomplete**: Misses crucial edge cases and adversarial user behavior. +- **Not Repeatable**: Lacks a consistent way to measure if a change made the agent better or worse. + +Going to production without rigorous, objective evaluation is a significant risk. How do you prevent regressions? How do you prove the value of upgrading your model? How do you know your agent will follow complex business rules when a user tries to trick it? + +## The Solution: A Professional Harness for a World-Class Benchmark + +This project provides the critical bridge between your **production-grade ADK agent** and the **sophisticated [`tau2-bench`](https://github.com/sierra-research/tau2-bench) benchmark**. + +- **What It Is**: An evaluation harness that allows you to test agents built with Google's ADK against a suite of complex, stateful, and realistic customer service scenarios. 
+- **How It Works**: It acts as an intelligent adapter, translating your agent's native, event-driven architecture into the format expected by the `tau2-bench` evaluation engine. + +In short, this project lets you take your agent out of the lab and put it through a rigorous "flight simulator" to prove it's ready for the real world. + +## Why Is This Critical for Your ADK Project? + +Adopting this framework provides foundational benefits that directly impact quality and reduce risk. + +#### 1. Test Your *Actual* Production Code +Vanilla `tau2-bench` and the Google ADK use different agent architectures. Without this harness, you would have to build a simplified, "test-only" version of your agent, which wouldn't accurately reflect its real-world behavior. + +**The harness allows you to evaluate the exact `adk.Agent` object you will deploy to production.** You are testing your real code, your real prompts, and your real logic in a controlled environment. + +#### 2. Evaluate Core Reasoning, Not Just Tool Execution +A working tool is only a small part of a successful agent. This framework tests the agent's "brain." It answers the most important questions: +- **Instruction Following**: Does the agent correctly adhere to a multi-page policy document, even when a user pushes against the rules? +- **Tool Selection & Sequencing**: In a messy, multi-turn conversation, can it identify the correct tool, and then the *next* correct tool, to solve a multi-step problem? +- **Parameter Extraction**: Does it accurately extract the right arguments (e.g., `product_id`, `user_name`) from ambiguous natural language? +- **Result Interpretation & State Tracking**: Can it understand a complex API response (JSON, dictionary), integrate that information into its understanding of the conversation, and formulate a correct, helpful next step? + +#### 3. Leverage a Rich, Pre-Built Test Suite for Free +The `tau2-bench` tasks are a treasure trove of expertly crafted test cases. Instead of spending weeks creating your own, you get immediate access to dozens of scenarios, including: +- Users who lie or provide incorrect information. +- Users who change their minds or introduce new goals mid-conversation. +- Complex, multi-step tasks that require multiple tool calls. +- Scenarios designed to test specific policy violations. + +This is like having a professional QA team dedicated to finding your agent's weaknesses before your customers do. + +#### 4. Achieve Objective, Repeatable Benchmarking +Stop guessing if a change was an improvement. This harness provides a numerical reward score for every evaluation run. This enables you to: +- **Track Progress**: Get hard data showing your agent's performance is improving over time. +- **Prevent Regressions**: Add these evaluations to your CI/CD pipeline to catch regressions before they merge. +- **Compare Models**: Objectively measure the performance lift of switching from one LLM to another (e.g., Gemini 2.5 Flash vs. Gemini 2.5 Pro). + +The conversation shifts from "I think this prompt is better" to "This prompt improved performance on adversarial tasks by 15%." + +#### 5. Enable Automated Prompt and Model Optimization +Because the harness produces a single, objective score for a suite of tests, it becomes the engine for advanced MLOps workflows. You can move beyond manual A/B testing and build automated pipelines that can: +- Systematically test hundreds of prompt variations to find the optimal system instruction. +- Tune tool descriptions and agent strategies algorithmically. 
+- Automatically benchmark and select the most cost-effective model that meets your performance target. + +This harness provides the foundational measurement layer required to apply data-driven optimization techniques directly to your ADK agents. + +## How is this different from ADK's built-in `adk eval`? + +The built-in `adk eval` is an excellent tool for "golden set" testing, but it serves a different purpose. Think of it as your project's **unit test suite**, while this harness is the **standardized certification exam**. + +| Feature | ADK's `adk eval` (Unit Testing) | Tau2-ADK Harness (Performance Benchmarking) | +| :--- | :--- | :--- | +| **Purpose** | **Regression Testing.** Answers: "Does my agent still produce the *exact same output* for a known input?" | **Problem-Solving Benchmarking.** Answers: "How well can my agent *independently solve a complex problem* from scratch?" | +| **Method** | Compares a new run against a previously saved, "golden" conversation trajectory. | Compares the final outcome against an objective, ground-truth state (e.g., "Was the correct flight booked?"). | +| **Environment** | Assumes static tool behavior. Excellent for testing agent logic in isolation. | Interacts with a **stateful, simulated environment** (e.g., dynamic product databases, user profiles). | +| **Test Cases** | Requires developers to manually create and curate specific test conversations. | Leverages a **pre-existing, curated suite** of dozens of challenging tasks designed to find edge cases. | + +--- + +## FAQ: Answering the Tough Questions + +#### Q: "This is great, but our agent's tools are totally different from the ones in `tau2-bench`. How is this useful?" +**A:** This is by design. The harness is not for testing your tool's *implementation*, but your agent's *reasoning*. You use the `harness/tool_mapper.py` file to create a "translation layer" that maps your agent's tools to the benchmark's tools. This process itself is valuable, but the result is a test of your agent's ability to decide *when* to call a tool and with *what* arguments, which is a universal challenge. + +#### Q: "Why not just use the vanilla `tau2-bench` project directly?" +**A:** Because they use fundamentally different programming models. This harness treats `tau2-bench` as a **library of assets** (tasks, stateful environments, and scoring logic) rather than an execution engine. The ADK is asynchronous and event-driven, while `tau2-bench` expects a simpler, synchronous agent. This harness provides the essential, non-trivial code in `run_evaluation.py` that bridges this gap. Without it, you would have to build this complex adapter yourself. + +#### Q: "The harness seems to be just one small `tool_mapper.py` file. Is that it?" +**A:** `tool_mapper.py` is only the simple **configuration** file. The **engine** is `run_evaluation.py`, a substantial script that orchestrates the entire process: running the ADK agent, managing the async event stream, translating messages, handling the conversational loop, and calculating the final score. The simplicity of the mapper is a feature that makes the powerful engine easy to configure. + +#### Q: "This seems like a lot of work to set up. What's the ROI?" +**A:** The return on investment is **confidence and risk reduction**. It's the difference between hoping your agent works and having data that proves it handles a wide range of challenging scenarios correctly. 
The initial effort of creating the tool mapping or customizing the benchmark pays for itself by preventing production failures, accelerating development cycles, and providing clear, data-driven insights into your agent's quality. + +## Your Path to Confident Deployment + +1. **Start Small**: Begin by mapping your agent's tools to an existing `tau2-bench` domain (like `retail` or `airline`). This is a low-effort way to get an immediate signal on your agent's core reasoning. +2. **Go Pro**: As your agent matures, create a custom benchmark by forking `tau2-bench` and modifying the tools, database, and tasks to perfectly mirror your own environment. This gives you a permanent, high-fidelity asset for ensuring your agent's quality. + +Using this harness will fundamentally change how you develop agents. You will move faster, build with more confidence, and create a final product that is demonstrably more robust and reliable. + +--- + +## Technical Documentation & Quick Start + +This section provides the original technical details for setting up and running the harness. + +### How It Works + +The harness sits between the ADK agent and the Tau2 ecosystem, orchestrating the conversation and translating calls between the two frameworks. The diagram below illustrates the flow of information during a single turn where the agent decides to use a tool. + +![Flow Diagram](assets/flow.png) + +Here is a step-by-step walkthrough of what happens during this process: + +1. **The User Speaks:** The **Tau2 User Simulator**, following its instructions for the current task, generates an initial user utterance (e.g., "I need to find a flight from SFO to JFK tomorrow"). + +2. **Harness Forwards to ADK Agent:** The **Harness** receives the message, formats it into an ADK `Content` object, and passes it to your **ADK Agent** to process. + +3. **ADK Agent Decides to Use a Tool:** Your agent's LLM analyzes the user's request. Based on its instructions and the available tool definitions (e.g., `adk_find_flights`), it decides to call a tool. The ADK framework emits this decision as a `FunctionCall` event. + +4. **Harness Intercepts and Translates the Tool Call:** The **Harness** intercepts this `FunctionCall` event. Instead of executing the dummy ADK function, it passes the tool name and arguments to the internal **Tool Mapper**. The mapper translates them into the corresponding Tau2 tool name and argument format (e.g., `adk_find_flights` becomes `search_direct_flight`). + +5. **Harness Executes the *Real* Tool:** The harness executes the *translated* tool call against the stateful **Tau2 Environment**. + +6. **Tau2 Environment Returns a Result:** The Tau2 Environment processes the request (e.g., queries its internal flight database) and returns a raw result (e.g., a list of Pydantic `Flight` objects). + +7. **Harness Forwards the Result to the ADK Agent:** The **Harness** receives the raw result from the Tau2 Environment. It formats this result into an ADK `FunctionResponse` object, typically by serializing it to a dictionary or JSON string that the ADK agent's LLM can understand. This is sent back to the agent to inform its next step. + +8. **ADK Agent Formulates a Response:** Your **ADK Agent**'s LLM processes the tool result and generates a natural language response for the user (e.g., "I found three available flights for you..."). + +9. **Harness Delivers Response to the User:** The **Harness** captures this final text response and passes it back to the **Tau2 User Simulator**, completing the turn. 
The user simulator then evaluates the agent's response and generates its own reply, continuing the conversation.
+
+### Features
+
+**Key Features:**
+* **Dynamic Agent Loading:** Point the harness to your ADK agent file and variable.
+* **Runtime Policy Injection:** Automatically injects the Tau2 domain policy into your ADK agent's instructions for each task.
+* **Extensible Tool Mapping:** A simple, centralized mapping system in `harness/tool_mapper.py` translates tool and argument names between the ADK and Tau2 frameworks.
+* **Full Trajectory Capture:** Captures the complete conversation in Tau2's data format for reliable evaluation.
+* **Sample Agent Included:** Comes with a working sample agent for the `airline` domain to get you started immediately.
+
+### Prerequisites
+
+1. **Python 3.11+** (required by the pinned dependencies in `requirements.txt`, e.g. `numpy==2.3.3`)
+2. A working installation of **Tau2 Bench**. This harness relies on its environment and evaluation libraries. Please follow the [Tau2 Bench installation guide](https://github.com/sierra-research/tau2-bench#installation) first.
+3. **API Keys:** The harness and the underlying frameworks use LLMs. You must provide API keys in a `.env` file in the project's root directory.
+
+   Create a file named `.env` and add your keys:
+   ```bash
+   # .env file
+   GOOGLE_API_KEY="your-google-api-key"
+   OPENAI_API_KEY="your-openai-api-key"
+   # ... and any other keys for your desired LLM provider
+   ```
+
+### Installation
+
+1. Clone the repository (use `--recurse-submodules` to pull in the `tau2-bench` submodule):
+   ```bash
+   git clone --recurse-submodules
+   cd tau2-adk-harness
+   ```
+
+2. Install the required Python packages:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+### Quick Start
+
+You can run an evaluation on the included sample agent against the Tau2 `airline` domain.
+
+```bash
+python run_evaluation.py \
+    --domain airline \
+    --adk_agent_path sample_adk_agents/airline/agent.py:root_agent \
+    --user-llm gemini-2.5-flash \
+    --num-tasks 1
+```
+
+**Command Breakdown:**
+* `--domain airline`: Specifies the Tau2 domain to run against.
+* `--adk_agent_path ...`: Provides the path to the ADK agent file and the variable name of the agent instance (`file:variable`).
+* `--user-llm ...`: The LLM to power the Tau2 user simulator. A powerful model like Gemini 2.5 Pro is recommended for realistic user behavior.
+* `--num-tasks 1`: Runs only the first task for a quick test.
+
+### How to Use the Harness
+
+#### 1. Evaluating Your Own ADK Agent
+
+To evaluate your custom ADK agent, follow these steps:
+
+1. **Ensure your agent is defined in a Python file.** For example, `my_company/agents/booking_agent.py` might contain a variable `flight_booker_agent = Agent(...)`.
+
+2. **Map your agent's tools to Tau2's tools.** Open `harness/tool_mapper.py` and add or update the mapping for the domain you are targeting. For example, if your agent's tool for finding flights is named `search_for_flights`, you would map it to Tau2's `search_direct_flight` in the `airline` domain config.
+
+3. **Run the evaluation script,** pointing it to your agent:
+   ```bash
+   python run_evaluation.py \
+       --domain airline \
+       --adk_agent_path my_company/agents/booking_agent.py:flight_booker_agent \
+       --user-llm gemini-2.5-flash
+   ```
+
+#### 2. Adding Support for a New Tau2 Domain
+
+Extending the harness to support another Tau2 domain (e.g., `telecom`) is straightforward:
+
+1. **Inspect the Tau2 Domain's Tools:** Use the Tau2 CLI to see the available tools and their signatures for the new domain:
+   ```bash
+   tau2 domain telecom
+   ```
+   This will open a ReDoc page with the domain's policy and tool API documentation.
+
+2. 
**Update the Tool Mapper:** Open `harness/tool_mapper.py` and add a new entry to the `DOMAIN_CONFIGS` dictionary. + + ```python + # harness/tool_mapper.py + + def map_telecom_arguments(adk_tool_name: str, adk_args: dict) -> dict: + # Add logic here if ADK argument names differ from Tau2's + # For example: map 'phoneNumber' to 'phone_number' + if 'phoneNumber' in adk_args: + adk_args['phone_number'] = adk_args.pop('phoneNumber') + return adk_args + + DOMAIN_CONFIGS = { + "airline": { + # ... existing airline config + }, + "telecom": { + "tool_map": { + # ADK Tool Name : Tau2 Tool Name + "adk_get_customer_by_phone": "get_customer_by_phone", + "adk_suspend_line": "suspend_line", + # ... other telecom tool mappings + }, + "arg_mapper": map_telecom_arguments + } + } + ``` + +3. **Create your ADK agent** with the tool signatures you defined (e.g., `adk_get_customer_by_phone`) and run the evaluation against the `telecom` domain. \ No newline at end of file diff --git a/python/evaluation/tau2-adk-harness/assets/flow.png b/python/evaluation/tau2-adk-harness/assets/flow.png new file mode 100644 index 000000000..b778da92a Binary files /dev/null and b/python/evaluation/tau2-adk-harness/assets/flow.png differ diff --git a/python/evaluation/tau2-adk-harness/harness/__init__.py b/python/evaluation/tau2-adk-harness/harness/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/evaluation/tau2-adk-harness/harness/tool_mapper.py b/python/evaluation/tau2-adk-harness/harness/tool_mapper.py new file mode 100644 index 000000000..28c25a727 --- /dev/null +++ b/python/evaluation/tau2-adk-harness/harness/tool_mapper.py @@ -0,0 +1,63 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def map_airline_arguments(adk_tool_name: str, adk_args: dict) -> dict: + """ + Translates arguments from ADK tool format to Tau2 tool format for the airline + domain. For our current example, the argument names match, so we can just return + them. If they were different, you would add mapping logic here. e.g., if adk_args + had 'from_city', you'd map it to 'origin'. + """ + return adk_args + + +# --- The Central Mapping Configuration --- +# To support a new domain, add a new entry here. 
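+# Each domain entry is a dict with two keys:
+#   "tool_map":   maps each ADK tool name to its Tau2 counterpart.
+#   "arg_mapper": a callable (adk_tool_name, adk_args) -> tau2_args that
+#                 renames arguments wherever the two frameworks differ.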
+DOMAIN_CONFIGS = {
+    "airline": {
+        "tool_map": {
+            # ADK Tool Name : Tau2 Tool Name
+            "adk_find_flights": "search_direct_flight",
+            "adk_get_booking_details": "get_reservation_details",
+            "adk_cancel_reservation": "cancel_reservation",
+            "adk_transfer_to_human": "transfer_to_human_agents",
+            "adk_book_reservation": "book_reservation",
+            "adk_calculate": "calculate",
+            "adk_get_user_details": "get_user_details",
+            "adk_list_all_airports": "list_all_airports",
+            "adk_search_onestop_flight": "search_onestop_flight",
+            "adk_send_certificate": "send_certificate",
+            "adk_update_reservation_baggages": "update_reservation_baggages",
+            "adk_update_reservation_flights": "update_reservation_flights",
+            "adk_update_reservation_passengers": "update_reservation_passengers",
+            "adk_get_flight_status": "get_flight_status",
+        },
+        "arg_mapper": map_airline_arguments,
+    },
+    # "telecom": { ... mappings for telecom would go here ... }
+}
+
+
+def get_tool_mapping(domain: str) -> dict:
+    """
+    Returns the tool and argument mapping configuration for a given domain.
+    """
+    if domain in DOMAIN_CONFIGS:
+        return DOMAIN_CONFIGS[domain]
+    else:
+        raise ValueError(
+            f"No tool mapping is configured for domain: '{domain}'. Please add it"
+            " to harness/tool_mapper.py."
+        )
diff --git a/python/evaluation/tau2-adk-harness/requirements.txt b/python/evaluation/tau2-adk-harness/requirements.txt
new file mode 100644
index 000000000..5bdf9fa37
--- /dev/null
+++ b/python/evaluation/tau2-adk-harness/requirements.txt
@@ -0,0 +1,163 @@
+absolufy-imports==0.3.1
+addict==2.4.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+alembic==1.16.5
+annotated-types==0.7.0
+anyio==4.11.0
+appdirs==1.4.4
+attrs==25.3.0
+Authlib==1.6.4
+backoff==2.2.1
+cachetools==6.2.0
+certifi==2025.8.3
+cffi==2.0.0
+charset-normalizer==3.4.3
+click==8.3.0
+cloudpickle==3.1.1
+contourpy==1.3.3
+cryptography==46.0.2
+cycler==0.12.1
+deepdiff==8.6.1
+distro==1.9.0
+docstring_parser==0.17.0
+fastapi==0.118.0
+fastuuid==0.13.5
+filelock==3.19.1
+fonttools==4.60.1
+frozenlist==1.7.0
+fs==2.4.16
+fsspec==2025.9.0
+google-adk==1.15.1
+google-api-core==2.25.1
+google-api-python-client==2.183.0
+google-auth==2.41.1
+google-auth-httplib2==0.2.0
+google-cloud-aiplatform==1.118.0
+google-cloud-appengine-logging==1.6.2
+google-cloud-audit-log==0.3.2
+google-cloud-bigquery==3.38.0
+google-cloud-bigtable==2.32.0
+google-cloud-core==2.4.3
+google-cloud-logging==3.12.1
+google-cloud-monitoring==2.27.2
+google-cloud-resource-manager==1.14.2
+google-cloud-secret-manager==2.24.0
+google-cloud-spanner==3.58.0
+google-cloud-speech==2.33.0
+google-cloud-storage==2.19.0
+google-cloud-trace==1.16.2
+google-crc32c==1.7.1
+google-genai==1.38.0
+google-resumable-media==2.7.2
+googleapis-common-protos==1.70.0
+graphviz==0.21
+grpc-google-iam-v1==0.14.2
+grpc-interceptor==0.15.4
+grpcio==1.75.1
+grpcio-status==1.75.1
+h11==0.16.0
+hf-xet==1.1.10
+httpcore==1.0.9
+httplib2==0.31.0
+httpx==0.28.1
+httpx-sse==0.4.1
+huggingface-hub==0.35.3
+idna==3.10
+importlib_metadata==8.7.0
+iniconfig==2.1.0
+Jinja2==3.1.6
+jiter==0.11.0
+joblib==1.5.2
+jsonschema==4.25.1
+jsonschema-specifications==2025.9.1
+kiwisolver==1.4.9
+langfuse==3.5.2
+litellm==1.77.5
+loguru==0.7.3
+Mako==1.3.10
+markdown-it-py==4.0.0
+MarkupSafe==3.0.3
+matplotlib==3.10.6
+mcp==1.15.0
+mdurl==0.1.2
+multidict==6.6.4
+narwhals==2.6.0
+numpy==2.3.3
+openai==2.0.0
+opentelemetry-api==1.37.0
+opentelemetry-exporter-gcp-logging==1.9.0a0
+opentelemetry-exporter-gcp-monitoring==1.9.0a0
+opentelemetry-exporter-gcp-trace==1.9.0 +opentelemetry-exporter-otlp-proto-common==1.37.0 +opentelemetry-exporter-otlp-proto-http==1.37.0 +opentelemetry-proto==1.37.0 +opentelemetry-resourcedetector-gcp==1.9.0a0 +opentelemetry-sdk==1.37.0 +opentelemetry-semantic-conventions==0.58b0 +orderly-set==5.5.0 +packaging==25.0 +pandas==2.3.3 +pillow==11.3.0 +plotly==6.3.0 +pluggy==1.6.0 +propcache==0.3.2 +proto-plus==1.26.1 +protobuf==6.32.1 +psutil==7.1.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.23 +pydantic==2.11.9 +pydantic-argparse==0.10.0 +pydantic-settings==2.11.0 +pydantic_core==2.33.2 +Pygments==2.19.2 +pyparsing==3.2.5 +pytest==8.4.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.1 +python-multipart==0.0.20 +pytz==2025.2 +PyYAML==6.0.3 +redis==6.4.0 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.1.0 +rpds-py==0.27.1 +rsa==4.9.1 +ruff==0.13.2 +scikit-learn==1.7.2 +scipy==1.16.2 +seaborn==0.13.2 +setuptools==80.9.0 +shapely==2.1.2 +six==1.17.0 +sniffio==1.3.1 +SQLAlchemy==2.0.43 +sqlalchemy-spanner==1.16.0 +sqlparse==0.5.3 +sse-starlette==3.0.2 +starlette==0.48.0 +tabulate==0.9.0 +-e git+https://github.com/sierra-research/tau2-bench.git@5ba9e3e56db57c5e4114bf7f901291f09b2c5619#egg=tau2&subdirectory=../../python/evaluation/tau2-adk-harness/tau2-bench +tenacity==9.1.2 +threadpoolctl==3.6.0 +tiktoken==0.11.0 +tokenizers==0.22.1 +toml==0.10.2 +tqdm==4.67.1 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +tzdata==2025.2 +tzlocal==5.3.1 +uritemplate==4.2.0 +urllib3==2.5.0 +uvicorn==0.37.0 +watchdog==6.0.0 +websockets==15.0.1 +wrapt==1.17.3 +yarl==1.20.1 +zipp==3.23.0 diff --git a/python/evaluation/tau2-adk-harness/run_evaluation.py b/python/evaluation/tau2-adk-harness/run_evaluation.py new file mode 100644 index 000000000..df6554d35 --- /dev/null +++ b/python/evaluation/tau2-adk-harness/run_evaluation.py @@ -0,0 +1,519 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script runs an evaluation harness for ADK agents against Tau2-Bench tasks. + +It orchestrates the interaction between an ADK agent and a Tau2-Bench environment, +simulating a user conversation and evaluating the agent's performance based on +the Tau2-Bench reward system. + +Key functionalities include: +- Loading an ADK agent dynamically from a specified path. +- Initializing a Tau2-Bench environment for a given domain and task. +- Injecting Tau2-Bench domain policies into the ADK agent's instructions. +- Simulating user interactions using a specified LLM. +- Mapping ADK tool calls to Tau2-Bench tool calls and executing them. +- Logging detailed trajectories and results for each task. +- Generating a summary of the evaluation run, including average reward. 
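+
+Example (the README quick start command):
+    python run_evaluation.py \
+        --domain airline \
+        --adk_agent_path sample_adk_agents/airline/agent.py:root_agent \
+        --user-llm gemini-2.5-flash \
+        --num-tasks 1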
+""" + + +import asyncio +import argparse +import importlib +import json +import sys +import os +import csv +import warnings +from pathlib import Path +from contextlib import redirect_stderr +from dotenv import load_dotenv +from copy import deepcopy +from datetime import datetime +from tqdm.asyncio import tqdm +from loguru import logger as loguru_logger + +# Disable the default loguru handler to clean up console output from tau2-bench +# This must be done before importing any tau2 modules +loguru_logger.remove() +loguru_logger.add(sys.stderr, level="WARNING") + +# Suppress ResourceWarning, which can be triggered by unclosed aiohttp client sessions +warnings.filterwarnings("ignore", category=ResourceWarning) + +# --- ADK Imports --- +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService +from google.adk.events import Event +from google.genai.types import Content, Part, FunctionResponse, FunctionCall + +# --- Tau2-Bench Imports --- +from tau2.run import get_tasks +from tau2.registry import registry +from tau2.evaluator.evaluator_env import EnvironmentEvaluator +from tau2.data_model.simulation import SimulationRun, TerminationReason +from tau2.data_model.message import UserMessage, AssistantMessage, ToolCall, ToolMessage +from tau2.user.user_simulator import UserSimulator + +# --- Harness Imports --- +from harness.tool_mapper import get_tool_mapping + + +class FileLogger: + """A simple logger to write to a file and optionally to stdout.""" + + def __init__(self, log_path, to_stdout=True): + self.terminal = sys.stdout + self.log_file = open(log_path, "w", encoding="utf-8") + self.to_stdout = to_stdout + + def log(self, message): + """Log a message to the configured outputs.""" + if self.to_stdout: + # This is now only used for high-level status in main + print(message, file=self.terminal, flush=True) + + # Always write to the task-specific log file + print(message, file=self.log_file, flush=True) + + def __del__(self): + if self.log_file: + self.log_file.close() + + def task_start(self, task_id): + """Logs the start of a task.""" + self.log(f"--- Running Task: {task_id} ---") + + def info(self, message): + """Logs an informational message.""" + self.log(f"\n[INFO] {message}") + + def user_simulator(self, message, is_stop=False): + """Logs a message from the user simulator.""" + stop_signal = " (STOP SIGNAL)" if is_stop else "" + self.log(f"\n[USER SIMULATOR]: {message}{stop_signal}") + + def turn_start(self, turn_number): + """Logs the start of a new turn.""" + self.log(f"\n>>> Turn {turn_number}: ADK Agent processing...") + + def agent_to_harness_tool_call(self, tool_name, args): + """Logs a tool call from the ADK agent to the harness.""" + self.log(f" [ADK AGENT -> Harness]: Tool Call: {tool_name}({args})") + + def harness_to_env(self, tool_name): + """Logs a tool execution in the Tau2 environment.""" + self.log(f" [Harness -> Tau2 Env]: Executed {tool_name}.") + + def agent_to_user(self, message): + """Logs a text response from the ADK agent to the user.""" + self.log(f" [ADK AGENT -> User]: {message}") + + def evaluation_result(self, task_id, reward, db_check): + """Logs the final evaluation result for a task.""" + self.log("\n--- EVALUATION RESULT ---") + self.log(f"✅ Task: {task_id}") + self.log(f"🏆 Reward: {reward:.2f}") + if db_check: + self.log(f"🗃️ DB Match: {db_check.db_match}") + self.log("----------------------------\n") + + +def _find_tool_call_in_events(events: list) -> FunctionCall | None: + for event in events: + if event.content and 
event.content.parts: + for part in event.content.parts: + if part.function_call: + return part.function_call + return None + + +def _get_final_text_from_events(events: list) -> str | None: + final_text = "" + for event in events: + if ( + event.is_final_response() + and event.content + and event.content.parts + and event.content.parts[0].text + ): + final_text += event.content.parts[0].text + return final_text if final_text else None + + +async def run_evaluation_for_task( + domain: str, + task, + adk_agent, + user_llm: str, + run_path: Path, + max_turns: int, +): + """Orchestrates the evaluation of a single task, logging its detailed output to its + own file.""" + task_path = run_path / "trajectories" / f"task_{task.id}" + os.makedirs(task_path, exist_ok=True) + + task_logger = FileLogger(task_path / "console.log", to_stdout=False) + + task_logger.task_start(task.id) + + env_constructor = registry.get_env_constructor(domain) + tau2_env = env_constructor() + if task.initial_state: + tau2_env.set_state( + initialization_data=task.initial_state.initialization_data, + initialization_actions=task.initial_state.initialization_actions, + message_history=task.initial_state.message_history or [], + ) + + domain_policy = tau2_env.get_policy() + adk_agent_with_policy = deepcopy(adk_agent) + original_instruction = adk_agent_with_policy.instruction + adk_agent_with_policy.instruction = ( + "You must strictly follow the policies provided below.\n\n" + "\n" + f"{domain_policy}\n" + "\n\n" + "--- Your Original Instructions ---\n" + f"{original_instruction}" + ) + task_logger.info( + "Injected Tau2 domain policy into ADK agent's instructions for this run." + ) + + adk_session_service = InMemorySessionService() + adk_runner = Runner( + agent=adk_agent_with_policy, + app_name="adk_eval_harness", + session_service=adk_session_service, + ) + adk_session = await adk_session_service.create_session( + app_name="adk_eval_harness", user_id="eval_user" + ) + + user_simulator = UserSimulator(instructions=str(task.user_scenario), llm=user_llm) + user_sim_state = user_simulator.get_init_state() + + tau2_trajectory = [] + + initial_assistant_msg = AssistantMessage( + role="assistant", content="Hello! How can I help you today?" + ) + user_response_msg, user_sim_state = user_simulator.generate_next_message( + initial_assistant_msg, user_sim_state + ) + + task_logger.user_simulator(user_response_msg.content) + tau2_trajectory.append(user_response_msg) + current_adk_message = Content( + role="user", parts=[Part(text=user_response_msg.content)] + ) + + # Main interaction loop: ADK Agent <-> User Simulator + for turn in range(max_turns): + task_logger.turn_start(turn + 1) + + # Run the ADK agent for one turn with the user's message + adk_events = [ + event + async for event in adk_runner.run_async( + session_id=adk_session.id, + user_id="eval_user", + new_message=current_adk_message, + ) + ] + + # Check if the agent responded with a tool call or a text message + adk_tool_call = _find_tool_call_in_events(adk_events) + + if adk_tool_call: + # Agent wants to use a tool. + # Map the ADK tool call to the corresponding Tau2 tool, execute it, + # and feed the result back to the agent in the next turn. 
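+            # For example, in the airline domain the ADK tool name
+            # "adk_find_flights" maps to the Tau2 tool "search_direct_flight"
+            # (see harness/tool_mapper.py).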
+ tool_map_config = get_tool_mapping(domain) + adk_tool_name = adk_tool_call.name + adk_args = dict(adk_tool_call.args) + adk_tool_call_id = adk_tool_call.id or f"adk_tool_call_{turn}" + + task_logger.agent_to_harness_tool_call(adk_tool_name, adk_args) + + tau2_tool_name = tool_map_config["tool_map"].get(adk_tool_name) + tau2_args = tool_map_config["arg_mapper"](adk_tool_name, adk_args) + + tau2_trajectory.append( + AssistantMessage( + role="assistant", + tool_calls=[ + ToolCall( + id=adk_tool_call_id, + name=tau2_tool_name, + arguments=tau2_args, + ) + ], + ) + ) + + tool_result = tau2_env.use_tool(tool_name=tau2_tool_name, **tau2_args) + task_logger.harness_to_env(tau2_tool_name) + + if hasattr(tool_result, "model_dump"): + tool_result_for_adk = tool_result.model_dump() + tool_result_for_eval = tool_result.model_dump() + elif isinstance(tool_result, dict): + tool_result_for_adk = tool_result + tool_result_for_eval = tool_result + else: + tool_result_for_adk = {"result": tool_result} + tool_result_for_eval = tool_result + + tau2_trajectory.append( + ToolMessage( + id=adk_tool_call_id, + role="tool", + content=json.dumps(tool_result_for_eval), + requestor="assistant", + ) + ) + + current_adk_message = Content( + role="user", + parts=[ + Part( + function_response=FunctionResponse( + id=adk_tool_call_id, + name=adk_tool_name, + response=tool_result_for_adk, + ) + ) + ], + ) + continue + + else: + # Agent responded with a text message. + # Get the user simulator's response and check if the conversation should end. + final_text = _get_final_text_from_events(adk_events) + if not final_text: + final_text = "(Agent produced no text response)" + + task_logger.agent_to_user(final_text) + + agent_response_msg = AssistantMessage(role="assistant", content=final_text) + tau2_trajectory.append(agent_response_msg) + + user_response_msg, user_sim_state = user_simulator.generate_next_message( + agent_response_msg, user_sim_state + ) + + # The user simulator signals the end of the conversation + if UserSimulator.is_stop(user_response_msg): + task_logger.user_simulator(user_response_msg.content, is_stop=True) + tau2_trajectory.append(user_response_msg) + break + + task_logger.user_simulator(user_response_msg.content) + tau2_trajectory.append(user_response_msg) + current_adk_message = Content( + role="user", parts=[Part(text=user_response_msg.content)] + ) + + dummy_sim_run = SimulationRun( + id=f"{run_path.name}_task_{task.id}", + task_id=task.id, + start_time="", + end_time="", + duration=0, + termination_reason=TerminationReason.USER_STOP, + messages=tau2_trajectory, + ) + + reward_info = EnvironmentEvaluator.calculate_reward( + environment_constructor=env_constructor, + task=task, + full_trajectory=dummy_sim_run.messages, + ) + + task_logger.evaluation_result(task.id, reward_info.reward, reward_info.db_check) + + # Save detailed task files + traj_path = task_path / "trajectory.json" + with open(traj_path, "w", encoding="utf-8") as f: + json.dump(dummy_sim_run.model_dump(mode="json"), f, indent=4) + + result_path = task_path / "result.json" + with open(result_path, "w", encoding="utf-8") as f: + json.dump(reward_info.model_dump(mode="json"), f, indent=4) + + return { + "task_id": task.id, + "reward": reward_info.reward, + "passed": reward_info.reward == 1.0, + "trajectory_file": str(traj_path), + "result_file": str(result_path), + "console_log_file": str(task_logger.log_file.name), + } + + +def write_summary_files(run_path: Path, all_task_results: list, args, adk_agent): + """Writes the run_summary.json 
and results.csv files."""
+    # Write results.csv
+    csv_path = run_path / "results.csv"
+    with open(csv_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=["task_id", "reward", "passed"])
+        writer.writeheader()
+        for result in all_task_results:
+            writer.writerow(
+                {
+                    "task_id": result["task_id"],
+                    "reward": result["reward"],
+                    "passed": result["passed"],
+                }
+            )
+
+    # Write run_summary.json
+    summary_path = run_path / "run_summary.json"
+    total_reward = sum(r["reward"] for r in all_task_results)
+    avg_reward = total_reward / len(all_task_results) if all_task_results else 0
+
+    summary_data = {
+        "run_id": run_path.name,
+        "domain": args.domain,
+        "num_tasks_run": len(all_task_results),
+        "agent_config": {
+            "path": args.adk_agent_path,
+            "model": adk_agent.model,
+            "description": adk_agent.description,
+        },
+        "user_llm": args.user_llm,
+        "average_reward": avg_reward,
+        "tasks": all_task_results,
+    }
+    with open(summary_path, "w", encoding="utf-8") as f:
+        json.dump(summary_data, f, indent=4)
+
+
+async def main(args):
+    # --- Setup Logging and Directories ---
+    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    run_id = f"run_{run_timestamp}_{args.domain}"
+    run_path = Path("evaluation_logs") / run_id
+    os.makedirs(run_path / "trajectories", exist_ok=True)
+
+    print(f"Starting evaluation run: {run_id}")
+    print(f"Saving results to: {run_path.resolve()}")
+
+    # --- Load ADK Agent ---
+    agent_path_str = args.adk_agent_path
+    try:
+        file_path_str, agent_variable_name = agent_path_str.split(":")
+    except ValueError:
+        raise ValueError(
+            "Invalid --adk_agent_path format. "
+            f"Expected 'path/to/agent.py:variable_name', but got '{agent_path_str}'"
+        )
+
+    agent_file_path = Path(file_path_str).resolve()
+    if not agent_file_path.is_file():
+        raise FileNotFoundError(f"Agent file not found at: {agent_file_path}")
+
+    agent_dir = agent_file_path.parent
+    sys.path.insert(0, str(agent_dir))
+
+    try:
+        module_name = agent_file_path.stem
+        agent_module = importlib.import_module(module_name)
+        adk_agent = getattr(agent_module, agent_variable_name)
+    except (ImportError, AttributeError) as e:
+        raise ImportError(
+            f"Could not import agent '{agent_variable_name}' "
+            f"from '{agent_file_path}'. 
Error: {e}" + ) + finally: + sys.path.pop(0) + + # --- Run Evaluation Tasks --- + tasks = get_tasks(args.domain, num_tasks=args.num_tasks) + + task_coroutines = [ + run_evaluation_for_task( + args.domain, task, adk_agent, args.user_llm, run_path, args.max_turns + ) + for task in tasks + ] + all_task_results = [] + + print(f"\nRunning {len(tasks)} tasks in parallel...") + # Temporarily redirect stderr to /dev/null to suppress aiohttp warnings and keep the + # progress bar clean + with open(os.devnull, "w") as devnull: + with redirect_stderr(devnull): + for future in tqdm.as_completed( + task_coroutines, + total=len(tasks), + desc=f"Evaluating {args.domain} domain", + file=sys.stdout, + ): + try: + result = await future + all_task_results.append(result) + except Exception as e: + # Print exceptions to stdout to ensure they are not silenced + print(f"\nA task failed with an exception: {e}\n") + + # --- Write Summary Files --- + if all_task_results: + write_summary_files(run_path, all_task_results, args, adk_agent) + + print(f"\nEvaluation run {run_id} finished.") + print(f"Results saved in: {run_path.resolve()}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run Conversational ADK Agent Evaluation Harness" + ) + parser.add_argument( + "--domain", type=str, required=True, help="Tau2-Bench domain to evaluate" + ) + parser.add_argument( + "--num-tasks", + type=int, + default=None, + help="Number of tasks to run (default: all).", + ) + parser.add_argument( + "--adk_agent_path", + type=str, + required=True, + help="Path to ADK agent. e.g. " + "'sample_adk_agents/airline/agent.py:variable_name'", + ) + parser.add_argument( + "--user-llm", type=str, required=True, help="LLM to use for the user simulator." + ) + parser.add_argument( + "--max-turns", + type=int, + default=15, + help="Maximum number of turns per task (default: 15).", + ) + + args = parser.parse_args() + + load_dotenv(dotenv_path=Path(__file__).parent / ".env") + + asyncio.run(main(args)) diff --git a/python/evaluation/tau2-adk-harness/sample_adk_agents/airline/__init__.py b/python/evaluation/tau2-adk-harness/sample_adk_agents/airline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/evaluation/tau2-adk-harness/sample_adk_agents/airline/agent.py b/python/evaluation/tau2-adk-harness/sample_adk_agents/airline/agent.py new file mode 100644 index 000000000..4b5511ca9 --- /dev/null +++ b/python/evaluation/tau2-adk-harness/sample_adk_agents/airline/agent.py @@ -0,0 +1,250 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from google.adk.agents import Agent +from typing import List, Dict, Any + +# These are the ADK-native tool definitions. +# Their signatures and docstrings are what the ADK agent's LLM will see. +# The code inside these functions will NEVER be executed by the harness. 
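+# Keep the tool names and signatures below in sync with the "airline" entry in
+# harness/tool_mapper.py, which maps each of them onto the real Tau2 tool.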
+
+
+def adk_find_flights(origin: str, destination: str, date: str) -> List[dict]:
+    """
+    Searches for available direct flights between an origin and a destination on a
+    specific date.
+    Args:
+        origin: The three-letter IATA code for the origin airport (e.g., 'SFO').
+        destination: The three-letter IATA code for the destination airport (e.g.,
+            'JFK').
+        date: The desired flight date in 'YYYY-MM-DD' format.
+    Returns:
+        A list of available flights, each represented as a dictionary.
+    """
+    pass  # The harness executes the real tau2 tool.
+
+
+def adk_get_booking_details(reservation_id: str) -> dict:
+    """
+    Retrieves the full details for a specific flight reservation using its ID.
+    Args:
+        reservation_id: The unique identifier for the reservation (e.g., '4WQ150').
+    Returns:
+        A dictionary containing the reservation details.
+    """
+    pass
+
+
+def adk_cancel_reservation(reservation_id: str) -> dict:
+    """
+    Cancels an entire flight reservation using its unique ID.
+    Args:
+        reservation_id: The unique identifier for the reservation to be cancelled.
+    Returns:
+        A dictionary confirming the cancellation status.
+    """
+    pass
+
+
+def adk_transfer_to_human(summary: str) -> dict:
+    """
+    Transfers the user to a human agent when a request cannot be handled by the
+    available tools or policy.
+    Args:
+        summary: A brief summary of the user's issue for the human agent.
+    Returns:
+        A dictionary confirming the transfer.
+    """
+    pass
+
+
+def adk_book_reservation(
+    user_id: str,
+    origin: str,
+    destination: str,
+    flight_type: str,
+    cabin: str,
+    flights: List[Dict[str, Any]],
+    passengers: List[Dict[str, Any]],
+    payment_methods: List[Dict[str, Any]],
+    total_baggages: int,
+    nonfree_baggages: int,
+    insurance: str,
+) -> dict:
+    """
+    Books a flight reservation with all necessary details.
+    Args:
+        user_id: The ID of the user.
+        origin: The origin city IATA code.
+        destination: The destination city IATA code.
+        flight_type: Type of flight, e.g., 'one_way' or 'round_trip'.
+        cabin: The cabin class, e.g., 'economy'.
+        flights: List of flight details.
+        passengers: List of passenger details.
+        payment_methods: List of payment methods to use.
+        total_baggages: Total number of bags.
+        nonfree_baggages: Number of non-free bags.
+        insurance: Whether to include insurance, 'yes' or 'no'.
+    Returns:
+        A dictionary confirming the new reservation.
+    """
+    pass
+
+
+def adk_calculate(expression: str) -> str:
+    """
+    Calculates the result of a mathematical expression.
+    Args:
+        expression: The mathematical expression to calculate, e.g., '250 * 3 + 50'.
+    Returns:
+        The result of the calculation.
+    """
+    pass
+
+
+def adk_get_user_details(user_id: str) -> dict:
+    """
+    Gets the details of a user, including their reservations.
+    Args:
+        user_id: The user ID.
+    Returns:
+        A dictionary with user details.
+    """
+    pass
+
+
+def adk_list_all_airports() -> List[dict]:
+    """
+    Returns a list of all available airports.
+    Returns:
+        A list of airport dictionaries.
+    """
+    pass
+
+
+def adk_search_onestop_flight(origin: str, destination: str, date: str) -> List[dict]:
+    """
+    Searches for one-stop flights between two cities on a specific date.
+    Args:
+        origin: The origin city airport IATA code.
+        destination: The destination city airport IATA code.
+        date: The date of the flight in 'YYYY-MM-DD' format.
+    Returns:
+        A list of flight options, where each option is a pair of flights.
+    """
+    pass
+
+
+def adk_send_certificate(user_id: str, amount: int) -> str:
+    """
+    Sends a certificate of a specified amount to a user.
+    Args:
+        user_id: The user ID to send the certificate to.
+        amount: The amount of the certificate.
+    Returns:
+        A string confirming the certificate was sent.
+    """
+    pass
+
+
+def adk_update_reservation_baggages(
+    reservation_id: str, total_baggages: int, nonfree_baggages: int, payment_id: str
+) -> dict:
+    """
+    Updates the baggage information for a reservation.
+    Args:
+        reservation_id: The ID of the reservation to update.
+        total_baggages: The new total number of bags.
+        nonfree_baggages: The new number of non-free bags.
+        payment_id: The ID of the payment method to use for any charges.
+    Returns:
+        The updated reservation details.
+    """
+    pass
+
+
+def adk_update_reservation_flights(
+    reservation_id: str, cabin: str, flights: List[Dict[str, Any]], payment_id: str
+) -> dict:
+    """
+    Updates the flight information for a reservation.
+    Args:
+        reservation_id: The ID of the reservation to update.
+        cabin: The new cabin class for the reservation.
+        flights: The new list of flights for the entire reservation.
+        payment_id: The ID of the payment method to use for any charges or refunds.
+    Returns:
+        The updated reservation details.
+    """
+    pass
+
+
+def adk_update_reservation_passengers(
+    reservation_id: str, passengers: List[Dict[str, Any]]
+) -> dict:
+    """
+    Updates the passenger information for a reservation.
+    Args:
+        reservation_id: The ID of the reservation to update.
+        passengers: The new list of passengers.
+    Returns:
+        The updated reservation details.
+    """
+    pass
+
+
+def adk_get_flight_status(flight_number: str, date: str) -> str:
+    """
+    Gets the status of a specific flight on a specific date.
+    Args:
+        flight_number: The flight number.
+        date: The date of the flight.
+    Returns:
+        The status of the flight (e.g., 'on time', 'delayed').
+    """
+    pass
+
+
+# This is the agent we will evaluate.
+root_agent = Agent(
+    name="adk_airline_agent",
+    model="gemini-2.5-flash",
+    description="An ADK agent for booking, finding, and managing flight reservations.",
+    instruction=(
+        "You are a task-oriented airline assistant. Your ONLY goal is to use the "
+        "provided tools to fulfill the user's request. "
+        "You MUST call a tool in your first turn if the user's request contains enough "
+        "information to do so. "
+        "Analyze the user's request and immediately call the appropriate tool to find, "
+        "get details for, or cancel a reservation. "
+        "NOTE: If the user wants to cancel, you must first check that all criteria for "
+        "cancellation are met! "
+        "In particular, check whether the reservation was made within the last 24 "
+        "hours. This is important!"
+    ),
+    tools=[
+        adk_find_flights,
+        adk_get_booking_details,
+        adk_cancel_reservation,
+        adk_transfer_to_human,
+        adk_book_reservation,
+        adk_calculate,
+        adk_get_user_details,
+        adk_list_all_airports,
+        adk_search_onestop_flight,
+        adk_send_certificate,
+        adk_update_reservation_baggages,
+        adk_update_reservation_flights,
+        adk_update_reservation_passengers,
+        adk_get_flight_status,
+    ],
+)
diff --git a/python/evaluation/tau2-adk-harness/tau2-bench b/python/evaluation/tau2-adk-harness/tau2-bench
new file mode 160000
index 000000000..db6ce6ad3
--- /dev/null
+++ b/python/evaluation/tau2-adk-harness/tau2-bench
@@ -0,0 +1 @@
+Subproject commit db6ce6ad3920e7bf8e193c1eeeb8c0c09e8e654c