[New Feature][WIP]Init EBBenchmark #304

Open · wants to merge 3 commits into develop
2 changes: 2 additions & 0 deletions erniebot-agent/benchmark/__init__.py
@@ -0,0 +1,2 @@
from .build_agent import AgentBuilder
from .build_tasks import TaskBuilder
55 changes: 55 additions & 0 deletions erniebot-agent/benchmark/build_agent.py
@@ -0,0 +1,55 @@
import os
from typing import Any, Dict

# Configure credentials and logging before erniebot_agent modules are imported;
# use setdefault so an existing token in the environment is not overwritten.
os.environ.setdefault("EB_AGENT_ACCESS_TOKEN", "")
os.environ.setdefault("EB_AGENT_LOGGING_LEVEL", "info")

from benchmark.schema import AgentArgs
from benchmark.utils import relative_import


class AgentBuilder(object):
"""The builder of the agent."""

def __init__(self, args: AgentArgs):
self.args = args
self.config_check()

self.model = self._build_model(args["model"], args["root_path"]["model"])
self.memory = self._build_memory(args["memory"], args["root_path"]["memory"])
self.tools = self._build_toolset(args["tools"], args["root_path"]["tools"]) if args["tools"] else []
self.agent = self._build_agent(args["agent"], args["root_path"]["agent"])

    def config_check(self):
        """Ensure that every required module has both a config entry and a root path."""
        required_modules = ["model", "memory", "tools", "agent"]
        for module_name in required_modules:
            if module_name not in self.args["root_path"]:
                raise ValueError(f"{module_name} module path is not specified in the root path.")
            if module_name not in self.args:
                raise ValueError(f"{module_name} config is not specified in the config file.")

def _build_model(self, model_args: Dict[str, Any], root_path):
"""Build the model."""
model = relative_import(root_path, model_args["name"])
return model(**model_args["kwargs"])

def _build_memory(self, memory_args: Dict[str, Any], root_path):
"""Build the memory."""
memory = relative_import(root_path, memory_args["name"])
return memory(**memory_args["kwargs"])

def _build_toolset(self, tools: Dict[str, Any], root_path):
"""Build the toolset."""
        # NOTE: Each tool is instantiated with the kwargs given in its config entry.
toolset = []
for tool_arg in tools:
tool = relative_import(root_path, tool_arg["name"])
toolset.append(tool(**tool_arg["kwargs"]))
return toolset

def _build_agent(self, agent_args, root_path):
"""Build the agent."""
        # NOTE: Other ways of passing parameters to the agent are not supported yet.
agent = relative_import(root_path, agent_args["name"])
return agent(llm=self.model, memory=self.memory, tools=self.tools, **agent_args["kwargs"])
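
For orientation, the builder is meant to be driven from a YAML config such as configs/agents/default_agent.yaml below; a minimal usage sketch, mirroring how run_benchmark.py uses it (the token value and the relative path are placeholders, not part of this PR):

import os

os.environ["EB_AGENT_ACCESS_TOKEN"] = "<your-access-token>"  # placeholder, required at runtime

import yaml

from benchmark.build_agent import AgentBuilder

# Load the default agent config shipped with this PR (path relative to erniebot-agent/).
with open("benchmark/configs/agents/default_agent.yaml", "r") as f:
    agent_config = yaml.safe_load(f)

# The builder wires up the model, memory, toolset and agent declared in the config.
agent = AgentBuilder(agent_config).agent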
22 changes: 22 additions & 0 deletions erniebot-agent/benchmark/build_tasks.py
@@ -0,0 +1,22 @@
from typing import Any, Dict

from benchmark.schema import Task


class TaskBuilder(object):
"""The builder of the task."""

def __init__(self, task_config: Dict[str, Any]):
self.task_config = task_config

def build_task(self) -> Task:
"""Build the task."""
return Task(
task_name=self.task_config["task_name"],
task_id=self.task_config["task_id"],
task_weight=self.task_config["task_weight"],
prompt=self.task_config["prompt"],
answer=self.task_config["answer"],
eval=self.task_config["eval"],
report=None,
)
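
A usage sketch for the builder; the dictionary mirrors the fields of the task YAML files under benchmark/tasks/ (e.g. instruction_following/1000.yaml further down), with the prompt truncated here:

from benchmark.build_tasks import TaskBuilder

task_config = {
    "task_name": "Instruction_following",
    "task_id": 1000,
    "task_weight": 1,
    "prompt": "Write a 300+ word summary of the wikipedia page ...",
    "answer": "",  # func-type tasks carry no reference answer
    "eval": {
        "eval_type": "func",
        "funcs": [{"name": "punctuation:no_comma", "kwargs": None}],
    },
}

task = TaskBuilder(task_config).build_task()
print(task.task_name, task.task_id)  # Instruction_following 1000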
20 changes: 20 additions & 0 deletions erniebot-agent/benchmark/configs/agents/default_agent.yaml
@@ -0,0 +1,20 @@
agent:
name: 'FunctionAgent'
kwargs: {}

model:
name: 'ERNIEBot'
kwargs:
model: 'ernie-3.5'

memory:
name: 'WholeMemory'
kwargs: {}

tools: []

root_path:
agent: 'erniebot_agent.agents'
memory: 'erniebot_agent.memory'
tools: 'erniebot_agent.tools'
model: 'erniebot_agent.chat_models'
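
The root_path entries above are resolved by benchmark.utils.relative_import, which is not included in this diff. Judging from how it is called (relative_import(root_path, name) returning a class to instantiate), it presumably boils down to something like the following sketch; this is an assumption about utils.py, not code from the PR:

import importlib


def relative_import(root_path: str, name: str):
    # Resolve a class by module path and attribute name, e.g.
    # relative_import("erniebot_agent.agents", "FunctionAgent") -> FunctionAgent.
    # NOTE: assumed implementation; the real helper lives in benchmark/utils.py.
    module = importlib.import_module(root_path)
    return getattr(module, name)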
541 changes: 541 additions & 0 deletions erniebot-agent/benchmark/data/instruction_following_input_data.jsonl

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions erniebot-agent/benchmark/data/transfer_data.py
@@ -0,0 +1,41 @@
import dataclasses
import json
import os
from typing import Dict, List, Optional, Union

# Resolve the data file relative to this script instead of a machine-specific absolute path.
data_path = os.path.join(os.path.dirname(__file__), "instruction_following_input_data.jsonl")


@dataclasses.dataclass
class InputExample:
key: int
instruction_id_list: List[str]
prompt: str
kwargs: List[Dict[str, Optional[Union[str, int]]]]


def read_prompt_list(input_jsonl_filename):
"""Read inputs from jsonl."""
inputs = []
with open(input_jsonl_filename, "r") as file:
for line in file:
            example = json.loads(line)
            inputs.append(
                InputExample(
                    key=example["key"],  # sequence number
instruction_id_list=example["instruction_id_list"],
prompt=example["prompt"],
kwargs=example["kwargs"],
)
)
return inputs


# build the data into the task format


if __name__ == "__main__":
inputs = read_prompt_list(data_path)
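
The trailing comment suggests the next step is converting each InputExample into the task YAML format used under benchmark/tasks/instruction_following/. A rough sketch of that conversion, with the field mapping inferred from 1000.yaml (the helper names and output layout are assumptions, not part of this PR):

import yaml


def example_to_task_config(example: InputExample) -> dict:
    # Map one IFEval-style example onto the task schema shown in 1000.yaml.
    funcs = [
        {"name": name, "kwargs": kwargs or None}
        for name, kwargs in zip(example.instruction_id_list, example.kwargs)
    ]
    return {
        "task_name": "Instruction_following",
        "task_id": example.key,
        "task_weight": 1,
        "prompt": example.prompt,
        "answer": "",  # func-type tasks carry no reference answer
        "eval": {"eval_type": "func", "funcs": funcs},
    }


def write_task_yaml(example: InputExample, out_dir: str = "benchmark/tasks/instruction_following"):
    with open(f"{out_dir}/{example.key}.yaml", "w") as f:
        yaml.safe_dump(example_to_task_config(example), f, sort_keys=False, allow_unicode=True)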
4 changes: 4 additions & 0 deletions erniebot-agent/benchmark/requirements.txt
@@ -0,0 +1,4 @@
absl-py
langdetect
nltk
immutabledict
95 changes: 95 additions & 0 deletions erniebot-agent/benchmark/run_benchmark.py
@@ -0,0 +1,95 @@
import argparse
import asyncio
import os
import sys
import time
from typing import List

import yaml

sys.path.append(".")

from benchmark.build_agent import AgentBuilder
from benchmark.build_tasks import TaskBuilder
from benchmark.schema import Report, Task
from benchmark.utils import recursively_search_yaml

from erniebot_agent.agents import Agent


class ReportAnalyzer:
    """
    Collect the per-task reports, print each one to the console,
    and write the full report to a timestamped file.
    """

def __init__(self, agent_name: str):
self.agent_name = agent_name

def init_reports(self, reports: List[Report]):
self.reports = reports

    def write_full_report(self):
        """Write the full report to a timestamped file under result/<agent_name>/."""
        time_stamp = time.strftime("%Y%m%d-%H%M%S", time.localtime())
        output_dir = "result/{}".format(self.agent_name)
        os.makedirs(output_dir, exist_ok=True)
        output_path = "{}/full_report_{}.txt".format(output_dir, time_stamp)

with open(output_path, "w") as f:
for report in self.reports:
print(report.json())
f.write(report.json())
f.write("\n")


class EBBenchmark:
def __init__(self, args):
self.agent_config_path = args.agent_config_path
self.tasks_config_dir = args.tasks_config_dir
self.agent = self._build_agent_from_config(self.agent_config_path)
self.agent_name = ".".join(
[self.agent_config["root_path"]["agent"], self.agent_config["agent"]["name"]]
)
self.tasks = self._build_tasks()
self.analyzer = ReportAnalyzer(self.agent_name)

def _build_agent_from_config(self, agent_config_path: str) -> Agent:
with open(agent_config_path, "r") as f:
self.agent_config = yaml.load(f, Loader=yaml.FullLoader)
return AgentBuilder(self.agent_config).agent

def _build_tasks(self) -> List[Task]:
task_yaml_paths = recursively_search_yaml(self.tasks_config_dir)
tasks = []
for task_yaml_path in task_yaml_paths:
with open(task_yaml_path, "r") as f:
task_config = yaml.load(f, Loader=yaml.FullLoader)
task = TaskBuilder(task_config).build_task()
tasks.append(task)
print(f"Task {task.task_name}_{task.task_id} loaded.")

return tasks

    def run(self) -> List[Report]:
        reports: List[Report] = []
        for task in self.tasks:
            # The agent must be fully initialized before the task is run.
            asyncio.run(task.run(self.agent))
            reports.append(task.report)
            print(f"Task {task.task_name}_{task.task_id} finished.")

        self.analyzer.init_reports(reports=reports)
        self.analyzer.write_full_report()
        return reports


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--agent_config_path", type=str, default="./benchmark/configs/agents/default_agent.yaml"
)
parser.add_argument("--tasks_config_dir", type=str, default="./benchmark/tasks")
args = parser.parse_args()

benchmark = EBBenchmark(args)
reports = benchmark.run()
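
recursively_search_yaml also comes from benchmark/utils.py, which is not part of this diff. Given its use here (collecting every task YAML under a directory tree), it presumably amounts to something like this sketch (an assumption, not the actual helper):

import glob
import os


def recursively_search_yaml(config_dir: str):
    # Collect all .yaml files under config_dir, recursively.
    # NOTE: assumed implementation of the helper in benchmark/utils.py.
    pattern = os.path.join(config_dir, "**", "*.yaml")
    return sorted(glob.glob(pattern, recursive=True))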
112 changes: 112 additions & 0 deletions erniebot-agent/benchmark/schema.py
@@ -0,0 +1,112 @@
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from benchmark.tasks.task_funcs import TASK_DICT


@dataclass
class Report(object):
    """The report generated for each task."""

    task_name: str
    successful_score: float  # average score over the successful runs of the task
stability: float # the stability score of the task
latency: float # the average latency of the task
task_weight: float # the weight of the task when calculating the overall score

def json(self):
record_vars = ["task_name", "successful_score", "stability", "latency", "task_weight"]

return str({var: getattr(self, var) for var in record_vars})


@dataclass
class Task(object):
"""The task that will be executed by the agent."""

task_name: str
task_id: int # the id of the task in each task set
task_weight: float
prompt: str
answer: str
eval: Dict[str, Any]
report: Optional[Report]

    async def run(self, agent, stability_tries: int = 10):
        """Run the task `stability_tries` times with the agent and build its report."""
        successful_scores = []
        latencies = []
        correct_cnt = 0

        for _ in range(stability_tries):
            output, latency = await self.run_once(agent)
            successful_score = self.eval_task(output)  # ranges from 0 to 100

            if successful_score > 0:
                correct_cnt += 1
                latencies.append(latency)
                successful_scores.append(successful_score)

        # Scores and latencies are averaged over the successful runs only.
        if correct_cnt == 0:
            self._build_report(0.0, 0.0, 0.0)
        else:
            self._build_report(
                correct_cnt / stability_tries,
                sum(successful_scores) / correct_cnt,
                sum(latencies) / correct_cnt,
            )

async def run_once(self, agent):
"""Run the task once."""
start = time.time()
output = await agent.run(self.prompt)
latency = time.time() - start
return output, latency

    def eval_task(self, pred_output: str):
        """Evaluate the prediction and return a score between 0 and 100."""
        eval_type = self.eval["eval_type"]
        if eval_type == "exact":
            return (pred_output == self.answer) * 100
        elif eval_type == "rouge-L":
            # TODO: implement the ROUGE-L method
            raise NotImplementedError("The ROUGE-L eval method is not implemented yet.")
        elif eval_type == "LLM":
            # TODO: implement the LLM-as-judge method
            raise NotImplementedError("The LLM eval method is not implemented yet.")
        elif eval_type == "func":
            # Each registered checker is a class: instantiating it with its kwargs yields a
            # callable that judges whether the prediction follows the instruction.
            eval_funcs = self.eval["funcs"]
            following_res = {}
            for func in eval_funcs:
                func_name = func["name"]
                func_kwargs = func["kwargs"] or {}
                is_following = TASK_DICT[func_name](**func_kwargs)(pred_output, self.answer)
                following_res[func_name] = is_following

            # Report the percentage of instructions followed as the task score.
            return sum(following_res.values()) / len(following_res) * 100
        else:
            raise NotImplementedError(
                "The eval method {} is not implemented. Only 'exact', 'rouge-L', "
                "'LLM' and 'func' are supported.".format(eval_type)
            )

    def _build_report(self, stability: float, successful_score: float, latency: float):
        self.report = Report(
            task_name=self.task_name,
            successful_score=successful_score,
            stability=stability,
            latency=latency,
            task_weight=self.task_weight,
        )


@dataclass
class AgentArgs(object):
"""The arguments of the agent."""

agent: Dict[str, Any]
model: Dict[str, Any]
memory: Dict[str, Any]
    tools: List[Dict[str, Any]]
    root_path: Dict[str, str]
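
To make the aggregation in Task.run concrete: with the default ten runs, stability is the fraction of runs that score above zero, while the score and latency averages are taken over those successful runs only. A small illustration with made-up numbers:

scores = [80, 0, 100, 60, 0, 90, 70, 100, 85, 75]  # hypothetical per-run scores, 0-100
run_latencies = [1.2, 1.0, 1.4, 1.1, 0.9, 1.3, 1.2, 1.5, 1.1, 1.0]  # seconds

successful = [(s, t) for s, t in zip(scores, run_latencies) if s > 0]
stability = len(successful) / len(scores)                           # 8 / 10 = 0.8
successful_score = sum(s for s, _ in successful) / len(successful)  # 660 / 8 = 82.5
latency = sum(t for _, t in successful) / len(successful)           # 9.8 / 8 = 1.225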
Empty file.
18 changes: 18 additions & 0 deletions erniebot-agent/benchmark/tasks/instruction_following/1000.yaml
@@ -0,0 +1,18 @@
task_name: "Instruction_following"
task_id: 1000
task_weight: 1
prompt: "Write a 300+ word summary of the wikipedia page \"https://en.wikipedia.org/wiki/Raymond_III_Count_of_Tripoli\". Do not use any commas and highlight at least 3 sections that has titles in markdown format for example *highlighted section part 1* *highlighted section part 2* *highlighted section part 3*."
answer: "" # func type will not have answer
eval:
eval_type: "func"
funcs:
    - name: "punctuation:no_comma" # funcs can be registered
kwargs: Null
- name: "detectable_format:number_highlighted_sections"
kwargs:
num_highlights: 3

    - name: "length_constraints:number_words"
      kwargs:
        relation: 'at least'
        num_words: 300
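
The funcs above are looked up in TASK_DICT from benchmark/tasks/task_funcs.py, which is not included in this diff. From the call pattern in schema.py (TASK_DICT[name](**kwargs)(pred_output, answer)), each entry appears to be a checker class whose instances judge one instruction; a hypothetical registration for the comma check might look like this (names and logic are illustrative assumptions only):

from typing import Callable, Dict

TASK_DICT: Dict[str, Callable] = {}


def register(name: str):
    # Register a checker class under its instruction id.
    def wrapper(checker_cls):
        TASK_DICT[name] = checker_cls
        return checker_cls
    return wrapper


@register("punctuation:no_comma")
class NoCommaChecker:
    # True when the prediction contains no commas; the reference answer is ignored.
    def __call__(self, pred_output: str, answer: str) -> bool:
        return "," not in pred_output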