[New Feature][WIP]Init EBBenchmark #304

Open · wants to merge 3 commits into develop
2 changes: 2 additions & 0 deletions erniebot-agent/benchmark/__init__.py
@@ -0,0 +1,2 @@
from .build_agent import AgentBuilder
from .build_tasks import TaskBuilder
55 changes: 55 additions & 0 deletions erniebot-agent/benchmark/build_agent.py
@@ -0,0 +1,55 @@
import os
from typing import Any, Dict

# Configure credentials and logging before erniebot_agent modules are imported;
# use setdefault so an existing token in the environment is not overwritten.
os.environ.setdefault("EB_AGENT_ACCESS_TOKEN", "")
os.environ.setdefault("EB_AGENT_LOGGING_LEVEL", "info")

from benchmark.schema import AgentArgs
from benchmark.utils import relative_import


class AgentBuilder(object):
"""The builder of the agent."""

def __init__(self, args: AgentArgs):
self.args = args
self.config_check()

self.model = self._build_model(args["model"], args["root_path"]["model"])
self.memory = self._build_memory(args["memory"], args["root_path"]["memory"])
self.tools = self._build_toolset(args["tools"], args["root_path"]["tools"]) if args["tools"] else []
self.agent = self._build_agent(args["agent"], args["root_path"]["agent"])

    def config_check(self):
        """Ensure that every required module has both a config entry and a root path."""
        required_modules = ["model", "memory", "tools", "agent"]
        for module_name in required_modules:
            if module_name not in self.args["root_path"]:
                raise ValueError(f"{module_name} module path is not specified in the root path.")
            if module_name not in self.args:
                raise ValueError(f"{module_name} config is not specified in the config file.")

def _build_model(self, model_args: Dict[str, Any], root_path):
"""Build the model."""
model = relative_import(root_path, model_args["name"])
return model(**model_args["kwargs"])

def _build_memory(self, memory_args: Dict[str, Any], root_path):
"""Build the memory."""
memory = relative_import(root_path, memory_args["name"])
return memory(**memory_args["kwargs"])

def _build_toolset(self, tools: Dict[str, Any], root_path):
"""Build the toolset."""
        # NOTE: Each tool is instantiated with the kwargs given in its config entry.
toolset = []
for tool_arg in tools:
tool = relative_import(root_path, tool_arg["name"])
toolset.append(tool(**tool_arg["kwargs"]))
return toolset

def _build_agent(self, agent_args, root_path):
"""Build the agent."""
        # NOTE: Other ways of passing parameters to the agent are not supported yet.
agent = relative_import(root_path, agent_args["name"])
return agent(llm=self.model, memory=self.memory, tools=self.tools, **agent_args["kwargs"])
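
For orientation, the builder is meant to be driven from a YAML config such as configs/agents/default_agent.yaml below; a minimal usage sketch, mirroring how run_benchmark.py uses it (the token value and the relative path are placeholders, not part of this PR):

import os

os.environ["EB_AGENT_ACCESS_TOKEN"] = "<your-access-token>"  # placeholder, required at runtime

import yaml

from benchmark.build_agent import AgentBuilder

# Load the default agent config shipped with this PR (path relative to erniebot-agent/).
with open("benchmark/configs/agents/default_agent.yaml", "r") as f:
    agent_config = yaml.safe_load(f)

# The builder wires up the model, memory, toolset and agent declared in the config.
agent = AgentBuilder(agent_config).agent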
22 changes: 22 additions & 0 deletions erniebot-agent/benchmark/build_tasks.py
@@ -0,0 +1,22 @@
from typing import Any, Dict

from benchmark.schema import Task


class TaskBuilder(object):
"""The builder of the task."""

def __init__(self, task_config: Dict[str, Any]):
self.task_config = task_config

def build_task(self) -> Task:
"""Build the task."""
return Task(
task_name=self.task_config["task_name"],
task_id=self.task_config["task_id"],
task_weight=self.task_config["task_weight"],
prompt=self.task_config["prompt"],
answer=self.task_config["answer"],
eval=self.task_config["eval"],
report=None,
)
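
A usage sketch for the builder; the dictionary mirrors the fields of the task YAML files under benchmark/tasks/ (e.g. instruction_following/1000.yaml further down), with the prompt truncated here:

from benchmark.build_tasks import TaskBuilder

task_config = {
    "task_name": "Instruction_following",
    "task_id": 1000,
    "task_weight": 1,
    "prompt": "Write a 300+ word summary of the wikipedia page ...",
    "answer": "",  # func-type tasks carry no reference answer
    "eval": {
        "eval_type": "func",
        "funcs": [{"name": "punctuation:no_comma", "kwargs": None}],
    },
}

task = TaskBuilder(task_config).build_task()
print(task.task_name, task.task_id)  # Instruction_following 1000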
20 changes: 20 additions & 0 deletions erniebot-agent/benchmark/configs/agents/default_agent.yaml
@@ -0,0 +1,20 @@
agent:
name: 'FunctionAgent'
kwargs: {}

model:
name: 'ERNIEBot'
kwargs:
model: 'ernie-3.5'

memory:
name: 'WholeMemory'
kwargs: {}

tools: []

root_path:
agent: 'erniebot_agent.agents'
memory: 'erniebot_agent.memory'
tools: 'erniebot_agent.tools'
model: 'erniebot_agent.chat_models'
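
The root_path entries above are resolved by benchmark.utils.relative_import, which is not included in this diff. Judging from how it is called (relative_import(root_path, name) returning a class to instantiate), it presumably boils down to something like the following sketch; this is an assumption about utils.py, not code from the PR:

import importlib


def relative_import(root_path: str, name: str):
    # Resolve a class by module path and attribute name, e.g.
    # relative_import("erniebot_agent.agents", "FunctionAgent") -> FunctionAgent.
    # NOTE: assumed implementation; the real helper lives in benchmark/utils.py.
    module = importlib.import_module(root_path)
    return getattr(module, name)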
541 changes: 541 additions & 0 deletions erniebot-agent/benchmark/data/instruction_following_input_data.jsonl

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions erniebot-agent/benchmark/data/transfer_data.py
@@ -0,0 +1,41 @@
import dataclasses
import json
import os
from typing import Dict, List, Optional, Union

# Resolve the data file relative to this script instead of a machine-specific absolute path.
data_path = os.path.join(os.path.dirname(__file__), "instruction_following_input_data.jsonl")


@dataclasses.dataclass
class InputExample:
key: int
instruction_id_list: List[str]
prompt: str
kwargs: List[Dict[str, Optional[Union[str, int]]]]


def read_prompt_list(input_jsonl_filename):
"""Read inputs from jsonl."""
inputs = []
with open(input_jsonl_filename, "r") as file:
for line in file:
            example = json.loads(line)
            inputs.append(
                InputExample(
                    key=example["key"],  # sequence number
instruction_id_list=example["instruction_id_list"],
prompt=example["prompt"],
kwargs=example["kwargs"],
)
)
return inputs


# build the data into the task format


if __name__ == "__main__":
inputs = read_prompt_list(data_path)
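
The trailing comment suggests the next step is converting each InputExample into the task YAML format used under benchmark/tasks/instruction_following/. A rough sketch of that conversion, with the field mapping inferred from 1000.yaml (the helper names and output layout are assumptions, not part of this PR):

import yaml


def example_to_task_config(example: InputExample) -> dict:
    # Map one IFEval-style example onto the task schema shown in 1000.yaml.
    funcs = [
        {"name": name, "kwargs": kwargs or None}
        for name, kwargs in zip(example.instruction_id_list, example.kwargs)
    ]
    return {
        "task_name": "Instruction_following",
        "task_id": example.key,
        "task_weight": 1,
        "prompt": example.prompt,
        "answer": "",  # func-type tasks carry no reference answer
        "eval": {"eval_type": "func", "funcs": funcs},
    }


def write_task_yaml(example: InputExample, out_dir: str = "benchmark/tasks/instruction_following"):
    with open(f"{out_dir}/{example.key}.yaml", "w") as f:
        yaml.safe_dump(example_to_task_config(example), f, sort_keys=False, allow_unicode=True)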
4 changes: 4 additions & 0 deletions erniebot-agent/benchmark/requirements.txt
@@ -0,0 +1,4 @@
absl-py
langdetect
nltk
immutabledict
95 changes: 95 additions & 0 deletions erniebot-agent/benchmark/run_benchmark.py
@@ -0,0 +1,95 @@
import argparse
import asyncio
import os
import sys
import time
from typing import List

import yaml

sys.path.append(".")

from benchmark.build_agent import AgentBuilder
from benchmark.build_tasks import TaskBuilder
from benchmark.schema import Report, Task
from benchmark.utils import recursively_search_yaml

from erniebot_agent.agents import Agent


class ReportAnalyzer:
    """
    Collect the per-task reports, print each one to the console,
    and write the full report to a timestamped file.
    """

def __init__(self, agent_name: str):
self.agent_name = agent_name

def init_reports(self, reports: List[Report]):
self.reports = reports

    def write_full_report(self):
        """Write the full report to a timestamped file under result/<agent_name>/."""
        time_stamp = time.strftime("%Y%m%d-%H%M%S", time.localtime())
        output_dir = "result/{}".format(self.agent_name)
        os.makedirs(output_dir, exist_ok=True)
        output_path = "{}/full_report_{}.txt".format(output_dir, time_stamp)

with open(output_path, "w") as f:
for report in self.reports:
print(report.json())
f.write(report.json())
f.write("\n")


class EBBenchmark:
def __init__(self, args):
self.agent_config_path = args.agent_config_path
self.tasks_config_dir = args.tasks_config_dir
self.agent = self._build_agent_from_config(self.agent_config_path)
self.agent_name = ".".join(
[self.agent_config["root_path"]["agent"], self.agent_config["agent"]["name"]]
)
self.tasks = self._build_tasks()
self.analyzer = ReportAnalyzer(self.agent_name)

def _build_agent_from_config(self, agent_config_path: str) -> Agent:
with open(agent_config_path, "r") as f:
self.agent_config = yaml.load(f, Loader=yaml.FullLoader)
return AgentBuilder(self.agent_config).agent

def _build_tasks(self) -> List[Task]:
task_yaml_paths = recursively_search_yaml(self.tasks_config_dir)
tasks = []
for task_yaml_path in task_yaml_paths:
with open(task_yaml_path, "r") as f:
task_config = yaml.load(f, Loader=yaml.FullLoader)
task = TaskBuilder(task_config).build_task()
tasks.append(task)
print(f"Task {task.task_name}_{task.task_id} loaded.")

return tasks

    def run(self) -> List[Report]:
        reports: List[Report] = []
        for task in self.tasks:
            # The agent must be fully initialized before the task is run.
            asyncio.run(task.run(self.agent))
            reports.append(task.report)
            print(f"Task {task.task_name}_{task.task_id} finished.")

        self.analyzer.init_reports(reports=reports)
        self.analyzer.write_full_report()
        return reports


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--agent_config_path", type=str, default="./benchmark/configs/agents/default_agent.yaml"
)
parser.add_argument("--tasks_config_dir", type=str, default="./benchmark/tasks")
args = parser.parse_args()

benchmark = EBBenchmark(args)
reports = benchmark.run()
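
recursively_search_yaml also comes from benchmark/utils.py, which is not part of this diff. Given its use here (collecting every task YAML under a directory tree), it presumably amounts to something like this sketch (an assumption, not the actual helper):

import glob
import os


def recursively_search_yaml(config_dir: str):
    # Collect all .yaml files under config_dir, recursively.
    # NOTE: assumed implementation of the helper in benchmark/utils.py.
    pattern = os.path.join(config_dir, "**", "*.yaml")
    return sorted(glob.glob(pattern, recursive=True))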
112 changes: 112 additions & 0 deletions erniebot-agent/benchmark/schema.py
@@ -0,0 +1,112 @@
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from benchmark.tasks.task_funcs import TASK_DICT


@dataclass
class Report(object):
    """The report generated for each task."""

    task_name: str
    successful_score: float  # average score over the successful runs of the task
stability: float # the stability score of the task
latency: float # the average latency of the task
task_weight: float # the weight of the task when calculating the overall score

def json(self):
record_vars = ["task_name", "successful_score", "stability", "latency", "task_weight"]

return str({var: getattr(self, var) for var in record_vars})


@dataclass
class Task(object):
"""The task that will be executed by the agent."""

task_name: str
task_id: int # the id of the task in each task set
task_weight: float
prompt: str
answer: str
eval: Dict[str, Any]
report: Optional[Report]

    async def run(self, agent, stability_tries: int = 10):
        """Run the task `stability_tries` times with the agent and build its report."""
        successful_scores = []
        latencies = []
        correct_cnt = 0

        for _ in range(stability_tries):
            output, latency = await self.run_once(agent)
            successful_score = self.eval_task(output)  # ranges from 0 to 100

            if successful_score > 0:
                correct_cnt += 1
                latencies.append(latency)
                successful_scores.append(successful_score)

        # Scores and latencies are averaged over the successful runs only.
        if correct_cnt == 0:
            self._build_report(0.0, 0.0, 0.0)
        else:
            self._build_report(
                correct_cnt / stability_tries,
                sum(successful_scores) / correct_cnt,
                sum(latencies) / correct_cnt,
            )

async def run_once(self, agent):
"""Run the task once."""
start = time.time()
output = await agent.run(self.prompt)
latency = time.time() - start
return output, latency

    def eval_task(self, pred_output: str):
        """Evaluate the prediction and return a score between 0 and 100."""
        eval_type = self.eval["eval_type"]
        if eval_type == "exact":
            return (pred_output == self.answer) * 100
        elif eval_type == "rouge-L":
            # TODO: implement the ROUGE-L method
            raise NotImplementedError("The ROUGE-L eval method is not implemented yet.")
        elif eval_type == "LLM":
            # TODO: implement the LLM-as-judge method
            raise NotImplementedError("The LLM eval method is not implemented yet.")
        elif eval_type == "func":
            # Each registered checker is a class: instantiating it with its kwargs yields a
            # callable that judges whether the prediction follows the instruction.
            eval_funcs = self.eval["funcs"]
            following_res = {}
            for func in eval_funcs:
                func_name = func["name"]
                func_kwargs = func["kwargs"] or {}
                is_following = TASK_DICT[func_name](**func_kwargs)(pred_output, self.answer)
                following_res[func_name] = is_following

            # Report the percentage of instructions followed as the task score.
            return sum(following_res.values()) / len(following_res) * 100
        else:
            raise NotImplementedError(
                "The eval method {} is not implemented. Only 'exact', 'rouge-L', "
                "'LLM' and 'func' are supported.".format(eval_type)
            )

    def _build_report(self, stability: float, successful_score: float, latency: float):
        self.report = Report(
            task_name=self.task_name,
            successful_score=successful_score,
            stability=stability,
            latency=latency,
            task_weight=self.task_weight,
        )


@dataclass
class AgentArgs(object):
"""The arguments of the agent."""

agent: Dict[str, Any]
model: Dict[str, Any]
memory: Dict[str, Any]
    tools: List[Dict[str, Any]]
    root_path: Dict[str, str]
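
To make the aggregation in Task.run concrete: with the default ten runs, stability is the fraction of runs that score above zero, while the score and latency averages are taken over those successful runs only. A small illustration with made-up numbers:

scores = [80, 0, 100, 60, 0, 90, 70, 100, 85, 75]  # hypothetical per-run scores, 0-100
run_latencies = [1.2, 1.0, 1.4, 1.1, 0.9, 1.3, 1.2, 1.5, 1.1, 1.0]  # seconds

successful = [(s, t) for s, t in zip(scores, run_latencies) if s > 0]
stability = len(successful) / len(scores)                           # 8 / 10 = 0.8
successful_score = sum(s for s, _ in successful) / len(successful)  # 660 / 8 = 82.5
latency = sum(t for _, t in successful) / len(successful)           # 9.8 / 8 = 1.225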
Empty file.
18 changes: 18 additions & 0 deletions erniebot-agent/benchmark/tasks/instruction_following/1000.yaml
@@ -0,0 +1,18 @@
task_name: "Instruction_following"
task_id: 1000
task_weight: 1
prompt: "Write a 300+ word summary of the wikipedia page \"https://en.wikipedia.org/wiki/Raymond_III_Count_of_Tripoli\". Do not use any commas and highlight at least 3 sections that has titles in markdown format for example *highlighted section part 1* *highlighted section part 2* *highlighted section part 3*."
answer: "" # func type will not have answer
eval:
eval_type: "func"
funcs:
    - name: "punctuation:no_comma" # funcs can be registered
kwargs: Null
- name: "detectable_format:number_highlighted_sections"
kwargs:
num_highlights: 3

    - name: "length_constraints:number_words"
      kwargs:
        relation: 'at least'
        num_words: 300
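
The funcs above are looked up in TASK_DICT from benchmark/tasks/task_funcs.py, which is not included in this diff. From the call pattern in schema.py (TASK_DICT[name](**kwargs)(pred_output, answer)), each entry appears to be a checker class whose instances judge one instruction; a hypothetical registration for the comma check might look like this (names and logic are illustrative assumptions only):

from typing import Callable, Dict

TASK_DICT: Dict[str, Callable] = {}


def register(name: str):
    # Register a checker class under its instruction id.
    def wrapper(checker_cls):
        TASK_DICT[name] = checker_cls
        return checker_cls
    return wrapper


@register("punctuation:no_comma")
class NoCommaChecker:
    # True when the prediction contains no commas; the reference answer is ignored.
    def __call__(self, pred_output: str, answer: str) -> bool:
        return "," not in pred_output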