Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
23 changes: 11 additions & 12 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@
import subprocess
import time
from collections import OrderedDict
from typing import List

import fire
import pandas as pd
from mmengine.config import Config


def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None):
def run_cmd(cmd_lines: list[str], log_path: str, cwd: str = None):
"""
Args:
cmd_lines: (list[str]): A command in multiple line style.
Expand Down Expand Up @@ -43,7 +42,7 @@ def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None):

if return_code != 0:
logging.error(f'Got shell abnormal return code={return_code}')
with open(log_path, 'r') as f:
with open(log_path) as f:
content = f.read()
logging.error(f'Log error message\n{content}')
return return_code
Expand All @@ -61,7 +60,7 @@ def add_summary(csv_path: str):
Args:
csv_path (str): Input csv file.
"""
with open(csv_path, 'r') as fr:
with open(csv_path) as fr:
lines = fr.readlines()
header = lines[0].strip().split(',')
n_col = len(header)
Expand All @@ -75,8 +74,8 @@ def add_summary(csv_path: str):
_append_summary('\n')


def evaluate(models: List[str],
datasets: List[str],
def evaluate(models: list[str],
datasets: list[str],
workspace: str,
evaluate_type: str,
max_num_workers: int = 8,
Expand Down Expand Up @@ -146,12 +145,12 @@ def evaluate(models: List[str],
# print csv_txt to screen
csv_txt = csv_file.replace('.csv', '.txt')
if os.path.exists(csv_txt):
with open(csv_txt, 'r') as f:
with open(csv_txt) as f:
print(f.read())

# parse evaluation results from csv file
model_results = OrderedDict()
with open(csv_file, 'r') as f:
with open(csv_file) as f:
lines = f.readlines()
for line in lines[1:]:
row = line.strip().split(',')
Expand All @@ -160,7 +159,7 @@ def evaluate(models: List[str],
model_results[row[0]] = row[-1]
crows_pairs_json = glob.glob(os.path.join(work_dir, '*/results/*/crows_pairs.json'), recursive=True)
if len(crows_pairs_json) == 1:
with open(crows_pairs_json[0], 'r') as f:
with open(crows_pairs_json[0]) as f:
acc = json.load(f)['accuracy']
acc = f'{float(acc):.2f}' # noqa E231
model_results['crows_pairs'] = acc
Expand Down Expand Up @@ -238,9 +237,9 @@ def generate_benchmark_report(report_path: str):

grouped_df = merged_df.groupby(merged_df.columns[0])
if 'generation' not in backend_subfolder:
average_values = grouped_df.pipe((lambda group: {
average_values = grouped_df.pipe(lambda group: {
'mean': group.mean(numeric_only=True).round(decimals=3)
}))['mean']
})['mean']
average_values.to_csv(average_csv_path, index=True)
avg_df = pd.read_csv(average_csv_path)
merged_df = pd.concat([merged_df, avg_df], ignore_index=True)
Expand All @@ -253,7 +252,7 @@ def generate_benchmark_report(report_path: str):


def generate_csv_from_profile_result(file_path: str, out_path: str):
with open(file_path, 'r') as f:
with open(file_path) as f:
data = f.readlines()
data = [json.loads(line) for line in data]

Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/doc_link_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def make_parser():


def analyze_doc(home, path):
print('analyze {}'.format(path))
print(f'analyze {path}')
problem_list = []
code_block = 0
with open(path) as f:
Expand Down
52 changes: 32 additions & 20 deletions .github/scripts/eval_base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,39 +11,51 @@
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.crowspairs.crowspairs_ppl import crowspairs_datasets # noqa: F401, E501
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets # noqa: F401, E501

# Corebench v1.7
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import (
GaokaoBench_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import (
hellaswag_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import (
humaneval_datasets as humaneval_v2_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import (
humaneval_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets # noqa: F401, E501
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import (
mathbench_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_few_shot_ppl import race_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
BoolQ_datasets # noqa: F401, E501
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import (
BoolQ_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
wikibench_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
winogrande_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import (
triviaqa_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import (
wikibench_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import (
winogrande_datasets, # noqa: F401, E501
)

# Summary Groups
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import (
mathbench_2024_summary_groups, # noqa: F401, E501
)
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups # noqa: F401, E501

Expand Down
152 changes: 92 additions & 60 deletions .github/scripts/eval_chat_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,88 +10,120 @@
from opencompass.configs.datasets.ceval.ceval_gen_2daf24 import ceval_datasets # noqa: F401, E501
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
GaokaoBench_datasets # noqa: F401, E501
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import (
GaokaoBench_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets # noqa: F401, E501
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import (
hellaswag_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets # noqa: F401, E501
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, E501
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets # noqa: F401, E501
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_a0fc46 import sanitized_mbpp_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets # noqa: F401, E501
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
mmlu_pro_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.nq.nq_open_1shot_gen_01cf41 import nq_datasets # noqa: F401, E501
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets # noqa: F401, E501
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \
triviaqa_datasets # noqa: F401, E501
from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
winogrande_datasets # noqa: F401, E501
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import (
triviaqa_datasets, # noqa: F401, E501
)
from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import (
winogrande_datasets, # noqa: F401, E501
)

# read models
from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
models as hf_baichuan2_chat_7b # noqa: F401, E501
from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import (
models as hf_baichuan2_chat_7b, # noqa: F401, E501
)
from opencompass.configs.models.gemma.hf_gemma2_9b_it import models as hf_gemma2_9b_it # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
models as hf_internlm2_5_20b_chat # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
models as hf_internlm2_chat_7b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \
models as hf_internlm2_chat_20b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
models as lmdeploy_internlm2_5_20b_chat # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
models as lmdeploy_internlm2_chat_7b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import \
models as lmdeploy_internlm2_chat_20b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
models as lmdeploy_internlm3_8b_instruct # noqa: F401, E501
from opencompass.configs.models.hf_internlm.lmdeploy_internlm_chat_7b import \
models as lmdeploy_internlm_chat_7b # noqa: F401, E501
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import (
models as hf_internlm2_5_7b_chat, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import (
models as hf_internlm2_5_20b_chat, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import (
models as hf_internlm2_chat_7b, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import (
models as hf_internlm2_chat_20b, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
models as lmdeploy_internlm2_5_20b_chat, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import (
models as lmdeploy_internlm2_chat_7b, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import (
models as lmdeploy_internlm2_chat_20b, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
models as lmdeploy_internlm3_8b_instruct, # noqa: F401, E501
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm_chat_7b import (
models as lmdeploy_internlm_chat_7b, # noqa: F401, E501
)
from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as hf_llama2_chat_7b # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
models as hf_llama3_1_8b_instruct # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
models as hf_llama_3_8b_instruct # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
models as lmdeploy_llama2_7b_chat # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct # noqa: F401, E501
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import \
models as hf_mistral_chat_7b # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
models as hf_mixtral_chat_8x7b # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
models as lmdeploy_qwen2_5_7b_instruct # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
models as lmdeploy_qwen2_5_32b_instruct # noqa: F401, E501
from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import (
models as hf_llama3_1_8b_instruct, # noqa: F401, E501
)
from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import (
models as hf_llama_3_8b_instruct, # noqa: F401, E501
)
from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import (
models as lmdeploy_llama2_7b_chat, # noqa: F401, E501
)
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct, # noqa: F401, E501
)
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
models as lmdeploy_llama3_8b_instruct, # noqa: F401, E501
)
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import (
models as hf_mistral_chat_7b, # noqa: F401, E501
)
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import (
models as hf_mixtral_chat_8x7b, # noqa: F401, E501
)
from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as hf_qwen1_5_chat_7b # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b_chat import \
models as hf_qwen1_5_moe_a2_7b_chat # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b_chat import (
models as hf_qwen1_5_moe_a2_7b_chat, # noqa: F401, E501
)
from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import models as hf_qwen2_7b_instruct # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as hf_qwen_chat_7b # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b_chat import \
models as lmdeploy_qwen1_5_7b_chat # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import \
models as lmdeploy_qwen_7b_chat # noqa: F401, E501
from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b_chat import (
models as lmdeploy_qwen1_5_7b_chat, # noqa: F401, E501
)
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as lmdeploy_qwen2_7b_instruct, # noqa: F401, E501
)
from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import (
models as lmdeploy_qwen_7b_chat, # noqa: F401, E501
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct, # noqa: F401, E501
)
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
models as lmdeploy_qwen2_5_32b_instruct, # noqa: F401, E501
)

# Summary Groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.ds1000 import ds1000_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.humanevalx import humanevalx_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import (
mathbench_2024_summary_groups, # noqa: F401, E501
)
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups # noqa: F401, E501
from opencompass.configs.summarizers.groups.scicode import scicode_summary_groups # noqa: F401, E501
Expand Down
Loading