Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 43 additions & 25 deletions vlmeval/dataset/EgoExoBench/egoexobench.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
load, md5)
from vlmeval.smp.file import LMUDataRoot
from ..utils import DEBUG_MESSAGE, build_judge
from ..utils.judge_cache import (get_judge_cache_file, get_judge_detail_file, get_judge_score_file,
has_judge_failure, load_judge_cache)
from ..video_base import VideoBaseDataset
from .utils import Stack, ToTorchFormatTensor

Expand Down Expand Up @@ -255,40 +257,55 @@ def evaluate(self, eval_file, **judge_kwargs):
assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \
'data file should be an supported format (xlsx/json/tsv) file'

tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json')
score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
judge_name = judge_kwargs.get('model', 'exact_matching')
tmp_file = get_judge_cache_file(eval_file, 'extract', judge_name)
legacy_tmp_file = get_intermediate_file_path(eval_file, '_tmp', 'pkl')
detail_file = get_judge_detail_file(eval_file, 'extract', judge_name)
score_file = get_judge_score_file(eval_file, judge_name, 'json')

if not osp.exists(score_file):
model = judge_kwargs.get('model', 'exact_matching')

if model == 'exact_matching':
model = None
else:
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
res = {} if not osp.exists(tmp_file) else load(tmp_file)
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
if not osp.exists(detail_file):
res = load_judge_cache(tmp_file, legacy_files=[legacy_tmp_file])

data = load(eval_file)
data_un = data[~pd.isna(data['prediction'])]
model = None
model_built = False

def get_model():
nonlocal model, model_built
if judge_name == 'exact_matching':
return None
if not model_built:
model = build_judge(**judge_kwargs)
if not model.working():
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
warnings.warn(DEBUG_MESSAGE)
model = None
model_built = True
return model

for idx in data['index']:
ans = data.loc[data['index'] == idx, 'answer'].values[0]
pred = data.loc[data['index'] == idx, 'prediction'].values[0]

if extract_characters_regex(pred) == '':
extract_pred = extract_option(
model,
data.loc[data['index'] == idx].to_dict(orient='records')[0],
'EgoExoBench_MCQ',
extract_pred = res.get(idx)
if has_judge_failure(extract_pred):
extract_pred = extract_option(
get_model(),
data.loc[data['index'] == idx].to_dict(orient='records')[0],
'EgoExoBench_MCQ',
)
res[idx] = extract_pred
dump(res, tmp_file)
data.loc[data['index'] == idx, 'judge_pred'] = extract_pred
data.loc[data['index'] == idx, 'score'] = (
-1 if extract_pred in ['Fail', ''] else int(extract_pred == ans)
)
data.loc[idx, 'score'] = int(extract_pred == ans)
else:
data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)
extract_pred = extract_characters_regex(pred)
data.loc[data['index'] == idx, 'judge_pred'] = extract_pred
data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)

rejected = [x for x in data['score'] if x == -1]

Expand All @@ -298,8 +315,9 @@ def evaluate(self, eval_file, **judge_kwargs):
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
)

dump(data, score_file)
dump(data, detail_file)

rating = get_dimension_rating(score_file)
dump(rating, tgt_file)
rating = load(score_file) if osp.exists(score_file) else get_dimension_rating(detail_file)
if not osp.exists(score_file):
dump(rating, score_file)
return rating
46 changes: 23 additions & 23 deletions vlmeval/dataset/dude.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@

import pandas as pd
from PIL import Image
from tqdm import tqdm

from vlmeval.dataset.utils.judge_cache import (get_judge_cache_file, get_judge_detail_file,
get_judge_score_file, load_judge_cache,
run_cached_tasks)
from vlmeval.smp import (LMUDataRoot, decode_base64_to_image_file, download_file, dump,
encode_image_to_base64, get_intermediate_file_path, get_logger, listinstr,
load, md5, read_ok, toliststr)
from .image_base import ImageBaseDataset
from .mmlongbench import MMLongBench_auxeval, anls_compute, concat_images
from .mmlongbench import MMLongBench_auxeval, MMLongBench_judge_failed, anls_compute, concat_images
from .utils.judge_util import build_judge

logger = get_logger(__name__)
Expand Down Expand Up @@ -176,36 +178,34 @@ def dump_image(self, origin_line):

@classmethod
def evaluate(self, eval_file, **judge_kwargs):
model = judge_kwargs['model']

storage = get_intermediate_file_path(eval_file, f'_{model}')
tmp_file = get_intermediate_file_path(eval_file, f'_{model}', 'pkl')
judge_name = judge_kwargs['model']
storage = get_judge_detail_file(eval_file, 'extract', judge_name)
tmp_file = get_judge_cache_file(eval_file, 'extract', judge_name)
legacy_tmp_file = get_intermediate_file_path(eval_file, f'_{judge_name}', 'pkl')

if osp.exists(storage):
logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
else:
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]

ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]

if len(indices):
new_results = list()
for model, line in tqdm(tups):
res = MMLongBench_auxeval(model, line)
new_results.append(res)
ans = load_judge_cache(tmp_file, legacy_files=[legacy_tmp_file])
pending = [idx for idx in indices if idx not in ans or MMLongBench_judge_failed(ans[idx])]
if pending:
model = build_judge(max_tokens=128, **judge_kwargs)
tups = [(model, line) for line in lines]
ans = run_cached_tasks(
MMLongBench_auxeval,
tups,
indices,
tmp_file,
legacy_files=[legacy_tmp_file],
failure_fn=MMLongBench_judge_failed,
)

log_map, res_map, pred_map = {}, {}, {}
all_inds = [line['index'] for line in lines]
for k, v in zip(all_inds, new_results):
for k, v in ans.items():
log_map[k] = v['log']
res_map[k] = v['res']
pred_map[k] = v['pred']
Expand All @@ -215,7 +215,7 @@ def evaluate(self, eval_file, **judge_kwargs):
dump(data, storage)

score = DUDE_acc(storage)
score_pth = get_intermediate_file_path(storage, '_score', 'csv')
score_pth = get_judge_score_file(eval_file, judge_name, 'csv')

dump(score, score_pth)
logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
Expand Down
Loading
Loading