diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 8f36fe1a7..110d588ff 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -88,6 +88,7 @@ from .mmmath import MMMath from .mmoral_opg_closed import MMOral_OPG_CLOSED from .mmoral_opg_open import MMOral_OPG_OPEN +from .mmoral_uni import MMOral_Uni from .mmrarebench import (MMRarebenchCrossmodal, MMRarebenchDiagnosis, MMRarebenchExamination, MMRarebenchTreatment) from .mmsafetybench import MMSafetyBenchDataset @@ -299,7 +300,7 @@ def evaluate(self, eval_file, **judge_kwargs): FoxBench, VTCBench, Asclepius, PlotQA, ChartX, ChartBench, ChartCapDataset, WorldVQA, PuzzleVQA, VisualPuzzles, # noqa: E501 MMSafetyBenchDataset, MSSBenchDataset, SIUODataset, SIUOGenDataset, SIUOMCQDataset, M3oralBenchDataset, # noqa: E501 Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN, MMOral_OPG_CLOSED, # noqa: E501 - SciDocBench, + MMOral_Uni, SciDocBench, MMRarebenchDiagnosis, MMRarebenchTreatment, MMRarebenchCrossmodal, MMRarebenchExamination, ] diff --git a/vlmeval/dataset/mmoral_uni.py b/vlmeval/dataset/mmoral_uni.py new file mode 100644 index 000000000..b7f6653c8 --- /dev/null +++ b/vlmeval/dataset/mmoral_uni.py @@ -0,0 +1,235 @@ +import os.path as osp +import warnings +from collections import defaultdict + +import pandas as pd + +from ..smp import dump, get_intermediate_file_path, load, toliststr +from ..utils import track_progress_rich +from .image_base import ImageBaseDataset +from .utils import DEBUG_MESSAGE, build_judge + + +class MMOral_Uni(ImageBaseDataset): + TYPE = 'VQA' + MODALITY = 'IMAGE' + + DATASET_URL = { + 'MMOral_Uni': + 'https://huggingface.co/datasets/OralGPT/MMOral-Omni-Bench/resolve/main/MMOral-Omni-Bench.tsv' # noqa: E501 + } + DATASET_MD5 = { + 'MMOral_Uni': '139e90f132f02e2a87d60eff1c24254a', + } + + def __init__(self, dataset='MMOral_Uni', skip_noimg=False, categories=None): + if dataset != 'MMOral_Uni': + warnings.warn( + 'To evaluate on MMOral_Uni, we suggest using `MMOral_Uni` as the dataset name.' + ) + self._categories_filter = categories if categories else None + super().__init__(dataset=dataset, skip_noimg=skip_noimg) + + def post_build(self, dataset): + if self._categories_filter: + keywords = [str(kw) for kw in self._categories_filter] + mask = self.data['category'].apply(lambda c: any(kw in str(c) for kw in keywords)) + self.data = self.data[mask].reset_index(drop=True) + + def build_prompt(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + + question = line['question'] + if 'image' in line and str(line['image']) == 'nan': + return [dict(type='text', value=question)] + + tgt_path = toliststr(line['image_path']) if self.meta_only else self.dump_image(line) + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=question)) + return msgs + + @staticmethod + def _build_judge_prompt(line): + question = line['question'] + gt = str(line['answer']) + prediction = str(line['prediction']) + prompt = ("You are an expert judge for an oral and dental multimodal imaging " + "benchmark. Strictly evaluate how correct the prediction is compared " + "with the ground truth.\n\n" + "Only output one numeric score: 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, " + "0.6, 0.7, 0.8, 0.9, or 1.0.\n\n" + "Evaluation rules:\n" + "- Judge clinical semantic equivalence, not wording.\n" + "- Treat wrong disease, tooth, side, stage, grade, or severity as " + "major errors.\n" + "- Accept clinically equivalent synonyms and equivalent tooth " + "numbering systems.\n" + "- Minor omissions of coordinates or tooth numbers may receive " + "partial credit when the clinical meaning remains correct.\n" + "- Penalize extra statements that contradict the ground truth or " + "introduce clinically false information.\n" + "- Refusals, generic non-answers, and fundamentally contradictory " + "answers should receive 0.0.\n\n" + "Scoring guide:\n" + "- 1.0: Fully correct, with no clinically material omissions or " + "extra wrong claims.\n" + "- 0.7-0.9: Mostly correct, with only minor omissions or " + "clinically insignificant inaccuracies.\n" + "- 0.3-0.6: Partially correct, but missing a core aspect or " + "including a major error.\n" + "- 0.1-0.2: Largely incorrect, with minimal overlap.\n" + "- 0.0: Totally incorrect.\n\n" + "Question | Ground truth | Prediction | Correctness\n" + "--- | --- | --- | ---") + return prompt + '\n' + ' | '.join( + [question, gt.replace('', ' ').replace('', ' '), prediction, '']) + + @staticmethod + def _auxeval(model, line): + + def float_cvt(s): + try: + return float(s) + except ValueError: + return None + + prompt = MMOral_Uni._build_judge_prompt(line) + log = '' + for i in range(5): + output = model.generate(prompt, temperature=i * 0.5) + score = float_cvt(output) + if score is None: + log += f'Try {i}: output is {output}, failed to parse.\n' + elif score < 0 or score > 1: + log += f'Try {i}: output is {output}, invalid score: {score}.\n' + else: + log += 'Succeed' + return dict(log=log, score=score) + log += 'All 5 retries failed.\n' + return dict(log=log, score=0.0) + + @staticmethod + def _acc(result_file): + data = load(result_file) + tot = defaultdict(lambda: 0) + score = defaultdict(lambda: 0) + cate2_list = [] + coarse_categories = [ + 'TP', + 'Endodontics', + 'Implant Dentistry', + 'Periodontics', + 'II_Loc', + 'II_Dx-I', + 'Orthodontics', + 'Cancer', + 'Gingivitis', + 'Defective Dentition', + 'Normality', + 'Tooth Discoloration', + 'Ulcer', + 'Caries', + 'Calculus', + 'II_Dx-R', + 'Fenestration and Dehiscence', + 'Malocclusion Issues Assessment', + 'PA', + 'Impacted Tooth', + 'Pulpitis', + 'Periodontitis', + 'Apical Periodontitis', + 'Mixed Dentition', + 'Bone Loss', + 'Root Canal Treatment', + 'Crown', + 'Restoration', + 'CE', + 'PI', + 'Leukoplakia with Dysplasia', + 'Leukoplakia without Dysplasia', + 'Oral Squamous Cell Carcinoma', + 'Oral Submucous Fibrosis', + 'IV', + ] + + for i in range(len(data)): + item = data.iloc[i] + cate = str(item['category']) + cate2 = cate.replace(',', '_') + if cate2 not in cate2_list: + cate2_list.append(cate2) + grade = float(item['score']) + + for capa in coarse_categories: + if capa in cate: + tot[capa] += 1 + score[capa] += grade + tot['Overall'] += 1 + tot[cate2] += 1 + score['Overall'] += grade + score[cate2] += grade + + res = defaultdict(list) + res2 = defaultdict(list) + for k in coarse_categories + ['Overall']: + res['Category'].append(k) + res['tot'].append(tot[k]) + res['acc'].append(score[k] / tot[k] * 100 if tot[k] else 0) + for v in cate2_list + ['Overall']: + res2['Category'].append(v) + res2['tot'].append(tot[v]) + res2['acc'].append(score[v] / tot[v] * 100 if tot[v] else 0) + return pd.DataFrame(res), pd.DataFrame(res2) + + @classmethod + def evaluate(cls, eval_file, **judge_kwargs): + judge_model_name = judge_kwargs.pop('model', 'gpt-5.4-mini') + storage = get_intermediate_file_path(eval_file, f'_{judge_model_name}') + tmp_file = get_intermediate_file_path(eval_file, f'_{judge_model_name}', 'pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + data = load(eval_file) + model = build_judge(model=judge_model_name, max_tokens=16384, **judge_kwargs) + assert model.working(), ( + 'MMOral_Uni evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE + ) + + lines = [data.iloc[i] for i in range(len(data))] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = load(tmp_file) if osp.exists(tmp_file) else {} + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + cls._auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score'] + + data['score'] = [ans[idx]['score'] for idx in data['index']] + data['log'] = [ans[idx]['log'] for idx in data['index']] + dump(data, storage) + + score, score_fine = cls._acc(storage) + score_pth = get_intermediate_file_path(storage, '_score', 'csv') + score_fine_pth = get_intermediate_file_path(storage, '_score_fine', 'csv') + dump(score, score_pth) + dump(score_fine, score_fine_pth) + return score