@@ -50,8 +50,10 @@ def load_evaluation_data(result_dir: str, model: str, task_dirs: List[str]) -> d
         if metric not in eval_mm.ScorerRegistry.get_metric_list():
             logger.warning(f"Skipping unsupported metric: {metric}")
             continue
-
-        model_results[f"{task_dir}/{metric}"] = aggregate_output["overall_score"]
+        overall_score = aggregate_output["overall_score"]
+        if metric in ["jdocqa", "jmmmu", "jic-vqa", "mecha-ja", "mmmu"]:
+            overall_score = overall_score * 100
+        model_results[f"{task_dir}/{metric}"] = overall_score
 
     return model_results
 
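For context, a minimal standalone sketch of the rescaling rule the new lines implement (the `PERCENT_METRICS` constant, the `rescale` helper, and the sample values are hypothetical, not part of the commit):

    # Metrics whose overall_score is a fraction in [0, 1] get multiplied by
    # 100 so every leaderboard column reads on the same 0-100 scale.
    PERCENT_METRICS = ["jdocqa", "jmmmu", "jic-vqa", "mecha-ja", "mmmu"]

    def rescale(metric: str, overall_score: float) -> float:
        return overall_score * 100 if metric in PERCENT_METRICS else overall_score

    print(rescale("jmmmu", 0.5))    # -> 50.0 (fraction rescaled to a percentage)
    print(rescale("rougel", 50.0))  # -> 50.0 (already 0-100, left untouched)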
@@ -169,8 +171,8 @@ def format_output(df: pd.DataFrame, output_format: str) -> str:
     for col in df.columns:
         top1_model = df[col].astype(float).idxmax()
         top2_model = df[col].astype(float).nlargest(2).index[-1]
-        top1_score = f"{float(df.loc[top1_model, col]):.3g}"
-        top2_score = f"{float(df.loc[top2_model, col]):.3g}"
+        top1_score = f"{float(df.loc[top1_model, col]):.1f}"
+        top2_score = f"{float(df.loc[top2_model, col]):.1f}"
         # apply formatting
         if output_format == "latex":
             df.loc[top1_model, col] = f"\\textbf{{{top1_score}}}"
@@ -184,9 +186,11 @@ def format_output(df: pd.DataFrame, output_format: str) -> str:
     df = df.fillna("")
 
     if output_format == "markdown":
-        return df.to_markdown(mode="github", floatfmt=".3g")
+        return df.to_markdown(mode="github", floatfmt=".1f")
     elif output_format == "latex":
-        return df.to_latex(float_format="%.3g")
+        return df.to_latex(
+            float_format="%.1f", column_format="l" + "c" * len(df.columns)
+        )
     return ""
 
 
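A quick sketch of what the tightened formatting produces (toy DataFrame with hypothetical model and task names; only the `floatfmt`, `float_format`, and `column_format` arguments come from the diff):

    import pandas as pd

    df = pd.DataFrame({"jdocqa/acc": [41.2, 38.7]}, index=["model-a", "model-b"])

    # ".1f" pins every score to one decimal place; the old ".3g" used three
    # significant digits, so 41.2 and 8.33 rendered with uneven precision.
    print(df.to_markdown(floatfmt=".1f"))

    # "l" + "c" * len(df.columns) left-aligns the index (model) column and
    # centers each score column in the generated LaTeX table.
    print(df.to_latex(float_format="%.1f", column_format="l" + "c" * len(df.columns)))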