-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathevaluate_test.py
More file actions
108 lines (90 loc) · 3.48 KB
/
evaluate_test.py
File metadata and controls
108 lines (90 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
import sys
import os
# Add the "数据分析" directory (next to this script) to the import path so that grading_utils can be imported
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '数据分析'))
from grading_utils import grade_answers
def load_json_file(filepath):
    """Load and parse a JSON file, reporting failures instead of raising.

    The file is decoded as UTF-8; a leading byte-order mark is tolerated.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or None when the file does not exist, cannot
        be read, is not valid UTF-8, or is not valid JSON. In each failure
        case a message is printed to standard output. Note that a file whose
        content is the JSON literal ``null`` also yields None.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 unchanged and additionally strips a
        # leading BOM, which the json parser would otherwise reject.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"错误: 文件 '{filepath}' 不存在")
    except OSError as exc:
        # Any other I/O failure (path is a directory, permission denied, ...)
        # is reported the same way rather than crashing with a traceback.
        print(f"错误: 无法读取文件 '{filepath}': {exc}")
    except json.JSONDecodeError:
        print(f"错误: 文件 '{filepath}' 不是有效的JSON格式")
    except UnicodeDecodeError:
        print(f"错误: 文件 '{filepath}' 不是有效的UTF-8编码")
    return None
def _numeric_first_key(key):
    """Return a sort key ordering integer-like keys numerically, before all other keys."""
    try:
        return (0, int(key), "")
    except (TypeError, ValueError):
        # Non-numeric keys sort after the numeric ones, in string order,
        # instead of aborting the report with a ValueError.
        return (1, 0, str(key))


def _print_differences(standard_data, test_data):
    """Print the keys that are missing, extra, or carry a different value.

    Args:
        standard_data: Mapping of key -> expected value (the reference answers).
        test_data: Mapping of key -> submitted value (the answers under test).
    """
    standard_keys = set(standard_data.keys())
    test_keys = set(test_data.keys())

    # Keys present in the reference answers but absent from the test answers.
    missing_keys = standard_keys - test_keys
    if missing_keys:
        print(f"\n缺失的键 ({len(missing_keys)}个):")
        for key in sorted(missing_keys, key=_numeric_first_key):
            print(f" 键 {key}: 标准值={standard_data[key]}, 测试值=未回答")

    # Keys present in the test answers but absent from the reference answers.
    extra_keys = test_keys - standard_keys
    if extra_keys:
        print(f"\n多余的键/幻觉 ({len(extra_keys)}个):")
        for key in sorted(extra_keys, key=_numeric_first_key):
            print(f" 键 {key}: 测试值={test_data[key]}, 标准答案中不存在")

    # Keys present in both whose values differ.
    wrong_keys = [
        key
        for key in standard_keys & test_keys
        if standard_data[key] != test_data[key]
    ]
    if wrong_keys:
        print(f"\n值错误的键 ({len(wrong_keys)}个):")
        for key in sorted(wrong_keys, key=_numeric_first_key):
            print(f" 键 {key}: 标准值={standard_data[key]}, 测试值={test_data[key]}")


def main():
    """Grade test.json against 答案.json and print an accuracy report.

    Both files are read from the current working directory with
    load_json_file; if either fails to load, the function returns early
    (load_json_file has already printed the reason). The two loaded objects
    are passed to grading_utils.grade_answers, and the following entries of
    its result are printed: 'accuracy', 'total', 'answered_count',
    'correct_count', 'missing_count', 'extra_count', 'wrong_count' and
    'edit_distance'. When the reported accuracy is below 100, a per-key list
    of missing, extra and mismatched entries follows.

    NOTE(review): the difference listing assumes both loaded objects are
    JSON objects (dicts) -- confirm against grade_answers' expectations.
    """
    print("=" * 70)
    print("测试结果准确率评分")
    print("=" * 70)
    print()

    test_file = "test.json"
    answer_file = "答案.json"

    print(f"正在加载测试文件: {test_file}")
    test_data = load_json_file(test_file)
    if test_data is None:
        return

    print(f"正在加载标准答案: {answer_file}")
    standard_data = load_json_file(answer_file)
    if standard_data is None:
        return

    print()
    print("-" * 70)
    print("开始评分...")
    print("-" * 70)
    print()

    # Scoring itself is delegated to grading_utils.
    result = grade_answers(test_data, standard_data)

    # Summary.
    print("评分结果:")
    print("=" * 70)
    print(f"准确率: {result['accuracy']:.2f}%")
    print()
    print("详细统计:")
    print(f" 标准答案总数: {result['total']}")
    print(f" 测试答案总数: {result['answered_count']}")
    print(f" 完全正确数量: {result['correct_count']}")
    print(f" 缺失数量 (漏答): {result['missing_count']}")
    print(f" 多余数量 (幻觉): {result['extra_count']}")
    print(f" 错误数量 (答错): {result['wrong_count']}")
    print(f" 编辑距离: {result['edit_distance']}")
    print()

    # Per-key comparison, shown only when the score is not perfect.
    if result['accuracy'] < 100.0:
        print("-" * 70)
        print("差异详情:")
        print("-" * 70)
        _print_differences(standard_data, test_data)
    else:
        print("✓ 完美匹配! 测试结果与标准答案完全一致。")

    print()
    print("=" * 70)
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()