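"""Aggregate per-model inference results into task accuracy and robustness.

For every ``<model>.json`` in ``--result_dir``, this script computes accuracy
on each Perception / Reasoning task, the category-level and overall accuracy,
and a Color Robustness score, then writes the summary to ``inference.csv`` in
``--save_dir``.
"""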
import json
import os
import sys
from argparse import ArgumentParser
from typing import Dict, List

import numpy as np
import pandas as pd

sys.path.insert(0, os.getcwd())
from utils.infer_utils import extract_letter, extract_letter_cot, parse_res
def process_PandR(list_models: List[str], result_dir: str, list_tasks_PR: List[str]) -> Dict:
    """Count per-task accuracy, then Perception / Reasoning / Overall accuracy."""
    dict_cate_task = dict()  # task name -> category ('Perception' or 'Reasoning')
    dict_model_cnt = dict()
    list_acc = []
    for model_name in list_models:
        dict_model_cnt[model_name] = dict()  # per-task [correct, incorrect] counts
        json_path = os.path.join(result_dir, f"{model_name}.json")
        with open(json_path, 'r') as f:
            dict_res = json.load(f)
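        # Expected per-sample schema, inferred from the keys read below
        # (field values here are illustrative, not taken from real data):
        #   "42": {
        #       "type": "Perception",           # or "Reasoning" / "Robustness"
        #       "task": "Color Recognition",
        #       "filename": "images/42.png",
        #       "choices": ["(a) red", "(b) blue", ...],
        #       "answer": "(a)",
        #       "model_ans": "raw model output"
        #   }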
        for sample_idx, img_meta in dict_res.items():
            category = img_meta['type']
            if category == 'Robustness':
                continue
            task = img_meta['task']
            if task not in dict_cate_task:
                dict_cate_task[task] = category
            # stores correct / incorrect cnt for each task
            if task not in dict_model_cnt[model_name]:
                dict_model_cnt[model_name][task] = [0, 0]  # [correct, incorrect]
            options = img_meta['choices']
            gt_ans = img_meta['answer'].replace('(', '').replace(')', '').lower()
            model_ans = img_meta['model_ans']
            if 'cot' in model_name:  # gpt / gemini with chain-of-thought
                model_ans_new = extract_letter_cot(model_ans)
                if_correct = (model_ans_new == gt_ans)
            elif 'gpt' in model_name or 'gemini' in model_name:  # gpt / gemini
                model_ans_new = extract_letter(model_ans)
                if_correct = (model_ans_new == gt_ans)
            else:  # open-sourced models
                model_ans_new, if_correct, find_res = parse_res(model_ans=model_ans, options=options, gt_ans=gt_ans)
            if if_correct:
                dict_model_cnt[model_name][task][0] += 1
            else:
                dict_model_cnt[model_name][task][1] += 1
        # collect per-task accuracy for this model
        dict_acc = {key: [sum(item), item[0], item[0] / sum(item)] for key, item in dict_model_cnt[model_name].items() if sum(item) > 0}
        for task, (sum_cnt, cor_cnt, acc) in dict_acc.items():
            list_acc.append([model_name, task, sum_cnt, cor_cnt, np.round(acc, 6)])
    # reshape into per-model rows ordered by list_tasks_PR
    dict_formated = {key: [0] * len(list_tasks_PR) for key in list_models}
    dict_formated_cor = {key: [0] * len(list_tasks_PR) for key in list_models}
    dict_formated_sum = {key: [0] * len(list_tasks_PR) for key in list_models}
    for item_meta in list_acc:
        model_name, task, sum_cnt, cor_cnt, acc = item_meta
        t_idx = list_tasks_PR.index(task)
        dict_formated[model_name][t_idx] = acc
        dict_formated_cor[model_name][t_idx] = cor_cnt
        dict_formated_sum[model_name][t_idx] = sum_cnt
    # calculate perception / reasoning / overall acc
    for model_name in dict_formated_sum.keys():
        percept_cnt = [0, 0]
        reasoning_cnt = [0, 0]
        overall_cnt = [0, 0]
        for task_id, task in enumerate(list_tasks_PR):
            category = dict_cate_task[task]
            cor_cnt = dict_formated_cor[model_name][task_id]
            all_cnt = dict_formated_sum[model_name][task_id]
            overall_cnt[0] += cor_cnt
            overall_cnt[1] += all_cnt
            if category.lower() == 'perception':
                percept_cnt[0] += cor_cnt
                percept_cnt[1] += all_cnt
            if category.lower() == 'reasoning':
                reasoning_cnt[0] += cor_cnt
                reasoning_cnt[1] += all_cnt
        try:
            perception_acc = np.round(percept_cnt[0] / percept_cnt[1], 6)
            reasoning_acc = np.round(reasoning_cnt[0] / reasoning_cnt[1], 6)
            overall_acc = np.round(overall_cnt[0] / overall_cnt[1], 6)
        except ZeroDivisionError:
            perception_acc, reasoning_acc, overall_acc = 0, 0, 0
        dict_formated[model_name].extend([perception_acc, reasoning_acc, overall_acc])
    return dict_formated
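
# Each model's row built by process_PandR is, in order (illustrative):
#   [acc for each task in list_tasks_PR] + [perception_acc, reasoning_acc, overall_acc]
# process_robustness() below appends one more entry, the Color Robustness score.
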
def process_robustness(list_models: List[str], result_dir: str, dict_formated: Dict) -> Dict:
    """Append a Color Robustness score to each model's row in dict_formated."""
    dict_id_newres = dict()  # per model: img_id -> [gt answer, original answer, recolored answers]
    dict_model_cnt = dict()  # per model: [robust, not robust] counts
    list_rob = []
    for model_name in list_models:
        dict_model_cnt[model_name] = [0, 0]
        dict_id_newres[model_name] = dict()
        dict_correct = dict()
        json_path = os.path.join(result_dir, f"{model_name}.json")
        with open(json_path, 'r') as f:
            dict_res = json.load(f)
        for sample_idx, img_meta in dict_res.items():
            category = img_meta['type']
            if category != 'Robustness':
                continue
            if_ori = False
            img_name = img_meta["filename"].split('/')[-1].split('.')[0]
            if '_' not in img_name:
                # original image, e.g. "7"
                if_ori = True
                img_id = int(img_name)
            else:
                # recolored image, e.g. "7_1"
                img_id = int(img_name.split('_')[0])
            options = img_meta['choices']
            gt_ans = img_meta['answer'].replace('(', '').replace(')', '').lower()
            model_ans = img_meta['model_ans']
            if img_id not in dict_id_newres[model_name]:
                dict_id_newres[model_name][img_id] = [gt_ans, None, []]  # [gt answer, original-image answer, recolored-image answers]
                dict_correct[img_id] = [False, []]  # [original correct?, per-recolored correctness]
            if 'cot' in model_name:  # gpt / gemini with chain-of-thought
                model_ans_new = extract_letter_cot(model_ans)
                if_correct = (model_ans_new == gt_ans)
            elif 'gpt' in model_name or 'gemini' in model_name:  # gpt / gemini
                model_ans_new = extract_letter(model_ans)
                if_correct = (model_ans_new == gt_ans)
            else:  # open-sourced models
                model_ans_new, if_correct, find_res = parse_res(model_ans=model_ans, options=options, gt_ans=gt_ans)
            if if_ori:
                dict_id_newres[model_name][img_id][1] = model_ans_new
                dict_correct[img_id][0] = if_correct
            else:
                dict_id_newres[model_name][img_id][2].append(model_ans_new)
                dict_correct[img_id][1].append(if_correct)
        # a sample counts as robust only if the original image and every
        # recolored variant are all answered correctly
        for img_id, list_res in dict_id_newres[model_name].items():
            gt_ans, ori_ans, list_new_ans = list_res
            ori_bool, list_new_bool = dict_correct[img_id]
            if ori_bool and False not in list_new_bool:
                dict_model_cnt[model_name][0] += 1
            else:
                dict_model_cnt[model_name][1] += 1
    # collect the robustness score for each model
    dict_robust = {key: [sum(item), item[0], item[0] / sum(item)] for key, item in dict_model_cnt.items() if sum(item) > 0}
    for model_name, (sum_cnt, rob_cnt, robustness) in dict_robust.items():
        list_rob.append([model_name, sum_cnt, rob_cnt, np.round(robustness, 6)])
    for item_meta in list_rob:
        model_name, sum_cnt, rob_cnt, robustness = item_meta
        dict_formated[model_name].append(robustness)
    return dict_formated
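
# Hypothetical example: if "7.png" (original) is answered correctly and its
# recolored variants "7_1.png" and "7_2.png" are both answered correctly too,
# the sample counts as robust; one wrong answer anywhere makes it non-robust.
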
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--result_dir", type=str, default="RESULT_DIR")
    parser.add_argument("--save_dir", type=str, default="SAVE_DIR")
    args = parser.parse_args()
    result_dir = args.result_dir
    save_dir = args.save_dir
    list_tasks_PR = ["Color Recognition", "Color Extraction", "Object Recognition", "Color Proportion", "Color Comparison", "Color Counting", "Object Counting", "Color Illusion", "Color Mimicry", "Color Blindness"]
    # Load model inference results (one <model>.json per model)
    list_jsons = os.listdir(result_dir)
    list_models = [item.split('.')[0] for item in list_jsons if item.endswith('.json')]
    # Count acc for each task, then append the robustness score
    dict_formated = process_PandR(list_models=list_models, result_dir=result_dir, list_tasks_PR=list_tasks_PR)
    dict_formated = process_robustness(list_models=list_models, result_dir=result_dir, dict_formated=dict_formated)
    # Save to csv
    df_result = pd.DataFrame(dict_formated).T
    df_result.columns = list_tasks_PR + ['Perception Acc', 'Reasoning Acc', 'Overall Acc', 'Color Robustness']
    df_result = df_result.reset_index().rename(columns={'index': 'model_type'})
    os.makedirs(save_dir, exist_ok=True)  # make sure the output directory exists
    df_result.to_csv(os.path.join(save_dir, 'inference.csv'), index=False)
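
# Example invocation (paths are placeholders):
#   python metrics_eval.py --result_dir path/to/model_jsons --save_dir path/to/output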