forked from Hugging-Face-KREW/Ko-AgentBench
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcombine_evaluations.py
More file actions
121 lines (105 loc) · 3.74 KB
/
combine_evaluations.py
File metadata and controls
121 lines (105 loc) · 3.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import math
from pathlib import Path
import pandas as pd
# Root directory that is scanned for "<run_dir>/<CSV_NAME>" files.
BASE_DIR = Path("reports")
# File name of the per-run summary expected inside each run directory.
CSV_NAME = "evaluation_summary.csv"
# Destination of the merged table; derived from BASE_DIR so the two paths
# cannot drift apart.
OUTPUT = BASE_DIR / "combined_evaluation_summary.csv"

# Level labels "L1" .. "L7"; they are matched against the "Level" column of
# each summary CSV.
LEVELS = [f"L{i}" for i in range(1, 8)]

# Summary columns copied once per level into "<level>_<metric>" columns.
LEVEL_METRICS = [
    "Total_Tasks",
    "Evaluated_Tasks",
    "Avg_Exec_Time",
    "Avg_Tokens",
    "Avg_TPS",
    "Avg_TTFT",
]

# Summary columns that are additionally collapsed into one value per run
# (averaged over levels, weighted by "Evaluated_Tasks").
AGG_AVG_COLS = ["Avg_Exec_Time", "Avg_Tokens", "Avg_TPS", "Avg_TTFT"]

# Per-level output column names for the metrics shared by every level.
RRR_COLS = [f"{lvl}_RRR" for lvl in LEVELS]
SR_COLS = [f"{lvl}_SR" for lvl in LEVELS]
EPR_COLS = [f"{lvl}_EPR_CVR" for lvl in LEVELS]
PASSK_COLS = [f"{lvl}_pass@k" for lvl in LEVELS]

# Model identifier -> (vendor, model type). Identifiers missing from this
# table are reported as ("Unknown", "Unknown") by the merge loop.
MODEL_VENDOR_MAP = {
    "kakaocorp_kanana-1.5-8b-instruct-2505": ("Kakao", "OSS"),
    "skt_A.X-4.0-Light": ("SKT", "OSS"),
    "Qwen_qwen3-8B": ("Alibaba", "OSS"),
    "gemini_gemini-2.5-pro": ("Google", "API"),
    "gemini_gemini-2.5-flash": ("Google", "API"),
    "Qwen_Qwen3-4B-Instruct-2507": ("Alibaba", "OSS"),
    "K-intelligence_Midm-2.0-Base-Instruct": ("KT", "OSS"),
    "anthropic_claude-sonnet-4-20250514": ("Anthropic", "API"),
    "azure_gpt-4.1": ("OpenAI", "API"),
    "azure_gpt-5": ("OpenAI", "API"),
    "bedrock_openai.gpt-oss-120b-1:0": ("OpenAI", "OSS"),
    "bedrock_openai.gpt-oss-20b-1:0": ("OpenAI", "OSS"),
    "bedrock_qwen.qwen3-32b-v1:0": ("Alibaba", "OSS"),
}

# Output column name -> (level, source column) for metrics that exist on a
# single level only.
SPECIAL_MAP = {
    # NOTE(review): "L1_TooAcc" looks like a typo for "L1_ToolAcc". The name
    # is kept because it is the emitted CSV header and consumers may rely on
    # it; rename deliberately if that is safe.
    "L1_TooAcc": ("L1", "ToolAcc"),
    "L1_ArgAcc": ("L1", "ArgAcc"),
    "L1_CallEM": ("L1", "CallEM"),
    "L1_RespOK": ("L1", "RespOK"),
    "L2_SelectAcc": ("L2", "SelectAcc"),
    "L3_FSM": ("L3", "FSM"),
    "L3_PSM": ("L3", "PSM"),
    # "\u0394" is GREEK CAPITAL LETTER DELTA. It is written as an escape so
    # the metric name survives a wrong-codec round trip of this file, which
    # had already corrupted the literal character.
    "L3_\u0394Steps_norm": ("L3", "\u0394Steps_norm"),
    "L4_Coverage": ("L4", "Coverage"),
    "L4_SourceEPR": ("L4", "SourceEPR"),
    "L5_AdaptiveRoutingScore": ("L5", "AdaptiveRoutingScore"),
    "L5_FallbackSR": ("L5", "FallbackSR"),
    "L6_RedundantCallRate": ("L6", "RedundantCallRate"),
    "L6_EffScore": ("L6", "EffScore"),
    "L7_ContextRetention": ("L7", "ContextRetention"),
    "L7_RefRecall": ("L7", "RefRecall"),
}

# Final column layout of the merged CSV. Per-level metric columns are grouped
# by metric first, then by level (L1_Total_Tasks, L2_Total_Tasks, ...).
COLUMN_ORDER = (
    ["Model", "Vendor", "Model Type"]
    + [f"{lvl}_{metric}" for metric in LEVEL_METRICS for lvl in LEVELS]
    + RRR_COLS
    + SR_COLS
    + EPR_COLS
    + PASSK_COLS
    + list(SPECIAL_MAP)
    # The per-run weighted averages were computed but never listed here, so
    # the final column selection discarded them. They are appended last to
    # leave the position of every pre-existing column unchanged.
    + AGG_AVG_COLS
)
def weighted_average(series: pd.Series, weights: pd.Series) -> float:
    """Return the weighted mean of *series*, ignoring incomplete pairs.

    A (value, weight) pair is used only when both entries are present.
    Previously a missing value was skipped in the numerator while its weight
    still counted in the denominator, which pulled the result toward zero.

    Args:
        series: Values to average.
        weights: Weight of each value, aligned with *series* by index label.

    Returns:
        The weighted mean as a plain ``float``, or ``nan`` when no usable
        pair remains or the usable weights sum to zero.
    """
    # Building a frame aligns both inputs on their index; dropna() then
    # removes every row in which the value or the weight is missing.
    pairs = pd.DataFrame({"value": series, "weight": weights}).dropna()
    total_weight = pairs["weight"].sum()
    # Exact comparison on purpose: math.isclose() against 0.0 with default
    # tolerances is also an exact test, just a less obvious one.
    if total_weight == 0:
        return float("nan")
    return float((pairs["value"] * pairs["weight"]).sum() / total_weight)
def _summarize_run(csv_path):
    """Flatten one per-level summary CSV into a single wide record.

    Args:
        csv_path: Path to a summary CSV located directly inside a run
            directory.

    Returns:
        A dict holding the model identity, the task-weighted averages named
        in AGG_AVG_COLS, one "<level>_<metric>" entry per level present in
        the file, and one entry per SPECIAL_MAP key (``nan`` when the level
        is absent from the file).

    Raises:
        KeyError: If the CSV lacks the "Level" column or a required metric
            column.
    """
    df = pd.read_csv(csv_path)
    # The model id is the run directory name with everything after its last
    # underscore removed (presumably a run suffix -- confirm against the
    # tool that creates these directories).
    model = csv_path.parent.name.rsplit("_", 1)[0]
    vendor, model_type = MODEL_VENDOR_MAP.get(model, ("Unknown", "Unknown"))
    row = {"Model": model, "Vendor": vendor, "Model Type": model_type}

    # Collapse the per-level averages into one figure per run, weighting
    # each level by the number of tasks that were evaluated on it.
    weights = df["Evaluated_Tasks"]
    for col in AGG_AVG_COLS:
        row[col] = weighted_average(df[col], weights)

    level_df = df.set_index("Level")
    per_level_columns = (*LEVEL_METRICS, "RRR", "SR", "EPR_CVR", "pass@k")
    for lvl in LEVELS:
        # Levels missing from this run are skipped; their columns are left
        # empty when the combined table is assembled.
        if lvl not in level_df.index:
            continue
        for metric in per_level_columns:
            row[f"{lvl}_{metric}"] = level_df.at[lvl, metric]

    for new_name, (lvl, original) in SPECIAL_MAP.items():
        row[new_name] = (
            level_df.at[lvl, original] if lvl in level_df.index else float("nan")
        )
    return row


# Path.glob() yields entries in filesystem order; sorting makes the row order
# of the merged CSV reproducible across machines and runs.
records = [
    _summarize_run(csv_path)
    for csv_path in sorted(BASE_DIR.glob(f"*/{CSV_NAME}"))
]
if not records:
    print(f"No {CSV_NAME} files found under {BASE_DIR}")

# reindex() enforces the final column layout and creates any column that no
# run provided as an empty column.
combined = pd.DataFrame(records).reindex(columns=COLUMN_ORDER)

# to_csv() does not create missing directories, so make sure the target
# directory exists before writing.
OUTPUT.parent.mkdir(parents=True, exist_ok=True)
combined.to_csv(OUTPUT, index=False)
print(f"Saved merged summary to {OUTPUT}")