-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path: demo_evaluation.py
More file actions
132 lines (107 loc) · 6.06 KB
/
demo_evaluation.py
File metadata and controls
132 lines (107 loc) · 6.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""Demo script for the enhanced evaluation system.
This demonstrates:
1. Loading ground truth test cases
2. Running evaluations with precision, recall, F1 metrics
3. Generating comprehensive reports
4. Analyzing performance by category
"""
from collections import Counter
import os
import sys

from main import CodeReviewOrchestrator
from utils.evaluation import AgentEvaluator, create_default_test_cases
from utils.rich_output import console
def _print_banner():
    """Print the framed title banner for the demo."""
    console.print("\n╔═══════════════════════════════════════════════════════════╗", style="bold cyan")
    console.print("║ ║", style="bold cyan")
    console.print("║ 🧪 Agent Evaluation System Demo ║", style="bold cyan")
    console.print("║ Precision, Recall, F1 Score Analysis ║", style="bold cyan")
    console.print("║ ║", style="bold cyan")
    console.print("╚═══════════════════════════════════════════════════════════╝", style="bold cyan")


def _load_test_cases(evaluator):
    """Populate *evaluator* with ground-truth test cases.

    Prefers a local ``ground_truth_test_cases.json`` file; falls back to
    the bundled defaults from ``create_default_test_cases`` when absent.
    """
    test_cases_file = "ground_truth_test_cases.json"
    if os.path.exists(test_cases_file):
        console.print(f" Loading test cases from {test_cases_file}...", style="cyan")
        evaluator.load_test_cases_from_file(test_cases_file)
    else:
        console.print(" Loading default test cases...", style="cyan")
        for tc in create_default_test_cases():
            evaluator.add_test_case(tc)
    console.print(f"✓ Loaded {len(evaluator.test_cases)} test cases", style="green")


def _show_categories(evaluator):
    """Print how many loaded test cases fall into each category."""
    # Counter replaces the manual dict.get(..., 0) + 1 tally; it preserves
    # first-seen insertion order, so output order matches the original.
    categories = Counter(tc.category for tc in evaluator.test_cases)
    console.print("\n Test Categories:", style="cyan")
    for cat, count in categories.items():
        console.print(f" • {cat.capitalize()}: {count} tests", style="white")


def _interpret_metrics(summary):
    """Explain the aggregate precision / recall / F1 numbers in plain language.

    *summary* is the dict returned by ``AgentEvaluator.evaluate_all``; this
    reads ``summary['metrics']`` (keys: f1_score, precision, recall).
    """
    console.print("\n" + "="*80, style="cyan")
    console.print("💡 Understanding Your Metrics", style="bold yellow")
    console.print("="*80, style="cyan")
    metrics = summary['metrics']
    console.print(f"\n📊 Overall F1 Score: {metrics['f1_score']:.3f}", style="bold")
    if metrics['f1_score'] >= 0.8:
        console.print(" 🌟 Excellent! Agents are performing very well.", style="green")
    elif metrics['f1_score'] >= 0.6:
        console.print(" 👍 Good performance, some room for improvement.", style="yellow")
    else:
        console.print(" ⚠️ Needs improvement. Consider tuning prompts or tools.", style="red")
    console.print(f"\n🎯 Precision: {metrics['precision']:.3f}", style="bold")
    if metrics['precision'] < 0.7:
        console.print(" 💡 Tip: High false positives. Agents may be too aggressive.", style="yellow")
    console.print(f"\n🔍 Recall: {metrics['recall']:.3f}", style="bold")
    if metrics['recall'] < 0.7:
        console.print(" 💡 Tip: High false negatives. Agents may be missing issues.", style="yellow")


def _show_category_performance(summary):
    """Print per-category F1 and pass counts, best-performing first."""
    if not summary.get('category_breakdown'):
        return
    console.print("\n📁 Category Performance:", style="bold")
    sorted_cats = sorted(
        summary['category_breakdown'].items(),
        key=lambda x: x[1]['avg_f1'],
        reverse=True
    )
    for cat, data in sorted_cats:
        emoji = "🌟" if data['avg_f1'] >= 0.7 else "⚠️"
        console.print(
            f" {emoji} {cat.capitalize():12} - F1: {data['avg_f1']:.3f} "
            f"({data['passed']}/{data['total']} passed)",
            style="white"
        )


def main():
    """Run evaluation demo.

    End-to-end flow: initialize the orchestrator, load ground-truth test
    cases, run the evaluation suite, export results to
    ``evaluation_results.json``, and print an annotated report. Returns
    early (without raising) if GOOGLE_AI_API_KEY is not set.
    """
    _print_banner()
    # Fail fast with guidance when the required API key is missing.
    if not os.getenv("GOOGLE_AI_API_KEY"):
        console.print("\n❌ Error: GOOGLE_AI_API_KEY not set", style="bold red")
        console.print("Set it with: export GOOGLE_AI_API_KEY='your-key'", style="yellow")
        return
    try:
        console.print("\n[1/4] 🤖 Initializing CodeReview-AI-Agent...", style="bold blue")
        orchestrator = CodeReviewOrchestrator()
        console.print("✓ Orchestrator initialized", style="green")

        console.print("\n[2/4] 📊 Setting up evaluation framework...", style="bold blue")
        evaluator = AgentEvaluator()
        _load_test_cases(evaluator)
        _show_categories(evaluator)

        console.print("\n[3/4] 🧪 Running evaluation suite...", style="bold blue")
        console.print(" This will take a moment as each test is processed by all agents.\n", style="yellow")
        summary = evaluator.evaluate_all(orchestrator)

        console.print("\n[4/4] 📈 Generating evaluation report...", style="bold blue")
        evaluator.print_summary()

        output_file = "evaluation_results.json"
        evaluator.export_results(output_file)
        console.print(f"\n✓ Detailed results exported to: {output_file}", style="bold green")

        _interpret_metrics(summary)
        _show_category_performance(summary)

        console.print("\n" + "="*80, style="cyan")
        console.print("✅ Evaluation complete!", style="bold green")
        console.print("="*80 + "\n", style="cyan")
    except KeyboardInterrupt:
        console.print("\n\n⚠️ Evaluation interrupted by user", style="yellow")
    except Exception as e:
        # Demo-friendly catch-all: report the failure and its traceback
        # instead of crashing, since this script is run interactively.
        console.print(f"\n❌ Error during evaluation: {e}", style="bold red")
        import traceback
        traceback.print_exc()
# Script entry point: run the evaluation demo when executed directly
# (e.g. `python demo_evaluation.py`); importing the module has no side effects.
if __name__ == "__main__":
    main()