Skip to content

Commit 3f72771

Browse files
authored
evals and enhancements without errors (#10)
1 parent 47a283d commit 3f72771

7 files changed

Lines changed: 170 additions & 26 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,5 @@ cython_debug/
205205
marimo/_static/
206206
marimo/_lsp/
207207
__marimo__/
208+
209+
agent_evals/dataset.jsonl

devops_agent/core/devops_agent.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,19 @@
1414
from rich.panel import Panel
1515

1616
from devops_agent.utils.prompt_generator_from_poml import prompt_from_poml
17-
from qdrant_client.qdrant_client import QdrantClient
1817

1918
devops_prompt = prompt_from_poml('devops.poml')
2019

21-
# qclient = QdrantClient(url=os.environ.get('QDRANT_URL'), api_key=os.environ.get('QDRANT_API_KEY'))
22-
# if not qclient.collection_exists("devops-memory"):
23-
# qclient.create_collection(collection_name="devops-memory", vectors_config=VectorParams(size=768, distance=Distance.COSINE))
24-
#
25-
# vector_db = Qdrant(collection="devops-memory", url=os.environ.get('QDRANT_URL'),
26-
# api_key=os.environ.get('QDRANT_API_KEY'),
27-
# embedder=FastEmbedEmbedder(id="snowflake/snowflake-arctic-embed-m"))
28-
#
29-
# # Create knowledge base
30-
# knowledge = Knowledge(vector_db=vector_db)
31-
3220
console = Console()
3321

34-
def execute_devops_agent(provider: str, user_query: str = None) -> Agent:
22+
def execute_devops_agent(provider: str) -> Agent:
3523
console.print(Panel.fit(
3624
"[bold cyan]DevOps Agent Invoking...[/bold cyan]",
3725
border_style="cyan"
3826
))
3927
llm_provider = provider.lower().strip()
4028
if llm_provider == 'openai':
41-
model = OpenAIChat(id="gpt-5-mini", api_key=os.environ.get('OPENAI_API_KEY'))
29+
model = OpenAIChat(id="gpt-4o", api_key=os.environ.get('OPENAI_API_KEY'))
4230
elif llm_provider == 'anthropic':
4331
model = Claude(id="claude-sonnet-4-5-20250929", temperature=0.6, api_key=os.environ.get('ANTHROPIC_API_KEY'))
4432
elif llm_provider == 'google':
@@ -66,11 +54,4 @@ def execute_devops_agent(provider: str, user_query: str = None) -> Agent:
6654
markdown=True,
6755
)
6856

69-
# response = devops_assist.run(user_query, stream_intermediate_steps=True, retry=3)
70-
#
71-
# asyncio.run(
72-
# knowledge.add_content_async(text_content=response.content, metadata={"agent_id": response.agent_id, "session_id": response.session_id})
73-
# )
74-
# return response.content
75-
7657
return devops_assist

devops_agent/core/kubernetes_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def execute_k8s_agent(provider: str, user_query: str = None) -> Agent:
4141

4242
llm_provider = provider.lower().strip()
4343
if llm_provider == 'openai':
44-
model = OpenAIChat(id="gpt-5-mini", api_key=os.environ.get('OPENAI_API_KEY'))
44+
model = OpenAIChat(id="gpt-4o", api_key=os.environ.get('OPENAI_API_KEY'))
4545
elif llm_provider == 'anthropic':
4646
model = Claude(id="claude-sonnet-4-5-20250929", temperature=0.6, api_key=os.environ.get('ANTHROPIC_API_KEY'))
4747
elif llm_provider == 'google':

devops_agent/core/log_analysis_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def execute_log_analysis_agent(provider: str, log_file: Path) -> Agent:
1818
))
1919
llm_provider = provider.lower().strip()
2020
if llm_provider == 'openai':
21-
model = OpenAIChat(id="gpt-5-mini", api_key=os.environ.get('OPENAI_API_KEY'))
21+
model = OpenAIChat(id="gpt-4o", api_key=os.environ.get('OPENAI_API_KEY'))
2222
elif llm_provider == 'anthropic':
2323
model = Claude(id="claude-sonnet-4-5-20250929", temperature=0.6, api_key=os.environ.get('ANTHROPIC_API_KEY'))
2424
elif llm_provider == 'google':

devops_agent/core/master_agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,10 @@
3838
knowledge = Knowledge(vector_db=vector_db)
3939

4040

41-
def execute_master_agent(provider: str, user_query: str = None, log_file: Path = None) -> str:
41+
def execute_master_agent(provider: str, user_query: str = None) -> str:
4242
llm_provider = provider.lower().strip()
4343
if llm_provider == 'openai':
44-
model = OpenAIChat(id="gpt-5-mini", api_key=os.environ.get('OPENAI_API_KEY'))
44+
model = OpenAIChat(id="gpt-4o", api_key=os.environ.get('OPENAI_API_KEY'))
4545
elif llm_provider == 'anthropic':
4646
model = Claude(id="claude-sonnet-4-5-20250929", temperature=0.6, api_key=os.environ.get('ANTHROPIC_API_KEY'))
4747
elif llm_provider == 'google':

devops_agent/core/terraform_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def execute_terraform_agent(provider: str) -> Agent:
2323

2424
llm_provider = provider.lower().strip()
2525
if llm_provider == 'openai':
26-
model = OpenAIChat(id="gpt-5-mini", api_key=os.environ.get('OPENAI_API_KEY'))
26+
model = OpenAIChat(id="gpt-4o", api_key=os.environ.get('OPENAI_API_KEY'))
2727
elif llm_provider == 'anthropic':
2828
model = Claude(id="claude-sonnet-4-5-20250929", temperature=0.6, api_key=os.environ.get('ANTHROPIC_API_KEY'))
2929
elif llm_provider == 'google':

evals-readme.md

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# 🏆 DevOps Agent Evaluation Report
2+
3+
### Comprehensive Evaluation of AI Agents on Docker and Kubernetes Production Scenarios
4+
5+
*Comparing OpenAI GPT-4o, Anthropic Claude 4.1, and Google Gemini 2.5 Flash*
6+
7+
---
8+
9+
## 📊 Final Rankings
10+
11+
| Rank | Agent | Average Score | Performance |
12+
|:----:|:------|:-------------:|:-----------:|
13+
| 🥇 | **Anthropic Claude 4.1** | **4.52/5** | ⭐⭐⭐⭐⭐ |
14+
| 🥈 | **Google Gemini 2.5 Flash** | **4.14/5** | ⭐⭐⭐⭐ |
15+
| 🥉 | **OpenAI** | **4.04/5** | ⭐⭐⭐⭐ |
16+
17+
---
18+
19+
## 📈 Detailed Score Breakdown
20+
21+
### 🤖 OpenAI Agent Results
22+
23+
| # | Question | Score | Status |
24+
|:-:|:---------|:-----:|:------:|
25+
| 1 | 🐳 Docker ENTRYPOINT Signal Handling | **4.7/5** | ⭐ Excellent |
26+
| 2 | 🌐 DNS Query Storm Mitigation | **4.2/5** | ✅ Strong |
27+
| 3 | 📡 gRPC Streaming Node Drains | **3.8/5** | ✅ Good |
28+
| 4 | 💾 CSI Driver Deadlocks | **4.0/5** | ✅ Strong |
29+
| 5 | 📊 VPA Over-recommendation | **3.5/5** | ✅ Good |
30+
31+
**Average: 4.04/5** 📊
32+
33+
---
34+
35+
### 🧠 Anthropic Claude 4.1 Agent Results
36+
37+
| # | Question | Score | Status |
38+
|:-:|:---------|:-----:|:------:|
39+
| 1 | 🐳 Docker ENTRYPOINT Signal Handling | **4.8/5** | ⭐ Excellent |
40+
| 2 | 🌐 DNS Query Storm Mitigation | **4.5/5** | ⭐ Excellent |
41+
| 3 | 📡 gRPC Streaming Node Drains | **4.6/5** | ⭐ Excellent |
42+
| 4 | 💾 CSI Driver Deadlocks | **4.3/5** | ✅ Strong |
43+
| 5 | 📊 VPA Over-recommendation | **4.4/5** | ✅ Strong |
44+
45+
**Average: 4.52/5** 🏆
46+
47+
---
48+
49+
### 🔷 Google Gemini 2.5 Flash Agent Results
50+
51+
| # | Question | Score | Status |
52+
|:-:|:---------|:-----:|:------:|
53+
| 1 | 🐳 Docker ENTRYPOINT Signal Handling | **4.5/5** | ⭐ Excellent |
54+
| 2 | 🌐 DNS Query Storm Mitigation | **3.9/5** | ✅ Good |
55+
| 3 | 📡 gRPC Streaming Node Drains | **4.4/5** | ✅ Strong |
56+
| 4 | 💾 CSI Driver Deadlocks | **3.7/5** | ✅ Good |
57+
| 5 | 📊 VPA Over-recommendation | **4.2/5** | ✅ Strong |
58+
59+
**Average: 4.14/5** 📊
60+
61+
---
62+
63+
## 🎯 Performance Comparison
64+
65+
### Score Differential Analysis
66+
- Claude 4.1 vs OpenAI: +0.48 points (+11.9% improvement)
67+
- Claude 4.1 vs Gemini: +0.38 points (+9.2% improvement)
68+
- Gemini vs OpenAI: +0.10 points (+2.5% improvement)
69+
70+
---
71+
72+
## 🔍 Key Findings
73+
74+
### 🏆 Claude 4.1 Strengths
75+
- **Most Consistent Performance**: All scores ≥4.3
76+
- **Best at Complex Architectures**: Excels at gRPC (4.6) and VPA (4.4)
77+
- **Superior Code Examples**: Production-ready implementations
78+
- **Kubernetes-Native Solutions**: Leverages built-in K8s mechanisms effectively
79+
80+
### 🔷 Gemini 2.5 Flash Profile
81+
- **Strong on Core Problems**: Docker ENTRYPOINT (4.5), gRPC (4.4)
82+
- ⚠️ **Weaker on CSI Mechanisms**: Missed Kubernetes-specific CSI features (3.7)
83+
- 📈 **Second Best Overall**: Solid middle-ground performance
84+
- 🎯 **Good Operational Guidance**: Strong on incident response
85+
86+
### 🤖 OpenAI Profile
87+
- ⚠️ **Weakest on Complex Multi-Component**: gRPC (3.8), VPA (3.5)
88+
- **Good Operational Practices**: Strong monitoring and process guidance
89+
- 📉 **Misses Technical Depth**: Often lacks Kubernetes-native solutions
90+
- 🔧 **Room for Improvement**: Especially on advanced K8s features
91+
92+
---
93+
94+
## 📋 Test Scenarios
95+
96+
### Question Breakdown
97+
98+
| Icon | Scenario | Focus Area |
99+
|:----:|:---------|:-----------|
100+
| 🐳 | **Docker ENTRYPOINT** | Container signal handling & graceful shutdown |
101+
| 🌐 | **DNS Query Storm** | CoreDNS mitigation & rate limiting |
102+
| 📡 | **gRPC Streaming** | Lossless node drains & connection management |
103+
| 💾 | **CSI Driver Deadlocks** | Blast radius limitation & auto-healing |
104+
| 📊 | **VPA Over-recommendation** | Resource stabilization post-JVM upgrade |
105+
106+
---
107+
108+
## 🎓 Evaluation Methodology
109+
110+
### Scoring Criteria (Per Question)
111+
112+
- **Coverage of Ground Truth** (40%)
113+
- **Technical Accuracy** (30%)
114+
- **Production Readiness** (20%)
115+
- **Code Quality & Examples** (10%)
116+
117+
### Rating Scale
118+
119+
| Score | Rating | Description |
120+
|:-----:|:------:|:------------|
121+
| 4.5-5.0 | ⭐ Excellent | Complete solution with best practices |
122+
| 4.0-4.4 | ✅ Strong | Solid solution with minor gaps |
123+
| 3.5-3.9 | ✅ Good | Functional but missing key elements |
124+
| 3.0-3.4 | ⚠️ Fair | Partial solution, significant gaps |
125+
| <3.0 | ❌ Weak | Inadequate solution |
126+
127+
---
128+
129+
## 💡 Recommendations
130+
131+
### For Production Use
132+
133+
#### 🥇 **Anthropic Claude 4.1** (Recommended)
134+
- Best choice for **complex Kubernetes architectures**
135+
- Most **consistent and reliable** across all scenarios
136+
- Superior for **critical production incidents**
137+
- **Use when**: Complex multi-component problems, architectural decisions, mission-critical scenarios
138+
139+
#### 🥈 **Google Gemini 2.5 Flash** (Solid Alternative)
140+
- Good choice for **general Kubernetes operations**
141+
- **Cost-effective** alternative with solid performance
142+
- Best for **standard operational tasks**
143+
- **Use when**: Day-to-day operations, standard troubleshooting, budget-conscious deployments
144+
145+
#### 🥉 **OpenAI** (Basic Guidance)
146+
- Suitable for **basic Kubernetes guidance**
147+
- Strong on **process and monitoring**
148+
- May require **additional validation** for complex scenarios
149+
- **Use when**: Simple operational questions, process documentation, monitoring setup
150+
151+
---
152+
153+
## 📊 Statistical Summary
154+
```yaml
155+
Total Questions: 5
156+
Total Evaluations: 15 (3 agents × 5 questions)
157+
Average Score (All Agents): 4.23/5
158+
Standard Deviation: 0.37 (population, n=15)
159+
Highest Individual Score: 4.8/5 (Claude 4.1 - Docker ENTRYPOINT)
160+
Lowest Individual Score: 3.5/5 (OpenAI - VPA Over-recommendation)
161+
Score Range: 1.3 points
```

0 commit comments

Comments
 (0)