#!/usr/bin/env python3
"""Debug script for flow testing"""
import sys

sys.path.insert(0, '.')

from unittest.mock import patch, AsyncMock

from testllm.flows import ConversationFlow
from testllm.core import AgentUnderTest
from testllm.evaluation_loop import ConsensusResult
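

# Minimal stub agent: satisfies the AgentUnderTest interface with a canned
# reply, so the flows below can run without a real model behind them.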
class DebugAgent(AgentUnderTest):
    def send_message(self, message: str) -> str:
        return f"Mock response to: {message}"

    def reset_conversation(self):
        pass


# Create a simple one-step flow
agent = DebugAgent()
flow = ConversationFlow('test')
flow.step('Hello', ['Be friendly'])

print("=== Basic test ===")

# Patch EvaluationLoop where the flow module looks it up, so no real
# consensus evaluation runs; every criterion is reported as passed.
with patch('testllm.flows.EvaluationLoop') as mock_eval_class:
    mock_evaluator = AsyncMock()

    def mock_evaluate_response(user_input, agent_response, criteria):
        print(f'Criteria received: {[c.criterion for c in criteria]}')
        # Pass every criterion with a perfect consensus score
        results = [ConsensusResult(criterion.criterion, 1.0, True, []) for criterion in criteria]
        print(f'Returning {len(results)} results, all passed: {[r.passed for r in results]}')
        return results

    mock_evaluator.evaluate_response.side_effect = mock_evaluate_response
    mock_eval_class.return_value = mock_evaluator

    result = flow.execute_sync(agent)
    print(f'Flow result: passed={result.passed}, overall_score={result.overall_score}')
    print(f'Step results: {len(result.step_results)}')
    for sr in result.step_results:
        print(f'  Step {sr.test_id}: passed={sr.passed}, score={sr.overall_score}, errors={sr.errors}')
print("\n=== Test matching the failing integration test ===")
# Replicate the FULL failing test pattern

flow2 = ConversationFlow('e2e_onboarding', 'End-to-end customer onboarding')

# Step 1: Initial contact
flow2.step(
    "Hello, I'm a new customer interested in your services",
    criteria=[
        "Response should acknowledge new customer status",
        "Response should begin onboarding process",
        "Response should be professional and welcoming"
    ]
)
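
# Each criterion is a plain-language assertion that the (here mocked)
# evaluation loop scores the agent's response against.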

# Step 2: Information gathering
flow2.step(
    "My name is Sarah Johnson and I need a business account",
    criteria=[
        "Response should acknowledge the name Sarah Johnson",
        "Response should understand business account requirement",
        "Response should ask for relevant business information"
    ],
    expect_context_retention=True
)
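
# expect_context_retention=True flags this step so the flow also checks that
# earlier details (the name, the account type) carry into later turns.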

# Step 3: Memory validation
flow2.context_check(
    "What type of account was I requesting?",
    context_criteria=[
        "Response should remember business account request",
        "Response should demonstrate conversation awareness"
    ]
)

# Step 4: Process completion
flow2.business_logic_check(
    "I'm ready to complete the setup process",
    business_rules=["account_creation", "verification"],
    criteria=[
        "Response should outline completion steps",
        "Response should mention verification requirements"
    ]
)
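
# business_logic_check tags the step with named business rules; the loop below
# prints each step's expect_business_logic flag to confirm the tagging.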

# Check what all the steps look like
print(f"Total steps: {len(flow2.steps)}")
for i, step in enumerate(flow2.steps):
    print(f"Step {i+1}: {step.user_input}")
    print(f"  Business logic expectations: {step.expect_business_logic}")

print("Starting full integration test...")

# Mock evaluation again, this time logging the full exchange for each step
with patch('testllm.flows.EvaluationLoop') as mock_eval_class:
    mock_evaluator = AsyncMock()

    def mock_evaluate_response(user_input, agent_response, criteria):
        print(f'User input: {user_input}')
        print(f'Agent response: {agent_response}')
        print(f'Criteria received: {[c.criterion for c in criteria]}')
        # Pass every criterion with a perfect consensus score
        results = [ConsensusResult(criterion.criterion, 1.0, True, []) for criterion in criteria]
        print(f'Returning {len(results)} results, all passed: {[r.passed for r in results]}')
        return results

    mock_evaluator.evaluate_response.side_effect = mock_evaluate_response
    mock_eval_class.return_value = mock_evaluator

    result = flow2.execute_sync(agent)
    print(f'Flow result: passed={result.passed}, overall_score={result.overall_score}')
    print(f'Business logic score: {result.business_logic_score}')
    print(f'Step results: {len(result.step_results)}')
    for sr in result.step_results:
        print(f'  Step {sr.test_id}: passed={sr.passed}, score={sr.overall_score}, errors={sr.errors}')
        print(f'  Criteria: {sr.criteria}')
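
# Assumed usage: run from the repository root (sys.path is patched with '.'),
# e.g. `python debug_flows.py`.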