diff --git a/examples/features/multi-turn-conversation/README.md b/examples/features/multi-turn-conversation/README.md new file mode 100644 index 00000000..aa18b94e --- /dev/null +++ b/examples/features/multi-turn-conversation/README.md @@ -0,0 +1,37 @@ +# Multi-Turn Conversation Evaluation + +Demonstrates evaluating multi-turn conversation quality using composable +`llm-judge` prompt templates with per-turn score breakdowns. + +## What this shows + +1. **Multi-turn input** — conversations with 4+ user/assistant turns where + context retention matters +2. **Conversation-aware judge prompts** — markdown templates that receive the + full `{{output}}` Message[] array and evaluate conversation-level qualities +3. **Per-turn score breakdown** — judges return structured `details` with + per-turn scores, not just a flat conversation score +4. **Composability** — multiple llm-judge evaluators combined with + deterministic assertions (e.g., `contains`) + +## Judge dimensions + +| Judge | What it evaluates | +|-------|-------------------| +| `context-retention.md` | Does the agent remember information from earlier turns? | +| `conversation-relevancy.md` | Are responses relevant to the current request and conversation? | +| `role-adherence.md` | Does the agent maintain its assigned persona? | + +## Running + +```bash +agentv run default --filter multi-turn-conversation +``` + +## Creating your own conversation evaluator + +1. Create a markdown file in `judges/` +2. Use `{{ output }}` to receive the full conversation Message[] array +3. Use `{{ criteria }}` for the test-specific evaluation criteria +4. Instruct the judge to return `details` with per-turn metrics +5. Reference it in your YAML: `type: llm-judge` / `prompt: ./judges/your-judge.md` diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl new file mode 100644 index 00000000..4282abf3 --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -0,0 +1,2 @@ +{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 3: remembers and uses customer name Sarah","Turn 3: correctly references order number #98765","Turn 4: acknowledges urgency and Friday deadline","Final Turn: confirms express shipping upgrade and adds gift wrapping as requested","Turn 1: directly acknowledged Sarah by name and referenced her order number","Turn 2: offered relevant expedited shipping solutions aligned with the Friday deadline","Turn 3: confirmed express shipping upgrade and gift wrapping as requested","Turn 3: invited further input for a gift message and additional support","Turn 1: professional and empathetic tone matching support agent role","Turn 2: maintains context—addresses Sarah by name and references order #98765, delivers solution-focused options","Turn 3: addresses Sarah by name, updates her on order status, and offers further personalized support","Turn 3: maintains professional, empathetic, and solution-oriented persona throughout"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant consistently recalls all key context throughout the conversation: Sarah's name, order number, delivery deadline, and gift wrapping request. Professional persona maintained across all turns. | conversation_relevancy: Every assistant turn is contextually relevant, addresses Sarah's immediate needs, and maintains a professional, empathetic support persona. The agent consistently remembers key details and offers solutions directly tied to the conversation. | role_adherence: All assistant turns consistently follow the assigned professional and empathetic support persona, maintain context, and reference customer name, order number, and deadline without breaking character. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: remembers and uses customer name Sarah","Turn 3: correctly references order number #98765","Turn 4: acknowledges urgency and Friday deadline","Final Turn: confirms express shipping upgrade and adds gift wrapping as requested"],"misses":[],"reasoning":"The assistant consistently recalls all key context throughout the conversation: Sarah's name, order number, delivery deadline, and gift wrapping request. Professional persona maintained across all turns.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":904,"output":166},"duration_ms":3625,"started_at":"2026-03-09T10:16:26.989Z","ended_at":"2026-03-09T10:16:30.614Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: directly acknowledged Sarah by name and referenced her order number","Turn 2: offered relevant expedited shipping solutions aligned with the Friday deadline","Turn 3: confirmed express shipping upgrade and gift wrapping as requested","Turn 3: invited further input for a gift message and additional support"],"misses":[],"reasoning":"Every assistant turn is contextually relevant, addresses Sarah's immediate needs, and maintains a professional, empathetic support persona. The agent consistently remembers key details and offers solutions directly tied to the conversation.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":925,"output":184},"duration_ms":1456,"started_at":"2026-03-09T10:16:30.614Z","ended_at":"2026-03-09T10:16:32.070Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional and empathetic tone matching support agent role","Turn 2: maintains context—addresses Sarah by name and references order #98765, delivers solution-focused options","Turn 3: addresses Sarah by name, updates her on order status, and offers further personalized support","Turn 3: maintains professional, empathetic, and solution-oriented persona throughout"],"misses":[],"reasoning":"All assistant turns consistently follow the assigned professional and empathetic support persona, maintain context, and reference customer name, order number, and deadline without breaking character.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":939,"output":190},"duration_ms":1438,"started_at":"2026-03-09T10:16:32.070Z","ended_at":"2026-03-09T10:16:33.508Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":0,"started_at":"2026-03-09T10:16:33.508Z","ended_at":"2026-03-09T10:16:33.508Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 1: Acknowledged prior router restart, skipped that step","Turn 2: Referred to orange blinking light user described in turn 4","Turn 2: Kept instructions simple, matching stated technical comfort level","Turn 2: Suggested watching WiFi and router lights, building on earlier troubleshooting steps","Turn 1: acknowledged router restart, avoided suggesting a repeat, and asked a clear follow-up for diagnostics","Turn 2: matched user's technical level and provided a simple, new troubleshooting step focusing on cable seating","Turn 3: offered next steps if issue recurs, avoided repeating restarted router, gave beginner-friendly instructions","Turn 1: empathetic, professional tone matching technical support role and adapted questions to beginner level","Turn 2: offered an easy next step and gave clear, simple instructions without repeating router restart","Turn 3: maintained helpful, patient tone; provided simple explanations and continued to avoid repeated steps"],"misses":[],"target":"default","reasoning":"context_retention: The assistant consistently retains and builds on context: it avoids repeating already-tried solutions, references the user's technical comfort, tracks symptoms, and provides sequenced, simple next steps. | conversation_relevancy: All assistant turns respond directly to the user's request, build appropriately on prior troubleshooting, match the user's technical comfort, and do not repeat already-tried solutions. | role_adherence: The assistant maintained the technical support persona consistently, adjusted complexity for the user's beginner level, and never repeated previously attempted steps across all turns.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Acknowledged prior router restart, skipped that step","Turn 2: Referred to orange blinking light user described in turn 4","Turn 2: Kept instructions simple, matching stated technical comfort level","Turn 2: Suggested watching WiFi and router lights, building on earlier troubleshooting steps"],"misses":[],"reasoning":"The assistant consistently retains and builds on context: it avoids repeating already-tried solutions, references the user's technical comfort, tracks symptoms, and provides sequenced, simple next steps.","details":{"scores_per_turn":[1,1],"relevant_turns":2,"total_turns":2},"token_usage":{"input":1004,"output":181},"duration_ms":2908,"started_at":"2026-03-09T10:16:28.589Z","ended_at":"2026-03-09T10:16:31.497Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: acknowledged router restart, avoided suggesting a repeat, and asked a clear follow-up for diagnostics","Turn 2: matched user's technical level and provided a simple, new troubleshooting step focusing on cable seating","Turn 3: offered next steps if issue recurs, avoided repeating restarted router, gave beginner-friendly instructions"],"misses":[],"reasoning":"All assistant turns respond directly to the user's request, build appropriately on prior troubleshooting, match the user's technical comfort, and do not repeat already-tried solutions.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":1025,"output":180},"duration_ms":1801,"started_at":"2026-03-09T10:16:31.497Z","ended_at":"2026-03-09T10:16:33.298Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic, professional tone matching technical support role and adapted questions to beginner level","Turn 2: offered an easy next step and gave clear, simple instructions without repeating router restart","Turn 3: maintained helpful, patient tone; provided simple explanations and continued to avoid repeated steps"],"misses":[],"reasoning":"The assistant maintained the technical support persona consistently, adjusted complexity for the user's beginner level, and never repeated previously attempted steps across all turns.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":1039,"output":170},"duration_ms":1399,"started_at":"2026-03-09T10:16:33.298Z","ended_at":"2026-03-09T10:16:34.697Z"}],"execution_status":"ok"} diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml new file mode 100644 index 00000000..514df1be --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml @@ -0,0 +1,260 @@ +--- +timestamp: 2026-03-09T09:22:00.279Z +test_id: support-troubleshooting-flow +dataset: dataset.eval +score: 1 +hits: + - "Turn 3: skipped router restart, referencing user's prior attempt" + - "Turn 5: provided instructions suited to technical beginner, no repeat steps" + - "Turn 6: referenced prior cable fix and 'simple' instructions" + - "Turn 6: continued to avoid repeated router restart, encouraged easy actions" + - "Turn 3: Directly asked about router light status to diagnose further" + - "Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step" + - "Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner" + - "Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple" + - "Turn 1: empathetic, clear, and skips router restart as instructed" + - "Turn 2: explains orange light in simple terms and gives an easy step" + - "Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions" +misses: [] +target: default +reasoning: "context_retention: All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps. | conversation_relevancy: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced. | role_adherence: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps." +input: + - role: system + content: |- + You are a technical support agent for NetConnect ISP. Adapt your + explanations to the customer's technical level. Never suggest steps + the customer has already tried. + - role: user + content: |- + My WiFi keeps dropping every 10 minutes. I already restarted my router + twice today. I'm not very tech-savvy so please keep it simple. + - role: assistant + content: |- + I'm sorry you're dealing with that — WiFi dropping out is really + frustrating! Since you've already restarted the router, let's skip that + step. Could you tell me: does the light on the front of your router stay + solid green, or does it blink or turn a different color when the WiFi drops? + - role: user + content: It blinks orange for a few seconds then goes back to green. + - role: assistant + content: |- + That orange blinking usually means the router is briefly losing its + connection to our network. Let's try something simple: can you find the + small cable plugged into the back of your router labeled "WAN" or + "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly. + Sometimes it just needs a better connection. + - role: user + content: OK I did that. What if it happens again? +scores: + - name: context_retention + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 3: skipped router restart, referencing user's prior attempt" + - "Turn 5: provided instructions suited to technical beginner, no repeat steps" + - "Turn 6: referenced prior cable fix and 'simple' instructions" + - "Turn 6: continued to avoid repeated router restart, encouraged easy actions" + misses: [] + reasoning: "All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps." + details: + scores_per_turn: + - 1 + - 1 + - 1 + - 1 + relevant_turns: 4 + total_turns: 4 + token_usage: + input: 964 + output: 174 + duration_ms: 1563 + started_at: 2026-03-09T09:21:52.423Z + ended_at: 2026-03-09T09:21:53.986Z + - name: conversation_relevancy + type: llm-judge + score: 1 + weight: 2 + verdict: pass + hits: + - "Turn 3: Directly asked about router light status to diagnose further" + - "Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step" + - "Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner" + - "Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple" + misses: [] + reasoning: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced. + details: + scores_per_turn: + - 1 + - 1 + - 1 + - 1 + on_topic_turns: 4 + total_turns: 4 + token_usage: + input: 985 + output: 201 + duration_ms: 2679 + started_at: 2026-03-09T09:21:53.986Z + ended_at: 2026-03-09T09:21:56.665Z + - name: role_adherence + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 1: empathetic, clear, and skips router restart as instructed" + - "Turn 2: explains orange light in simple terms and gives an easy step" + - "Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions" + misses: [] + reasoning: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps. + details: + scores_per_turn: + - 1 + - 1 + - 1 + consistent_turns: 3 + total_turns: 3 + token_usage: + input: 999 + output: 164 + duration_ms: 3613 + started_at: 2026-03-09T09:21:56.665Z + ended_at: 2026-03-09T09:22:00.278Z +execution_status: ok + +--- +timestamp: 2026-03-09T09:22:00.670Z +test_id: support-context-retention +dataset: dataset.eval +score: 1 +hits: + - "Turn 3: addressed Sarah by name, referenced order #98765" + - "Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765" + - "Turn 5: proposed solutions relevant to urgency and birthday gift context" + - "Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message" + - "Turn 3: directly addressed delivery delay and offered to check expedited options" + - "Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name" + - "Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message" + - "Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number" + - "Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions" + - "Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message" + - Output contains "#98765" +misses: [] +target: default +reasoning: "context_retention: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout. | conversation_relevancy: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information. | role_adherence: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role. | contains-#98765: Output contains \"#98765\"" +input: + - role: system + content: |- + You are a customer support agent for ShopFast. Be professional, empathetic, + and solution-oriented. Always address customers by name once introduced. + - role: user + content: |- + Hi, my name is Sarah. I placed an order last week and it still hasn't arrived. + My order number is #98765. + - role: assistant + content: |- + Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up + order #98765 for you right away. I can see it was shipped on March 3rd via + standard delivery. It appears there's been a delay at the regional sorting + facility. Would you like me to check on expedited options? + - role: user + content: Yes please. I really need it by Friday — it's a birthday gift for my daughter. + - role: assistant + content: |- + I completely understand the urgency, Sarah. A birthday gift definitely can't + be late! I've flagged order #98765 for priority handling. I can offer you two + options: (1) upgrade to express shipping at no extra cost, estimated delivery + Thursday, or (2) reroute to our nearest pickup location where it could be + available as early as Wednesday. + - role: user + content: The express shipping sounds great. Also, can you add gift wrapping? +scores: + - name: context_retention + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 3: addressed Sarah by name, referenced order #98765" + - "Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765" + - "Turn 5: proposed solutions relevant to urgency and birthday gift context" + - "Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message" + misses: [] + reasoning: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout. + details: + scores_per_turn: + - 1 + - 1 + - 1 + - 1 + relevant_turns: 4 + total_turns: 4 + token_usage: + input: 873 + output: 193 + duration_ms: 1625 + started_at: 2026-03-09T09:21:51.635Z + ended_at: 2026-03-09T09:21:53.260Z + - name: conversation_relevancy + type: llm-judge + score: 1 + weight: 2 + verdict: pass + hits: + - "Turn 3: directly addressed delivery delay and offered to check expedited options" + - "Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name" + - "Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message" + misses: [] + reasoning: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information. + details: + scores_per_turn: + - 1 + - 1 + - 1 + on_topic_turns: 3 + total_turns: 3 + token_usage: + input: 894 + output: 170 + duration_ms: 3433 + started_at: 2026-03-09T09:21:53.260Z + ended_at: 2026-03-09T09:21:56.693Z + - name: role_adherence + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number" + - "Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions" + - "Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message" + misses: [] + reasoning: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role. + details: + scores_per_turn: + - 1 + - 1 + - 1 + consistent_turns: 3 + total_turns: 3 + token_usage: + input: 908 + output: 179 + duration_ms: 3976 + started_at: 2026-03-09T09:21:56.693Z + ended_at: 2026-03-09T09:22:00.669Z + - name: contains-#98765 + type: contains + score: 1 + weight: 1 + verdict: pass + hits: + - Output contains "#98765" + misses: [] + reasoning: Output contains "#98765" + duration_ms: 1 + started_at: 2026-03-09T09:22:00.669Z + ended_at: 2026-03-09T09:22:00.670Z +execution_status: ok diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml new file mode 100644 index 00000000..8635c656 --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml @@ -0,0 +1,137 @@ +# Multi-Turn Conversation Evaluation Example +# Demonstrates evaluating conversation-level qualities using composable llm-judge prompts +# with structured per-turn score breakdowns via the `details` field. + +description: Multi-turn conversation evaluation with per-turn score breakdown + +execution: + target: default + +tests: + # ========================================== + # Test 1: Customer support — order issue with context retention + # The agent must remember the customer's name, order number, and delivery + # deadline across 4+ turns while maintaining a professional persona. + # ========================================== + - id: support-context-retention + criteria: |- + Agent maintains context across all turns: remembers customer name (Sarah), + order number (#98765), and Friday delivery deadline. Provides relevant, + helpful responses. Maintains professional support persona throughout. + + input: + - role: system + content: |- + You are a customer support agent for ShopFast. Be professional, empathetic, + and solution-oriented. Always address customers by name once introduced. + - role: user + content: |- + Hi, my name is Sarah. I placed an order last week and it still hasn't arrived. + My order number is #98765. + - role: assistant + content: |- + Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up + order #98765 for you right away. I can see it was shipped on March 3rd via + standard delivery. It appears there's been a delay at the regional sorting + facility. Would you like me to check on expedited options? + - role: user + content: |- + Yes please. I really need it by Friday — it's a birthday gift for my daughter. + - role: assistant + content: |- + I completely understand the urgency, Sarah. A birthday gift definitely can't + be late! I've flagged order #98765 for priority handling. I can offer you two + options: (1) upgrade to express shipping at no extra cost, estimated delivery + Thursday, or (2) reroute to our nearest pickup location where it could be + available as early as Wednesday. + - role: user + content: |- + The express shipping sounds great. Also, can you add gift wrapping? + + expected_output: + - role: assistant + content: |- + Absolutely, Sarah! I've upgraded order #98765 to express shipping — it should + arrive by Thursday, well before Friday. I've also added complimentary gift + wrapping since this is a special occasion. You'll receive a tracking update + shortly. Is there anything else I can help with for your daughter's birthday? + + assert: + # Conversation-level llm-judge evaluators with per-turn details + - name: context_retention + type: llm-judge + prompt: ../judges/context-retention.md + required: true + - name: conversation_relevancy + type: llm-judge + prompt: ../judges/conversation-relevancy.md + weight: 2 + - name: role_adherence + type: llm-judge + prompt: ../judges/role-adherence.md + # Deterministic assertion — verifies order number is referenced + - type: contains + value: "#98765" + + # ========================================== + # Test 2: Technical support — multi-step troubleshooting + # The agent must track the problem context, steps already tried, + # and the user's technical level across turns. + # ========================================== + - id: support-troubleshooting-flow + criteria: |- + Agent tracks problem context (WiFi connectivity), steps already attempted + (router restart), and user's stated technical comfort level (beginner) + across turns. Does not repeat already-tried solutions. Adjusts explanation + complexity to match user's level. + + input: + - role: system + content: |- + You are a technical support agent for NetConnect ISP. Adapt your + explanations to the customer's technical level. Never suggest steps + the customer has already tried. + - role: user + content: |- + My WiFi keeps dropping every 10 minutes. I already restarted my router + twice today. I'm not very tech-savvy so please keep it simple. + - role: assistant + content: |- + I'm sorry you're dealing with that — WiFi dropping out is really + frustrating! Since you've already restarted the router, let's skip that + step. Could you tell me: does the light on the front of your router stay + solid green, or does it blink or turn a different color when the WiFi drops? + - role: user + content: |- + It blinks orange for a few seconds then goes back to green. + - role: assistant + content: |- + That orange blinking usually means the router is briefly losing its + connection to our network. Let's try something simple: can you find the + small cable plugged into the back of your router labeled "WAN" or + "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly. + Sometimes it just needs a better connection. + - role: user + content: |- + OK I did that. What if it happens again? + + expected_output: + - role: assistant + content: |- + Great, let's see if that helps! If the WiFi drops again in the next + 30 minutes, it might be an issue on our end. In that case, I'll schedule + a technician visit at a time that works for you — no charge. Just let me + know if it happens again. + + assert: + - name: context_retention + type: llm-judge + prompt: ../judges/context-retention.md + required: true + - name: conversation_relevancy + type: llm-judge + prompt: ../judges/conversation-relevancy.md + weight: 2 + - name: role_adherence + type: llm-judge + prompt: ../judges/role-adherence.md diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md new file mode 100644 index 00000000..f802ca24 --- /dev/null +++ b/examples/features/multi-turn-conversation/judges/context-retention.md @@ -0,0 +1,38 @@ +You are evaluating whether an AI assistant retains context from earlier turns +in a multi-turn conversation. + +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate ALL assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. + +For each assistant turn, check whether the assistant correctly references or +builds on information introduced in previous turns (e.g., names, numbers, +constraints, preferences). + +Score each assistant turn: +- 1.0 if the turn demonstrates awareness of relevant earlier context +- 0.5 if the turn partially retains context (e.g., remembers some details but + forgets others) +- 0.0 if the turn ignores or contradicts earlier context + +In your `hits`, reference specific turns where context was retained +(e.g., "Turn 2: correctly recalled customer name"). +In your `misses`, reference specific turns where context was lost +(e.g., "Turn 4: forgot delivery deadline from turn 1"). + +In your `details`, return: +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn +- `relevant_turns`: count of turns that demonstrated context retention +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation (all turns with roles) ## ]] +{{ input }} + +[[ ## agent response (final turn) ## ]] +{{ answer }} diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md new file mode 100644 index 00000000..eb527afe --- /dev/null +++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md @@ -0,0 +1,39 @@ +You are evaluating whether each assistant response in a multi-turn conversation +is relevant to the user's current request AND the broader conversation context. + +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate ALL assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. + +For each assistant turn, assess: +- Does the response directly address what the user asked? +- Is the response appropriate given the full conversation history? +- Does the assistant avoid tangential or off-topic information? + +Score each assistant turn: +- 1.0 if the response is fully relevant to the user's request and context +- 0.5 if the response is partially relevant but includes unnecessary + information or misses part of the request +- 0.0 if the response is off-topic or fails to address the user's request + +In your `hits`, note turns where responses were well-targeted +(e.g., "Turn 3: directly addressed the user's shipping question"). +In your `misses`, note turns where relevance was lacking +(e.g., "Turn 2: provided unnecessary technical details"). + +In your `details`, return: +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn +- `on_topic_turns`: count of turns scored 1.0 +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation (all turns with roles) ## ]] +{{ input }} + +[[ ## agent response (final turn) ## ]] +{{ answer }} diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md new file mode 100644 index 00000000..fa7dc838 --- /dev/null +++ b/examples/features/multi-turn-conversation/judges/role-adherence.md @@ -0,0 +1,41 @@ +You are evaluating whether an AI assistant maintains its assigned persona and +role consistently across a multi-turn conversation. + +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate ALL assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. + +Consider the system prompt (if present) as the role definition. For each +assistant turn, assess: +- Does the assistant stay in character? +- Is the tone consistent with the assigned role? +- Does the assistant avoid breaking character or introducing behavior + inconsistent with its role? + +Score each assistant turn: +- 1.0 if the turn fully adheres to the assigned role and tone +- 0.5 if the turn partially adheres but shows inconsistency (e.g., shifts + from professional to casual) +- 0.0 if the turn breaks character or contradicts the assigned role + +In your `hits`, note turns where role was well-maintained +(e.g., "Turn 1: professional and empathetic tone matching support role"). +In your `misses`, note turns where role slipped +(e.g., "Turn 3: used overly casual language inconsistent with role"). + +In your `details`, return: +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn +- `consistent_turns`: count of turns scored 1.0 +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation (all turns with roles) ## ]] +{{ input }} + +[[ ## agent response (final turn) ## ]] +{{ answer }} diff --git a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts index 984fb8cf..14af7f25 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts @@ -68,7 +68,7 @@ function assembleFreeform( : evalCase.question; const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2), + [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2), [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2), [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(), diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 46125f3e..4bd97b6a 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -6,6 +6,7 @@ import { extractLastAssistantContent } from '../providers/types.js'; import { TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; import type { JsonObject, RubricItem } from '../types.js'; +import { isJsonObject } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; @@ -45,6 +46,10 @@ const freeformEvaluationSchema = z.object({ hits: z.array(z.string()).describe('Brief specific achievements').optional(), misses: z.array(z.string()).describe('Brief failures or omissions').optional(), reasoning: z.string().describe('Concise explanation (1-2 sentences)').optional(), + details: z + .record(z.unknown()) + .describe('Optional structured metadata for domain-specific metrics') + .optional(), }); const rubricCheckResultSchema = z.object({ @@ -115,7 +120,11 @@ export class LlmJudgeEvaluator implements Evaluator { // Prepare template variables for substitution const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input_segments, null, 2), + [TEMPLATE_VARIABLES.INPUT]: JSON.stringify( + resolveInputWithFileContent(context.evalCase.input, context.evalCase.input_segments), + null, + 2, + ), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify( context.evalCase.expected_output, null, @@ -174,6 +183,7 @@ export class LlmJudgeEvaluator implements Evaluator { expectedAspectCount, reasoning, evaluatorRawRequest, + details: data.details as JsonObject | undefined, tokenUsage, }; } catch (e: unknown) { @@ -486,7 +496,8 @@ export function buildOutputSchema(): string { ' "score": ,', ' "hits": [],', ' "misses": [],', - ' "reasoning": ""', + ' "reasoning": "",', + ' "details": {}', '}', ].join('\n'); } @@ -656,3 +667,55 @@ function calculateScoreRangeResult( }, }; } + +/** + * Resolve file references in input messages using resolved content from input_segments. + * + * `input` (TestMessage[]) contains raw file references with paths but no content. + * `input_segments` (JsonObject[]) contains resolved file segments with actual file text. + * This function produces a copy of `input` where file segments include the resolved `text`. + */ +function resolveInputWithFileContent( + input: readonly JsonObject[], + inputSegments: readonly JsonObject[], +): readonly JsonObject[] { + // Build a map of file path -> resolved text from input_segments + const fileContentsByPath = new Map(); + for (const segment of inputSegments) { + if ( + segment.type === 'file' && + typeof segment.path === 'string' && + typeof segment.text === 'string' + ) { + fileContentsByPath.set(segment.path, segment.text); + } + } + + if (fileContentsByPath.size === 0) { + return input; + } + + return input.map((message) => { + const content = message.content; + if (!Array.isArray(content)) { + return message; + } + + let modified = false; + const resolvedContent = (content as readonly JsonObject[]).map((segment) => { + if (!isJsonObject(segment)) { + return segment; + } + if (segment.type === 'file' && typeof segment.value === 'string') { + const fileText = fileContentsByPath.get(segment.value); + if (fileText !== undefined) { + modified = true; + return { ...segment, text: fileText }; + } + } + return segment; + }); + + return modified ? { ...message, content: resolvedContent } : message; + }); +} diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index c35643e1..fb21b0f2 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -94,9 +94,10 @@ File Changes: {{file_changes}} expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify input JSON stringification + // Verify input JSON stringification (includes role annotations) expect(request?.question).toContain('Input Messages: ['); - expect(request?.question).toContain('"value": "Input Message"'); + expect(request?.question).toContain('"role": "user"'); + expect(request?.question).toContain('"content": "User Input Message"'); // Verify expected_output JSON stringification expect(request?.question).toContain('Expected Messages: ['); @@ -190,13 +191,70 @@ Expected Messages: {{ expected_output }} expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify JSON stringified variables were also substituted + // Verify JSON stringified variables were also substituted (includes role annotations) expect(request?.question).toContain('Input Messages: ['); - expect(request?.question).toContain('"value": "Input Message"'); + expect(request?.question).toContain('"content": "User Input Message"'); expect(request?.question).toContain('Expected Messages: ['); expect(request?.question).toContain('"value": "Expected Output Message"'); // Verify no unreplaced template markers remain expect(request?.question).not.toMatch(/\{\{\s*\w+\s*\}\}/); }); + + it('resolves file references in {{ input }} using input_segments content', async () => { + const testCaseWithFiles: EvalTest = { + ...baseTestCase, + input: [ + { + role: 'user', + content: [ + { type: 'file', value: 'src/app.ts' }, + { type: 'text', value: 'Review this code' }, + ], + }, + ], + input_segments: [ + { type: 'file', path: 'src/app.ts', text: 'console.log("hello world");' }, + { type: 'text', value: 'Review this code' }, + ], + }; + + const customPrompt = 'Input: {{ input }}'; + + const judgeProvider = new CapturingProvider({ + text: JSON.stringify({ + score: 0.9, + hits: ['Good'], + misses: [], + reasoning: 'OK', + }), + }); + + const evaluator = new LlmJudgeEvaluator({ + resolveJudgeProvider: async () => judgeProvider, + evaluatorTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: { ...testCaseWithFiles, evaluator: 'llm-judge' }, + candidate: 'Looks good', + target: baseTarget, + provider: judgeProvider, + attempt: 0, + promptInputs: { question: 'Review this code', guidelines: '' }, + now: new Date(), + }); + + const request = judgeProvider.lastRequest; + expect(request).toBeDefined(); + + // File content from input_segments should be resolved into the {{ input }} variable + // Content is JSON-stringified so quotes are escaped + expect(request?.question).toContain('console.log'); + expect(request?.question).toContain('hello world'); + // The resolved segment should have a "text" field with the file content + expect(request?.question).toContain('"text"'); + // Original file path should still be present + expect(request?.question).toContain('src/app.ts'); + }); });