From 98fa6e4858c8d27104d9ffeeaab0761685dc9e8f Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 06:46:25 +0000 Subject: [PATCH 01/13] feat: add details field to freeform llm-judge schema Adds optional details field to freeform evaluation schema for consistency with code-judge and score-range rubric mode. Allows llm-judge prompts to return structured domain-specific metrics alongside score/hits/misses. Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/evaluators/llm-judge.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 46125f3e..2bdf7a75 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -45,6 +45,7 @@ const freeformEvaluationSchema = z.object({ hits: z.array(z.string()).describe('Brief specific achievements').optional(), misses: z.array(z.string()).describe('Brief failures or omissions').optional(), reasoning: z.string().describe('Concise explanation (1-2 sentences)').optional(), + details: z.record(z.unknown()).describe('Optional structured metadata for domain-specific metrics').optional(), }); const rubricCheckResultSchema = z.object({ @@ -174,6 +175,7 @@ export class LlmJudgeEvaluator implements Evaluator { expectedAspectCount, reasoning, evaluatorRawRequest, + details: data.details as JsonObject | undefined, tokenUsage, }; } catch (e: unknown) { @@ -486,7 +488,8 @@ export function buildOutputSchema(): string { ' "score": ,', ' "hits": [],', ' "misses": [],', - ' "reasoning": ""', + ' "reasoning": "",', + ' "details": {}', '}', ].join('\n'); } From 5a6c9d33d97f5b5d3c29a6f5af79b4064d22273d Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 06:49:52 +0000 Subject: [PATCH 02/13] docs: add multi-turn conversation eval dataset with composable asserts --- .../evals/dataset.eval.yaml | 137 ++++++++++++++++++ 1 
file changed, 137 insertions(+) create mode 100644 examples/features/multi-turn-conversation/evals/dataset.eval.yaml diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml new file mode 100644 index 00000000..8635c656 --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml @@ -0,0 +1,137 @@ +# Multi-Turn Conversation Evaluation Example +# Demonstrates evaluating conversation-level qualities using composable llm-judge prompts +# with structured per-turn score breakdowns via the `details` field. + +description: Multi-turn conversation evaluation with per-turn score breakdown + +execution: + target: default + +tests: + # ========================================== + # Test 1: Customer support — order issue with context retention + # The agent must remember the customer's name, order number, and delivery + # deadline across 4+ turns while maintaining a professional persona. + # ========================================== + - id: support-context-retention + criteria: |- + Agent maintains context across all turns: remembers customer name (Sarah), + order number (#98765), and Friday delivery deadline. Provides relevant, + helpful responses. Maintains professional support persona throughout. + + input: + - role: system + content: |- + You are a customer support agent for ShopFast. Be professional, empathetic, + and solution-oriented. Always address customers by name once introduced. + - role: user + content: |- + Hi, my name is Sarah. I placed an order last week and it still hasn't arrived. + My order number is #98765. + - role: assistant + content: |- + Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up + order #98765 for you right away. I can see it was shipped on March 3rd via + standard delivery. It appears there's been a delay at the regional sorting + facility. Would you like me to check on expedited options? 
+ - role: user + content: |- + Yes please. I really need it by Friday — it's a birthday gift for my daughter. + - role: assistant + content: |- + I completely understand the urgency, Sarah. A birthday gift definitely can't + be late! I've flagged order #98765 for priority handling. I can offer you two + options: (1) upgrade to express shipping at no extra cost, estimated delivery + Thursday, or (2) reroute to our nearest pickup location where it could be + available as early as Wednesday. + - role: user + content: |- + The express shipping sounds great. Also, can you add gift wrapping? + + expected_output: + - role: assistant + content: |- + Absolutely, Sarah! I've upgraded order #98765 to express shipping — it should + arrive by Thursday, well before Friday. I've also added complimentary gift + wrapping since this is a special occasion. You'll receive a tracking update + shortly. Is there anything else I can help with for your daughter's birthday? + + assert: + # Conversation-level llm-judge evaluators with per-turn details + - name: context_retention + type: llm-judge + prompt: ../judges/context-retention.md + required: true + - name: conversation_relevancy + type: llm-judge + prompt: ../judges/conversation-relevancy.md + weight: 2 + - name: role_adherence + type: llm-judge + prompt: ../judges/role-adherence.md + # Deterministic assertion — verifies order number is referenced + - type: contains + value: "#98765" + + # ========================================== + # Test 2: Technical support — multi-step troubleshooting + # The agent must track the problem context, steps already tried, + # and the user's technical level across turns. + # ========================================== + - id: support-troubleshooting-flow + criteria: |- + Agent tracks problem context (WiFi connectivity), steps already attempted + (router restart), and user's stated technical comfort level (beginner) + across turns. Does not repeat already-tried solutions. 
Adjusts explanation + complexity to match user's level. + + input: + - role: system + content: |- + You are a technical support agent for NetConnect ISP. Adapt your + explanations to the customer's technical level. Never suggest steps + the customer has already tried. + - role: user + content: |- + My WiFi keeps dropping every 10 minutes. I already restarted my router + twice today. I'm not very tech-savvy so please keep it simple. + - role: assistant + content: |- + I'm sorry you're dealing with that — WiFi dropping out is really + frustrating! Since you've already restarted the router, let's skip that + step. Could you tell me: does the light on the front of your router stay + solid green, or does it blink or turn a different color when the WiFi drops? + - role: user + content: |- + It blinks orange for a few seconds then goes back to green. + - role: assistant + content: |- + That orange blinking usually means the router is briefly losing its + connection to our network. Let's try something simple: can you find the + small cable plugged into the back of your router labeled "WAN" or + "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly. + Sometimes it just needs a better connection. + - role: user + content: |- + OK I did that. What if it happens again? + + expected_output: + - role: assistant + content: |- + Great, let's see if that helps! If the WiFi drops again in the next + 30 minutes, it might be an issue on our end. In that case, I'll schedule + a technician visit at a time that works for you — no charge. Just let me + know if it happens again. 
+ + assert: + - name: context_retention + type: llm-judge + prompt: ../judges/context-retention.md + required: true + - name: conversation_relevancy + type: llm-judge + prompt: ../judges/conversation-relevancy.md + weight: 2 + - name: role_adherence + type: llm-judge + prompt: ../judges/role-adherence.md From 444f048f729f628c0b12736a9725c46c21fcfa63 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 06:50:04 +0000 Subject: [PATCH 03/13] docs: add README for multi-turn conversation evaluation example --- .../multi-turn-conversation/README.md | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 examples/features/multi-turn-conversation/README.md diff --git a/examples/features/multi-turn-conversation/README.md b/examples/features/multi-turn-conversation/README.md new file mode 100644 index 00000000..aa18b94e --- /dev/null +++ b/examples/features/multi-turn-conversation/README.md @@ -0,0 +1,37 @@ +# Multi-Turn Conversation Evaluation + +Demonstrates evaluating multi-turn conversation quality using composable +`llm-judge` prompt templates with per-turn score breakdowns. + +## What this shows + +1. **Multi-turn input** — conversations with 4+ user/assistant turns where + context retention matters +2. **Conversation-aware judge prompts** — markdown templates that receive the + full `{{output}}` Message[] array and evaluate conversation-level qualities +3. **Per-turn score breakdown** — judges return structured `details` with + per-turn scores, not just a flat conversation score +4. **Composability** — multiple llm-judge evaluators combined with + deterministic assertions (e.g., `contains`) + +## Judge dimensions + +| Judge | What it evaluates | +|-------|-------------------| +| `context-retention.md` | Does the agent remember information from earlier turns? | +| `conversation-relevancy.md` | Are responses relevant to the current request and conversation? | +| `role-adherence.md` | Does the agent maintain its assigned persona? 
| + +## Running + +```bash +agentv run default --filter multi-turn-conversation +``` + +## Creating your own conversation evaluator + +1. Create a markdown file in `judges/` +2. Use `{{ input }}` to receive the conversation history and `{{ output }}` to receive the agent's response Message[] array +3. Use `{{ criteria }}` for the test-specific evaluation criteria +4. Instruct the judge to return `details` with per-turn metrics +5. Reference it in your YAML (path relative to the eval file): `type: llm-judge` / `prompt: ../judges/your-judge.md` From ede62f6e87088c01b0a295c04e43cd17c6b32aee Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 06:50:19 +0000 Subject: [PATCH 04/13] docs: add llm-judge prompt templates for multi-turn conversation eval Co-Authored-By: Claude Opus 4.6 --- .../judges/context-retention.md | 30 +++++++++++++++++ .../judges/conversation-relevancy.md | 31 +++++++++++++++++ .../judges/role-adherence.md | 33 +++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 examples/features/multi-turn-conversation/judges/context-retention.md create mode 100644 examples/features/multi-turn-conversation/judges/conversation-relevancy.md create mode 100644 examples/features/multi-turn-conversation/judges/role-adherence.md diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md new file mode 100644 index 00000000..ebbe7f9a --- /dev/null +++ b/examples/features/multi-turn-conversation/judges/context-retention.md @@ -0,0 +1,30 @@ +You are evaluating whether an AI assistant retains context from earlier turns +in a multi-turn conversation. + +Analyze the full conversation output below. For each assistant turn, check +whether the assistant correctly references or builds on information introduced +in previous turns (e.g., names, numbers, constraints, preferences).
+ +Score each assistant turn: +- 1.0 if the turn demonstrates awareness of relevant earlier context +- 0.5 if the turn partially retains context (e.g., remembers some details but + forgets others) +- 0.0 if the turn ignores or contradicts earlier context + +In your `hits`, reference specific turns where context was retained +(e.g., "Turn 2: correctly recalled customer name"). +In your `misses`, reference specific turns where context was lost +(e.g., "Turn 4: forgot delivery deadline from turn 1"). + +In your `details`, return: +- `scores_per_turn`: array of scores (0.0–1.0) for each assistant turn +- `relevant_turns`: count of turns that demonstrated context retention +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation ## ]] +{{ output }} diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md new file mode 100644 index 00000000..6bbada03 --- /dev/null +++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md @@ -0,0 +1,31 @@ +You are evaluating whether each assistant response in a multi-turn conversation +is relevant to the user's current request AND the broader conversation context. + +Analyze the full conversation output below. For each assistant turn, assess: +- Does the response directly address what the user asked? +- Is the response appropriate given the full conversation history? +- Does the assistant avoid tangential or off-topic information? 
+ +Score each assistant turn: +- 1.0 if the response is fully relevant to the user's request and context +- 0.5 if the response is partially relevant but includes unnecessary + information or misses part of the request +- 0.0 if the response is off-topic or fails to address the user's request + +In your `hits`, note turns where responses were well-targeted +(e.g., "Turn 3: directly addressed the user's shipping question"). +In your `misses`, note turns where relevance was lacking +(e.g., "Turn 2: provided unnecessary technical details"). + +In your `details`, return: +- `scores_per_turn`: array of scores (0.0–1.0) for each assistant turn +- `on_topic_turns`: count of turns scored 1.0 +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation ## ]] +{{ output }} diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md new file mode 100644 index 00000000..16eabf5c --- /dev/null +++ b/examples/features/multi-turn-conversation/judges/role-adherence.md @@ -0,0 +1,33 @@ +You are evaluating whether an AI assistant maintains its assigned persona and +role consistently across a multi-turn conversation. + +Analyze the full conversation output below. Consider the system prompt (if +present) as the role definition. For each assistant turn, assess: +- Does the assistant stay in character? +- Is the tone consistent with the assigned role? +- Does the assistant avoid breaking character or introducing behavior + inconsistent with its role? 
+ +Score each assistant turn: +- 1.0 if the turn fully adheres to the assigned role and tone +- 0.5 if the turn partially adheres but shows inconsistency (e.g., shifts + from professional to casual) +- 0.0 if the turn breaks character or contradicts the assigned role + +In your `hits`, note turns where role was well-maintained +(e.g., "Turn 1: professional and empathetic tone matching support role"). +In your `misses`, note turns where role slipped +(e.g., "Turn 3: used overly casual language inconsistent with role"). + +In your `details`, return: +- `scores_per_turn`: array of scores (0.0–1.0) for each assistant turn +- `consistent_turns`: count of turns scored 1.0 +- `total_turns`: total number of assistant turns evaluated + +Your overall `score` should be the average of per-turn scores. + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## conversation ## ]] +{{ output }} From b366c43cbca96ae80c91bbe7fc3bf5c931139007 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 07:17:05 +0000 Subject: [PATCH 05/13] style: fix biome formatting for details field chain Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/evaluators/llm-judge.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 2bdf7a75..1fb81e71 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -45,7 +45,10 @@ const freeformEvaluationSchema = z.object({ hits: z.array(z.string()).describe('Brief specific achievements').optional(), misses: z.array(z.string()).describe('Brief failures or omissions').optional(), reasoning: z.string().describe('Concise explanation (1-2 sentences)').optional(), - details: z.record(z.unknown()).describe('Optional structured metadata for domain-specific metrics').optional(), + details: z + .record(z.unknown()) + .describe('Optional structured metadata for 
domain-specific metrics') + .optional(), }); const rubricCheckResultSchema = z.object({ From 8603baf4fa978ccb3620d1228f241205826d0cb9 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 08:41:46 +0000 Subject: [PATCH 06/13] test: add e2e baseline and fix judge templates to include {{ answer }} Judge templates require {{ answer }} or {{ expected_output }} to pass prompt validation. Added {{ answer }} section to all 3 judge templates. Generated baseline from successful e2e run (2/2 tests passing). Co-Authored-By: Claude Opus 4.6 --- .../multi-turn-conversation/evals/dataset.eval.baseline.jsonl | 2 ++ .../multi-turn-conversation/judges/context-retention.md | 3 +++ .../multi-turn-conversation/judges/conversation-relevancy.md | 3 +++ .../features/multi-turn-conversation/judges/role-adherence.md | 3 +++ 4 files changed, 11 insertions(+) create mode 100644 examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl new file mode 100644 index 00000000..4bdb9aa0 --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -0,0 +1,2 @@ +{"timestamp":"2026-03-09T08:40:41.172Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 1: Acknowledges user already restarted router ('thanks for trying that')","Turn 1: Maintains focus on WiFi connectivity issue","Turn 1: Provides beginner-friendly, stepwise troubleshooting","Turn 1: Offers escalation options without repeating prior steps","Turn 1: Provided step-by-step guidance relevant to ongoing WiFi connectivity issue","Turn 1: Avoided repeating already-attempted solution (router restart)","Turn 1: Adjusted explanation to a beginner level with clear instructions","Turn 1: Offered escalation options if problem persists","Turn 1: Maintains a professional and 
supportive tone throughout","Turn 1: Avoids repeating already-tried solution (router restart)","Turn 1: Provides clear, simple steps suitable for a beginner","Turn 1: Tracks problem context and offers next logical troubleshooting steps"],"misses":[],"target":"default","reasoning":"context_retention: The assistant correctly references the user's prior action (router restart), stays on-topic with troubleshooting, avoids repeating already-tried steps, and adjusts explanations for a beginner, retaining all relevant context. | conversation_relevancy: The response is entirely relevant, tracks the specific issue and past steps, does not repeat prior solutions, and is tailored to a beginner's needs as indicated in the context. | role_adherence: The assistant stays perfectly in character as a support agent: empathetic, clear, and appropriately simple in its explanations, without repetition or unnecessary complexity.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? 
Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Acknowledges user already restarted router ('thanks for trying that')","Turn 1: Maintains focus on WiFi connectivity issue","Turn 1: Provides beginner-friendly, stepwise troubleshooting","Turn 1: Offers escalation options without repeating prior steps"],"misses":[],"reasoning":"The assistant correctly references the user's prior action (router restart), stays on-topic with troubleshooting, avoids repeating already-tried steps, and adjusts explanations for a beginner, retaining all relevant context.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":769,"output":168},"duration_ms":2943,"started_at":"2026-03-09T08:40:34.055Z","ended_at":"2026-03-09T08:40:36.998Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: Provided step-by-step guidance relevant to ongoing WiFi connectivity issue","Turn 1: Avoided repeating already-attempted solution (router restart)","Turn 1: Adjusted explanation to a beginner level with clear instructions","Turn 1: Offered escalation options if problem persists"],"misses":[],"reasoning":"The response is entirely relevant, tracks the specific issue and past steps, does not repeat prior solutions, and is tailored to a beginner's needs as indicated in the context.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":790,"output":169},"duration_ms":2501,"started_at":"2026-03-09T08:40:36.998Z","ended_at":"2026-03-09T08:40:39.499Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Maintains a professional and supportive tone throughout","Turn 1: Avoids repeating already-tried 
solution (router restart)","Turn 1: Provides clear, simple steps suitable for a beginner","Turn 1: Tracks problem context and offers next logical troubleshooting steps"],"misses":[],"reasoning":"The assistant stays perfectly in character as a support agent: empathetic, clear, and appropriately simple in its explanations, without repetition or unnecessary complexity.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":804,"output":160},"duration_ms":1672,"started_at":"2026-03-09T08:40:39.499Z","ended_at":"2026-03-09T08:40:41.171Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T08:40:41.217Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 1: correctly used customer name (Sarah)","Turn 1: referenced delivery deadline (arrives by Thursday, ahead of Friday deadline)","Turn 1: acknowledged reason for urgency (daughter’s birthday)","Turn 1: maintained professional support persona","Turn 1: directly upgraded shipping to meet Friday delivery deadline","Turn 1: added gift wrapping for daughter's birthday","Turn 1: maintained knowledge of customer's name (Sarah) and context","Turn 1: kept response focused and professional","Turn 1: maintained professional and empathetic support persona","Turn 1: referenced customer name (Sarah) and context (daughter’s birthday)","Turn 1: provided relevant support actions (shipping upgrade, gift wrapping)","Turn 1: offered further assistance and clear next steps"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant fully referenced all relevant context from prior turns: customer name, delivery constraints, and the occasion, while maintaining a professional support tone. | conversation_relevancy: The assistant's response is fully relevant, addresses all contextual requirements, and does not include any unnecessary or off-topic information. 
| role_adherence: The assistant consistently displays a professional, helpful support persona, recalls key context, and addresses the customer's needs in a single, contextually rich turn. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. 
Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: correctly used customer name (Sarah)","Turn 1: referenced delivery deadline (arrives by Thursday, ahead of Friday deadline)","Turn 1: acknowledged reason for urgency (daughter’s birthday)","Turn 1: maintained professional support persona"],"misses":[],"reasoning":"The assistant fully referenced all relevant context from prior turns: customer name, delivery constraints, and the occasion, while maintaining a professional support tone.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":614,"output":155},"duration_ms":4036,"started_at":"2026-03-09T08:40:32.351Z","ended_at":"2026-03-09T08:40:36.387Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: directly upgraded shipping to meet Friday delivery deadline","Turn 1: added gift wrapping for daughter's birthday","Turn 1: maintained knowledge of customer's name (Sarah) and context","Turn 1: kept response focused and professional"],"misses":[],"reasoning":"The assistant's response is fully relevant, addresses all contextual requirements, and does not include any unnecessary or off-topic information.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":635,"output":149},"duration_ms":2907,"started_at":"2026-03-09T08:40:36.387Z","ended_at":"2026-03-09T08:40:39.294Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: maintained professional and empathetic support persona","Turn 1: referenced customer name (Sarah) and context (daughter’s birthday)","Turn 1: provided relevant support actions (shipping upgrade, gift wrapping)","Turn 1: offered further assistance and clear next steps"],"misses":[],"reasoning":"The assistant consistently displays a professional, helpful support persona, recalls key 
context, and addresses the customer's needs in a single, contextually rich turn.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":649,"output":160},"duration_ms":1922,"started_at":"2026-03-09T08:40:39.294Z","ended_at":"2026-03-09T08:40:41.216Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":1,"started_at":"2026-03-09T08:40:41.216Z","ended_at":"2026-03-09T08:40:41.217Z"}],"execution_status":"ok"} diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md index ebbe7f9a..4e2f07dd 100644 --- a/examples/features/multi-turn-conversation/judges/context-retention.md +++ b/examples/features/multi-turn-conversation/judges/context-retention.md @@ -26,5 +26,8 @@ Your overall `score` should be the average of per-turn scores. [[ ## criteria ## ]] {{ criteria }} +[[ ## answer ## ]] +{{ answer }} + [[ ## conversation ## ]] {{ output }} diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md index 6bbada03..c42cdee4 100644 --- a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md +++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md @@ -27,5 +27,8 @@ Your overall `score` should be the average of per-turn scores. 
[[ ## criteria ## ]] {{ criteria }} +[[ ## answer ## ]] +{{ answer }} + [[ ## conversation ## ]] {{ output }} diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md index 16eabf5c..0f8fa0a0 100644 --- a/examples/features/multi-turn-conversation/judges/role-adherence.md +++ b/examples/features/multi-turn-conversation/judges/role-adherence.md @@ -29,5 +29,8 @@ Your overall `score` should be the average of per-turn scores. [[ ## criteria ## ]] {{ criteria }} +[[ ## answer ## ]] +{{ answer }} + [[ ## conversation ## ]] {{ output }} From 8ac6f4dddac4ae08fc7711df8263d0482026cf9b Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 09:02:07 +0000 Subject: [PATCH 07/13] fix: pass full conversation history to judge templates via {{ input }} Judges were only seeing the agent's single output turn. Added {{ input }} to all 3 templates so judges evaluate context retention against the full conversation history. Re-generated baseline with corrected templates. 
Co-Authored-By: Claude Opus 4.6 --- .../evals/dataset.eval.baseline.jsonl | 4 +- .../evals/dataset.eval.baseline.yaml | 251 ++++++++++++++++++ .../judges/context-retention.md | 7 +- .../judges/conversation-relevancy.md | 7 +- .../judges/role-adherence.md | 7 +- 5 files changed, 268 insertions(+), 8 deletions(-) create mode 100644 examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl index 4bdb9aa0..922a8ecb 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-03-09T08:40:41.172Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 1: Acknowledges user already restarted router ('thanks for trying that')","Turn 1: Maintains focus on WiFi connectivity issue","Turn 1: Provides beginner-friendly, stepwise troubleshooting","Turn 1: Offers escalation options without repeating prior steps","Turn 1: Provided step-by-step guidance relevant to ongoing WiFi connectivity issue","Turn 1: Avoided repeating already-attempted solution (router restart)","Turn 1: Adjusted explanation to a beginner level with clear instructions","Turn 1: Offered escalation options if problem persists","Turn 1: Maintains a professional and supportive tone throughout","Turn 1: Avoids repeating already-tried solution (router restart)","Turn 1: Provides clear, simple steps suitable for a beginner","Turn 1: Tracks problem context and offers next logical troubleshooting steps"],"misses":[],"target":"default","reasoning":"context_retention: The assistant correctly references the user's prior action (router restart), stays on-topic with troubleshooting, avoids repeating already-tried steps, and adjusts explanations for a beginner, 
retaining all relevant context. | conversation_relevancy: The response is entirely relevant, tracks the specific issue and past steps, does not repeat prior solutions, and is tailored to a beginner's needs as indicated in the context. | role_adherence: The assistant stays perfectly in character as a support agent: empathetic, clear, and appropriately simple in its explanations, without repetition or unnecessary complexity.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. 
What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Acknowledges user already restarted router ('thanks for trying that')","Turn 1: Maintains focus on WiFi connectivity issue","Turn 1: Provides beginner-friendly, stepwise troubleshooting","Turn 1: Offers escalation options without repeating prior steps"],"misses":[],"reasoning":"The assistant correctly references the user's prior action (router restart), stays on-topic with troubleshooting, avoids repeating already-tried steps, and adjusts explanations for a beginner, retaining all relevant context.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":769,"output":168},"duration_ms":2943,"started_at":"2026-03-09T08:40:34.055Z","ended_at":"2026-03-09T08:40:36.998Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: Provided step-by-step guidance relevant to ongoing WiFi connectivity issue","Turn 1: Avoided repeating already-attempted solution (router restart)","Turn 1: Adjusted explanation to a beginner level with clear instructions","Turn 1: Offered escalation options if problem persists"],"misses":[],"reasoning":"The response is entirely relevant, tracks the specific issue and past steps, does not repeat prior solutions, and is tailored to a beginner's needs as indicated in the context.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":790,"output":169},"duration_ms":2501,"started_at":"2026-03-09T08:40:36.998Z","ended_at":"2026-03-09T08:40:39.499Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Maintains a professional and supportive tone throughout","Turn 1: Avoids repeating already-tried solution (router restart)","Turn 1: Provides clear, simple steps suitable for a beginner","Turn 1: Tracks problem context and offers next logical 
troubleshooting steps"],"misses":[],"reasoning":"The assistant stays perfectly in character as a support agent: empathetic, clear, and appropriately simple in its explanations, without repetition or unnecessary complexity.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":804,"output":160},"duration_ms":1672,"started_at":"2026-03-09T08:40:39.499Z","ended_at":"2026-03-09T08:40:41.171Z"}],"execution_status":"ok"} -{"timestamp":"2026-03-09T08:40:41.217Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 1: correctly used customer name (Sarah)","Turn 1: referenced delivery deadline (arrives by Thursday, ahead of Friday deadline)","Turn 1: acknowledged reason for urgency (daughter’s birthday)","Turn 1: maintained professional support persona","Turn 1: directly upgraded shipping to meet Friday delivery deadline","Turn 1: added gift wrapping for daughter's birthday","Turn 1: maintained knowledge of customer's name (Sarah) and context","Turn 1: kept response focused and professional","Turn 1: maintained professional and empathetic support persona","Turn 1: referenced customer name (Sarah) and context (daughter’s birthday)","Turn 1: provided relevant support actions (shipping upgrade, gift wrapping)","Turn 1: offered further assistance and clear next steps"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant fully referenced all relevant context from prior turns: customer name, delivery constraints, and the occasion, while maintaining a professional support tone. | conversation_relevancy: The assistant's response is fully relevant, addresses all contextual requirements, and does not include any unnecessary or off-topic information. | role_adherence: The assistant consistently displays a professional, helpful support persona, recalls key context, and addresses the customer's needs in a single, contextually rich turn. 
| contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. 
Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: correctly used customer name (Sarah)","Turn 1: referenced delivery deadline (arrives by Thursday, ahead of Friday deadline)","Turn 1: acknowledged reason for urgency (daughter’s birthday)","Turn 1: maintained professional support persona"],"misses":[],"reasoning":"The assistant fully referenced all relevant context from prior turns: customer name, delivery constraints, and the occasion, while maintaining a professional support tone.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":614,"output":155},"duration_ms":4036,"started_at":"2026-03-09T08:40:32.351Z","ended_at":"2026-03-09T08:40:36.387Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: directly upgraded shipping to meet Friday delivery deadline","Turn 1: added gift wrapping for daughter's birthday","Turn 1: maintained knowledge of customer's name (Sarah) and context","Turn 1: kept response focused and professional"],"misses":[],"reasoning":"The assistant's response is fully relevant, addresses all contextual requirements, and does not include any unnecessary or off-topic information.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":635,"output":149},"duration_ms":2907,"started_at":"2026-03-09T08:40:36.387Z","ended_at":"2026-03-09T08:40:39.294Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: maintained professional and empathetic support persona","Turn 1: referenced customer name (Sarah) and context (daughter’s birthday)","Turn 1: provided relevant support actions (shipping upgrade, gift wrapping)","Turn 1: offered further assistance and clear next steps"],"misses":[],"reasoning":"The assistant consistently displays a professional, helpful support persona, recalls key 
context, and addresses the customer's needs in a single, contextually rich turn.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":649,"output":160},"duration_ms":1922,"started_at":"2026-03-09T08:40:39.294Z","ended_at":"2026-03-09T08:40:41.216Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":1,"started_at":"2026-03-09T08:40:41.216Z","ended_at":"2026-03-09T08:40:41.217Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:01:35.332Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Correctly acknowledges user already tried router restart","References user's report about blinking orange light","Adapts explanation and reassurance to user's beginner tech skill","Confirms previous troubleshooting step (cable reseat) and advises next simple steps","Turn 1: Clearly addressed the user's concern about what to do if the issue recurs","Turn 1: Appropriately adapted instructions for a non-technical user","Turn 1: Provided next steps and reassured further guidance","Turn 1: Did not suggest already-tried solutions","Turn 1: polite, empathetic, and supportive tone matching technical support persona","Turn 1: adjusts explanation complexity, reassures user no technical skill needed","Turn 1: tracks problem context and user's reported actions (cable reseated, prior router restarts)","Turn 1: avoids repeating previously attempted solutions"],"misses":[],"target":"default","reasoning":"context_retention: The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions. 
| conversation_relevancy: The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested. | role_adherence: The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. 
What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Correctly acknowledges user already tried router restart","References user's report about blinking orange light","Adapts explanation and reassurance to user's beginner tech skill","Confirms previous troubleshooting step (cable reseat) and advises next simple steps"],"misses":[],"reasoning":"The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":1036,"output":152},"duration_ms":1236,"started_at":"2026-03-09T09:01:27.269Z","ended_at":"2026-03-09T09:01:28.505Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: Clearly addressed the user's concern about what to do if the issue recurs","Turn 1: Appropriately adapted instructions for a non-technical user","Turn 1: Provided next steps and reassured further guidance","Turn 1: Did not suggest already-tried solutions"],"misses":[],"reasoning":"The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":1057,"output":166},"duration_ms":2876,"started_at":"2026-03-09T09:01:28.505Z","ended_at":"2026-03-09T09:01:31.381Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: polite, empathetic, and supportive tone matching technical support persona","Turn 1: adjusts explanation complexity, reassures user no technical skill needed","Turn 1: tracks problem context and user's reported actions (cable reseated, prior router restarts)","Turn 1: avoids repeating 
previously attempted solutions"],"misses":[],"reasoning":"The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":1071,"output":169},"duration_ms":3950,"started_at":"2026-03-09T09:01:31.381Z","ended_at":"2026-03-09T09:01:35.331Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:01:36.483Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 1: addressed customer as Sarah","Turn 1: referenced express shipping upgrade at no extra cost","Turn 1: acknowledged Friday deadline and confirmed Thursday delivery","Turn 1: responded to gift wrapping request with relevant options","Directly confirms upgrade to express shipping by Thursday","Addresses request to add gift wrapping","Explains process and offers alternative (gift wrap kit)","Maintains professional and empathetic tone, uses customer's name","Turn 1: addresses customer by name and maintains professional, empathetic tone","Turn 1: references order number and delivery deadline accurately","Turn 1: offers solutions and follows up on customer requests","Turn 1: avoids any break in character or inconsistent tone"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs. | conversation_relevancy: The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona. 
| role_adherence: The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. 
Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: addressed customer as Sarah","Turn 1: referenced express shipping upgrade at no extra cost","Turn 1: acknowledged Friday deadline and confirmed Thursday delivery","Turn 1: responded to gift wrapping request with relevant options"],"misses":[],"reasoning":"The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":999,"output":168},"duration_ms":3125,"started_at":"2026-03-09T09:01:27.240Z","ended_at":"2026-03-09T09:01:30.365Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Directly confirms upgrade to express shipping by Thursday","Addresses request to add gift wrapping","Explains process and offers alternative (gift wrap kit)","Maintains professional and empathetic tone, uses customer's name"],"misses":[],"reasoning":"The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":1020,"output":145},"duration_ms":2277,"started_at":"2026-03-09T09:01:30.365Z","ended_at":"2026-03-09T09:01:32.642Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: addresses customer by name and maintains professional, empathetic tone","Turn 1: references order number and delivery deadline accurately","Turn 1: offers solutions and follows up on customer requests","Turn 1: avoids any break in character or inconsistent 
tone"],"misses":[],"reasoning":"The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":1034,"output":160},"duration_ms":3840,"started_at":"2026-03-09T09:01:32.642Z","ended_at":"2026-03-09T09:01:36.482Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":1,"started_at":"2026-03-09T09:01:36.482Z","ended_at":"2026-03-09T09:01:36.483Z"}],"execution_status":"ok"} diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml new file mode 100644 index 00000000..772228f1 --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml @@ -0,0 +1,251 @@ +--- +timestamp: 2026-03-09T09:01:35.332Z +test_id: support-troubleshooting-flow +dataset: dataset.eval +score: 1 +hits: + - Correctly acknowledges user already tried router restart + - References user's report about blinking orange light + - Adapts explanation and reassurance to user's beginner tech skill + - Confirms previous troubleshooting step (cable reseat) and advises next simple steps + - "Turn 1: Clearly addressed the user's concern about what to do if the issue recurs" + - "Turn 1: Appropriately adapted instructions for a non-technical user" + - "Turn 1: Provided next steps and reassured further guidance" + - "Turn 1: Did not suggest already-tried solutions" + - "Turn 1: polite, empathetic, and supportive tone matching technical support persona" + - "Turn 1: adjusts explanation complexity, reassures user no technical skill needed" + - "Turn 1: tracks problem context and user's reported actions (cable 
reseated, prior router restarts)" + - "Turn 1: avoids repeating previously attempted solutions" +misses: [] +target: default +reasoning: "context_retention: The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions. | conversation_relevancy: The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested. | role_adherence: The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps." +input: + - role: system + content: |- + You are a technical support agent for NetConnect ISP. Adapt your + explanations to the customer's technical level. Never suggest steps + the customer has already tried. + - role: user + content: |- + My WiFi keeps dropping every 10 minutes. I already restarted my router + twice today. I'm not very tech-savvy so please keep it simple. + - role: assistant + content: |- + I'm sorry you're dealing with that — WiFi dropping out is really + frustrating! Since you've already restarted the router, let's skip that + step. Could you tell me: does the light on the front of your router stay + solid green, or does it blink or turn a different color when the WiFi drops? + - role: user + content: It blinks orange for a few seconds then goes back to green. + - role: assistant + content: |- + That orange blinking usually means the router is briefly losing its + connection to our network. Let's try something simple: can you find the + small cable plugged into the back of your router labeled "WAN" or + "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly. + Sometimes it just needs a better connection. + - role: user + content: OK I did that. What if it happens again? 
+scores: + - name: context_retention + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - Correctly acknowledges user already tried router restart + - References user's report about blinking orange light + - Adapts explanation and reassurance to user's beginner tech skill + - Confirms previous troubleshooting step (cable reseat) and advises next simple steps + misses: [] + reasoning: The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions. + details: + scores_per_turn: + - 1 + relevant_turns: 1 + total_turns: 1 + token_usage: + input: 1036 + output: 152 + duration_ms: 1236 + started_at: 2026-03-09T09:01:27.269Z + ended_at: 2026-03-09T09:01:28.505Z + - name: conversation_relevancy + type: llm-judge + score: 1 + weight: 2 + verdict: pass + hits: + - "Turn 1: Clearly addressed the user's concern about what to do if the issue recurs" + - "Turn 1: Appropriately adapted instructions for a non-technical user" + - "Turn 1: Provided next steps and reassured further guidance" + - "Turn 1: Did not suggest already-tried solutions" + misses: [] + reasoning: The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested. 
+ details: + scores_per_turn: + - 1 + on_topic_turns: 1 + total_turns: 1 + token_usage: + input: 1057 + output: 166 + duration_ms: 2876 + started_at: 2026-03-09T09:01:28.505Z + ended_at: 2026-03-09T09:01:31.381Z + - name: role_adherence + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 1: polite, empathetic, and supportive tone matching technical support persona" + - "Turn 1: adjusts explanation complexity, reassures user no technical skill needed" + - "Turn 1: tracks problem context and user's reported actions (cable reseated, prior router restarts)" + - "Turn 1: avoids repeating previously attempted solutions" + misses: [] + reasoning: The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps. + details: + scores_per_turn: + - 1 + consistent_turns: 1 + total_turns: 1 + token_usage: + input: 1071 + output: 169 + duration_ms: 3950 + started_at: 2026-03-09T09:01:31.381Z + ended_at: 2026-03-09T09:01:35.331Z +execution_status: ok + +--- +timestamp: 2026-03-09T09:01:36.483Z +test_id: support-context-retention +dataset: dataset.eval +score: 0.8 +hits: + - "Turn 1: addressed customer as Sarah" + - "Turn 1: referenced express shipping upgrade at no extra cost" + - "Turn 1: acknowledged Friday deadline and confirmed Thursday delivery" + - "Turn 1: responded to gift wrapping request with relevant options" + - Directly confirms upgrade to express shipping by Thursday + - Addresses request to add gift wrapping + - Explains process and offers alternative (gift wrap kit) + - Maintains professional and empathetic tone, uses customer's name + - "Turn 1: addresses customer by name and maintains professional, empathetic tone" + - "Turn 1: references order number and delivery deadline accurately" + - "Turn 1: offers solutions and follows up on customer requests" + - "Turn 1: avoids any break in character or inconsistent tone" +misses: + - Output does 
not contain "#98765" +target: default +reasoning: "context_retention: The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs. | conversation_relevancy: The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona. | role_adherence: The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character. | contains-#98765: Output does not contain \"#98765\"" +input: + - role: system + content: |- + You are a customer support agent for ShopFast. Be professional, empathetic, + and solution-oriented. Always address customers by name once introduced. + - role: user + content: |- + Hi, my name is Sarah. I placed an order last week and it still hasn't arrived. + My order number is #98765. + - role: assistant + content: |- + Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up + order #98765 for you right away. I can see it was shipped on March 3rd via + standard delivery. It appears there's been a delay at the regional sorting + facility. Would you like me to check on expedited options? + - role: user + content: Yes please. I really need it by Friday — it's a birthday gift for my daughter. + - role: assistant + content: |- + I completely understand the urgency, Sarah. A birthday gift definitely can't + be late! I've flagged order #98765 for priority handling. I can offer you two + options: (1) upgrade to express shipping at no extra cost, estimated delivery + Thursday, or (2) reroute to our nearest pickup location where it could be + available as early as Wednesday. 
+ - role: user + content: The express shipping sounds great. Also, can you add gift wrapping? +scores: + - name: context_retention + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 1: addressed customer as Sarah" + - "Turn 1: referenced express shipping upgrade at no extra cost" + - "Turn 1: acknowledged Friday deadline and confirmed Thursday delivery" + - "Turn 1: responded to gift wrapping request with relevant options" + misses: [] + reasoning: "The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs." + details: + scores_per_turn: + - 1 + relevant_turns: 1 + total_turns: 1 + token_usage: + input: 999 + output: 168 + duration_ms: 3125 + started_at: 2026-03-09T09:01:27.240Z + ended_at: 2026-03-09T09:01:30.365Z + - name: conversation_relevancy + type: llm-judge + score: 1 + weight: 2 + verdict: pass + hits: + - Directly confirms upgrade to express shipping by Thursday + - Addresses request to add gift wrapping + - Explains process and offers alternative (gift wrap kit) + - Maintains professional and empathetic tone, uses customer's name + misses: [] + reasoning: The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona. 
+ details: + scores_per_turn: + - 1 + on_topic_turns: 1 + total_turns: 1 + token_usage: + input: 1020 + output: 145 + duration_ms: 2277 + started_at: 2026-03-09T09:01:30.365Z + ended_at: 2026-03-09T09:01:32.642Z + - name: role_adherence + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 1: addresses customer by name and maintains professional, empathetic tone" + - "Turn 1: references order number and delivery deadline accurately" + - "Turn 1: offers solutions and follows up on customer requests" + - "Turn 1: avoids any break in character or inconsistent tone" + misses: [] + reasoning: The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character. + details: + scores_per_turn: + - 1 + consistent_turns: 1 + total_turns: 1 + token_usage: + input: 1034 + output: 160 + duration_ms: 3840 + started_at: 2026-03-09T09:01:32.642Z + ended_at: 2026-03-09T09:01:36.482Z + - name: contains-#98765 + type: contains + score: 0 + weight: 1 + verdict: fail + hits: [] + misses: + - Output does not contain "#98765" + reasoning: Output does not contain "#98765" + duration_ms: 1 + started_at: 2026-03-09T09:01:36.482Z + ended_at: 2026-03-09T09:01:36.483Z +execution_status: ok diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md index 4e2f07dd..83bdafbc 100644 --- a/examples/features/multi-turn-conversation/judges/context-retention.md +++ b/examples/features/multi-turn-conversation/judges/context-retention.md @@ -26,8 +26,11 @@ Your overall `score` should be the average of per-turn scores. 
[[ ## criteria ## ]] {{ criteria }} -[[ ## answer ## ]] +[[ ## conversation history (prior turns) ## ]] +{{ input }} + +[[ ## agent response (new output) ## ]] {{ answer }} -[[ ## conversation ## ]] +[[ ## full output messages ## ]] {{ output }} diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md index c42cdee4..be512abf 100644 --- a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md +++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md @@ -27,8 +27,11 @@ Your overall `score` should be the average of per-turn scores. [[ ## criteria ## ]] {{ criteria }} -[[ ## answer ## ]] +[[ ## conversation history (prior turns) ## ]] +{{ input }} + +[[ ## agent response (new output) ## ]] {{ answer }} -[[ ## conversation ## ]] +[[ ## full output messages ## ]] {{ output }} diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md index 0f8fa0a0..b832d72a 100644 --- a/examples/features/multi-turn-conversation/judges/role-adherence.md +++ b/examples/features/multi-turn-conversation/judges/role-adherence.md @@ -29,8 +29,11 @@ Your overall `score` should be the average of per-turn scores. 
[[ ## criteria ## ]] {{ criteria }} -[[ ## answer ## ]] +[[ ## conversation history (prior turns) ## ]] +{{ input }} + +[[ ## agent response (new output) ## ]] {{ answer }} -[[ ## conversation ## ]] +[[ ## full output messages ## ]] {{ output }} From 50db5f5016e6a17a30b6d32221622bfba9cc2c65 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 09:03:38 +0000 Subject: [PATCH 08/13] chore: remove auto-generated baseline yaml (jsonl is canonical) Co-Authored-By: Claude Opus 4.6 --- .../evals/dataset.eval.baseline.yaml | 251 ------------------ 1 file changed, 251 deletions(-) delete mode 100644 examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml deleted file mode 100644 index 772228f1..00000000 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml +++ /dev/null @@ -1,251 +0,0 @@ ---- -timestamp: 2026-03-09T09:01:35.332Z -test_id: support-troubleshooting-flow -dataset: dataset.eval -score: 1 -hits: - - Correctly acknowledges user already tried router restart - - References user's report about blinking orange light - - Adapts explanation and reassurance to user's beginner tech skill - - Confirms previous troubleshooting step (cable reseat) and advises next simple steps - - "Turn 1: Clearly addressed the user's concern about what to do if the issue recurs" - - "Turn 1: Appropriately adapted instructions for a non-technical user" - - "Turn 1: Provided next steps and reassured further guidance" - - "Turn 1: Did not suggest already-tried solutions" - - "Turn 1: polite, empathetic, and supportive tone matching technical support persona" - - "Turn 1: adjusts explanation complexity, reassures user no technical skill needed" - - "Turn 1: tracks problem context and user's reported actions (cable reseated, prior router restarts)" - - "Turn 1: avoids repeating 
previously attempted solutions" -misses: [] -target: default -reasoning: "context_retention: The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions. | conversation_relevancy: The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested. | role_adherence: The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps." -input: - - role: system - content: |- - You are a technical support agent for NetConnect ISP. Adapt your - explanations to the customer's technical level. Never suggest steps - the customer has already tried. - - role: user - content: |- - My WiFi keeps dropping every 10 minutes. I already restarted my router - twice today. I'm not very tech-savvy so please keep it simple. - - role: assistant - content: |- - I'm sorry you're dealing with that — WiFi dropping out is really - frustrating! Since you've already restarted the router, let's skip that - step. Could you tell me: does the light on the front of your router stay - solid green, or does it blink or turn a different color when the WiFi drops? - - role: user - content: It blinks orange for a few seconds then goes back to green. - - role: assistant - content: |- - That orange blinking usually means the router is briefly losing its - connection to our network. Let's try something simple: can you find the - small cable plugged into the back of your router labeled "WAN" or - "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly. - Sometimes it just needs a better connection. - - role: user - content: OK I did that. What if it happens again? 
-scores: - - name: context_retention - type: llm-judge - score: 1 - weight: 1 - verdict: pass - hits: - - Correctly acknowledges user already tried router restart - - References user's report about blinking orange light - - Adapts explanation and reassurance to user's beginner tech skill - - Confirms previous troubleshooting step (cable reseat) and advises next simple steps - misses: [] - reasoning: The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions. - details: - scores_per_turn: - - 1 - relevant_turns: 1 - total_turns: 1 - token_usage: - input: 1036 - output: 152 - duration_ms: 1236 - started_at: 2026-03-09T09:01:27.269Z - ended_at: 2026-03-09T09:01:28.505Z - - name: conversation_relevancy - type: llm-judge - score: 1 - weight: 2 - verdict: pass - hits: - - "Turn 1: Clearly addressed the user's concern about what to do if the issue recurs" - - "Turn 1: Appropriately adapted instructions for a non-technical user" - - "Turn 1: Provided next steps and reassured further guidance" - - "Turn 1: Did not suggest already-tried solutions" - misses: [] - reasoning: The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested. 
- details: - scores_per_turn: - - 1 - on_topic_turns: 1 - total_turns: 1 - token_usage: - input: 1057 - output: 166 - duration_ms: 2876 - started_at: 2026-03-09T09:01:28.505Z - ended_at: 2026-03-09T09:01:31.381Z - - name: role_adherence - type: llm-judge - score: 1 - weight: 1 - verdict: pass - hits: - - "Turn 1: polite, empathetic, and supportive tone matching technical support persona" - - "Turn 1: adjusts explanation complexity, reassures user no technical skill needed" - - "Turn 1: tracks problem context and user's reported actions (cable reseated, prior router restarts)" - - "Turn 1: avoids repeating previously attempted solutions" - misses: [] - reasoning: The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps. - details: - scores_per_turn: - - 1 - consistent_turns: 1 - total_turns: 1 - token_usage: - input: 1071 - output: 169 - duration_ms: 3950 - started_at: 2026-03-09T09:01:31.381Z - ended_at: 2026-03-09T09:01:35.331Z -execution_status: ok - ---- -timestamp: 2026-03-09T09:01:36.483Z -test_id: support-context-retention -dataset: dataset.eval -score: 0.8 -hits: - - "Turn 1: addressed customer as Sarah" - - "Turn 1: referenced express shipping upgrade at no extra cost" - - "Turn 1: acknowledged Friday deadline and confirmed Thursday delivery" - - "Turn 1: responded to gift wrapping request with relevant options" - - Directly confirms upgrade to express shipping by Thursday - - Addresses request to add gift wrapping - - Explains process and offers alternative (gift wrap kit) - - Maintains professional and empathetic tone, uses customer's name - - "Turn 1: addresses customer by name and maintains professional, empathetic tone" - - "Turn 1: references order number and delivery deadline accurately" - - "Turn 1: offers solutions and follows up on customer requests" - - "Turn 1: avoids any break in character or inconsistent tone" -misses: - - Output does 
not contain "#98765" -target: default -reasoning: "context_retention: The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs. | conversation_relevancy: The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona. | role_adherence: The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character. | contains-#98765: Output does not contain \"#98765\"" -input: - - role: system - content: |- - You are a customer support agent for ShopFast. Be professional, empathetic, - and solution-oriented. Always address customers by name once introduced. - - role: user - content: |- - Hi, my name is Sarah. I placed an order last week and it still hasn't arrived. - My order number is #98765. - - role: assistant - content: |- - Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up - order #98765 for you right away. I can see it was shipped on March 3rd via - standard delivery. It appears there's been a delay at the regional sorting - facility. Would you like me to check on expedited options? - - role: user - content: Yes please. I really need it by Friday — it's a birthday gift for my daughter. - - role: assistant - content: |- - I completely understand the urgency, Sarah. A birthday gift definitely can't - be late! I've flagged order #98765 for priority handling. I can offer you two - options: (1) upgrade to express shipping at no extra cost, estimated delivery - Thursday, or (2) reroute to our nearest pickup location where it could be - available as early as Wednesday. 
- - role: user - content: The express shipping sounds great. Also, can you add gift wrapping? -scores: - - name: context_retention - type: llm-judge - score: 1 - weight: 1 - verdict: pass - hits: - - "Turn 1: addressed customer as Sarah" - - "Turn 1: referenced express shipping upgrade at no extra cost" - - "Turn 1: acknowledged Friday deadline and confirmed Thursday delivery" - - "Turn 1: responded to gift wrapping request with relevant options" - misses: [] - reasoning: "The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs." - details: - scores_per_turn: - - 1 - relevant_turns: 1 - total_turns: 1 - token_usage: - input: 999 - output: 168 - duration_ms: 3125 - started_at: 2026-03-09T09:01:27.240Z - ended_at: 2026-03-09T09:01:30.365Z - - name: conversation_relevancy - type: llm-judge - score: 1 - weight: 2 - verdict: pass - hits: - - Directly confirms upgrade to express shipping by Thursday - - Addresses request to add gift wrapping - - Explains process and offers alternative (gift wrap kit) - - Maintains professional and empathetic tone, uses customer's name - misses: [] - reasoning: The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona. 
- details: - scores_per_turn: - - 1 - on_topic_turns: 1 - total_turns: 1 - token_usage: - input: 1020 - output: 145 - duration_ms: 2277 - started_at: 2026-03-09T09:01:30.365Z - ended_at: 2026-03-09T09:01:32.642Z - - name: role_adherence - type: llm-judge - score: 1 - weight: 1 - verdict: pass - hits: - - "Turn 1: addresses customer by name and maintains professional, empathetic tone" - - "Turn 1: references order number and delivery deadline accurately" - - "Turn 1: offers solutions and follows up on customer requests" - - "Turn 1: avoids any break in character or inconsistent tone" - misses: [] - reasoning: The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character. - details: - scores_per_turn: - - 1 - consistent_turns: 1 - total_turns: 1 - token_usage: - input: 1034 - output: 160 - duration_ms: 3840 - started_at: 2026-03-09T09:01:32.642Z - ended_at: 2026-03-09T09:01:36.482Z - - name: contains-#98765 - type: contains - score: 0 - weight: 1 - verdict: fail - hits: [] - misses: - - Output does not contain "#98765" - reasoning: Output does not contain "#98765" - duration_ms: 1 - started_at: 2026-03-09T09:01:36.482Z - ended_at: 2026-03-09T09:01:36.483Z -execution_status: ok From 69e72aabfb51c21645c21cd9a947a8d762ca0f13 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 09:22:19 +0000 Subject: [PATCH 09/13] fix: evaluate all assistant turns across full conversation Updated judge prompts to evaluate ALL assistant turns (from both conversation history and final response), not just the single output turn. Judges now produce proper per-turn breakdowns (e.g., scores_per_turn: [1, 1, 1, 1], total_turns: 4). 
Co-Authored-By: Claude Opus 4.6 --- .../evals/dataset.eval.baseline.jsonl | 4 ++-- .../judges/context-retention.md | 18 ++++++++++-------- .../judges/conversation-relevancy.md | 14 ++++++++------ .../judges/role-adherence.md | 16 +++++++++------- 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl index 922a8ecb..b7b509ed 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-03-09T09:01:35.332Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Correctly acknowledges user already tried router restart","References user's report about blinking orange light","Adapts explanation and reassurance to user's beginner tech skill","Confirms previous troubleshooting step (cable reseat) and advises next simple steps","Turn 1: Clearly addressed the user's concern about what to do if the issue recurs","Turn 1: Appropriately adapted instructions for a non-technical user","Turn 1: Provided next steps and reassured further guidance","Turn 1: Did not suggest already-tried solutions","Turn 1: polite, empathetic, and supportive tone matching technical support persona","Turn 1: adjusts explanation complexity, reassures user no technical skill needed","Turn 1: tracks problem context and user's reported actions (cable reseated, prior router restarts)","Turn 1: avoids repeating previously attempted solutions"],"misses":[],"target":"default","reasoning":"context_retention: The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions. 
| conversation_relevancy: The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested. | role_adherence: The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. 
What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Correctly acknowledges user already tried router restart","References user's report about blinking orange light","Adapts explanation and reassurance to user's beginner tech skill","Confirms previous troubleshooting step (cable reseat) and advises next simple steps"],"misses":[],"reasoning":"The assistant fully retains context about previous troubleshooting steps, the WiFi issue specifics, and the user's technical level, providing appropriate guidance without repeating prior solutions.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":1036,"output":152},"duration_ms":1236,"started_at":"2026-03-09T09:01:27.269Z","ended_at":"2026-03-09T09:01:28.505Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: Clearly addressed the user's concern about what to do if the issue recurs","Turn 1: Appropriately adapted instructions for a non-technical user","Turn 1: Provided next steps and reassured further guidance","Turn 1: Did not suggest already-tried solutions"],"misses":[],"reasoning":"The assistant's response is fully relevant, follows up on the user's question about next steps, avoids repeating previous suggestions, and keeps instructions simple as requested.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":1057,"output":166},"duration_ms":2876,"started_at":"2026-03-09T09:01:28.505Z","ended_at":"2026-03-09T09:01:31.381Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: polite, empathetic, and supportive tone matching technical support persona","Turn 1: adjusts explanation complexity, reassures user no technical skill needed","Turn 1: tracks problem context and user's reported actions (cable reseated, prior router restarts)","Turn 1: avoids repeating 
previously attempted solutions"],"misses":[],"reasoning":"The assistant remains perfectly consistent with the technical support persona, maintains an empathetic and simple tone, tracks context, and does not repeat prior steps.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":1071,"output":169},"duration_ms":3950,"started_at":"2026-03-09T09:01:31.381Z","ended_at":"2026-03-09T09:01:35.331Z"}],"execution_status":"ok"} -{"timestamp":"2026-03-09T09:01:36.483Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 1: addressed customer as Sarah","Turn 1: referenced express shipping upgrade at no extra cost","Turn 1: acknowledged Friday deadline and confirmed Thursday delivery","Turn 1: responded to gift wrapping request with relevant options","Directly confirms upgrade to express shipping by Thursday","Addresses request to add gift wrapping","Explains process and offers alternative (gift wrap kit)","Maintains professional and empathetic tone, uses customer's name","Turn 1: addresses customer by name and maintains professional, empathetic tone","Turn 1: references order number and delivery deadline accurately","Turn 1: offers solutions and follows up on customer requests","Turn 1: avoids any break in character or inconsistent tone"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs. | conversation_relevancy: The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona. 
| role_adherence: The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. 
Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: addressed customer as Sarah","Turn 1: referenced express shipping upgrade at no extra cost","Turn 1: acknowledged Friday deadline and confirmed Thursday delivery","Turn 1: responded to gift wrapping request with relevant options"],"misses":[],"reasoning":"The assistant fully retained all relevant context: customer name (Sarah), order upgrade details, delivery deadline, and the new request for gift wrapping. It maintained a professional, empathetic persona and provided solutions aligned with the customer's needs.","details":{"scores_per_turn":[1],"relevant_turns":1,"total_turns":1},"token_usage":{"input":999,"output":168},"duration_ms":3125,"started_at":"2026-03-09T09:01:27.240Z","ended_at":"2026-03-09T09:01:30.365Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Directly confirms upgrade to express shipping by Thursday","Addresses request to add gift wrapping","Explains process and offers alternative (gift wrap kit)","Maintains professional and empathetic tone, uses customer's name"],"misses":[],"reasoning":"The response fully addresses both the express shipping upgrade and gift wrap request with clear, relevant information, maintains context, and keeps a professional support persona.","details":{"scores_per_turn":[1],"on_topic_turns":1,"total_turns":1},"token_usage":{"input":1020,"output":145},"duration_ms":2277,"started_at":"2026-03-09T09:01:30.365Z","ended_at":"2026-03-09T09:01:32.642Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: addresses customer by name and maintains professional, empathetic tone","Turn 1: references order number and delivery deadline accurately","Turn 1: offers solutions and follows up on customer requests","Turn 1: avoids any break in character or inconsistent 
tone"],"misses":[],"reasoning":"The assistant fully adheres to the professional support role, consistently addresses the customer by name, maintains empathy, preserves context, and provides solutions without breaking character.","details":{"scores_per_turn":[1],"consistent_turns":1,"total_turns":1},"token_usage":{"input":1034,"output":160},"duration_ms":3840,"started_at":"2026-03-09T09:01:32.642Z","ended_at":"2026-03-09T09:01:36.482Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":1,"started_at":"2026-03-09T09:01:36.482Z","ended_at":"2026-03-09T09:01:36.483Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:22:00.279Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 3: skipped router restart, referencing user's prior attempt","Turn 5: provided instructions suited to technical beginner, no repeat steps","Turn 6: referenced prior cable fix and 'simple' instructions","Turn 6: continued to avoid repeated router restart, encouraged easy actions","Turn 3: Directly asked about router light status to diagnose further","Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step","Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner","Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple","Turn 1: empathetic, clear, and skips router restart as instructed","Turn 2: explains orange light in simple terms and gives an easy step","Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions"],"misses":[],"target":"default","reasoning":"context_retention: All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps. 
| conversation_relevancy: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced. | role_adherence: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. 
What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: skipped router restart, referencing user's prior attempt","Turn 5: provided instructions suited to technical beginner, no repeat steps","Turn 6: referenced prior cable fix and 'simple' instructions","Turn 6: continued to avoid repeated router restart, encouraged easy actions"],"misses":[],"reasoning":"All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps.","details":{"scores_per_turn":[1,1,1,1],"relevant_turns":4,"total_turns":4},"token_usage":{"input":964,"output":174},"duration_ms":1563,"started_at":"2026-03-09T09:21:52.423Z","ended_at":"2026-03-09T09:21:53.986Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 3: Directly asked about router light status to diagnose further","Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step","Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner","Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple"],"misses":[],"reasoning":"All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. 
No unnecessary repetition or off-topic details were introduced.","details":{"scores_per_turn":[1,1,1,1],"on_topic_turns":4,"total_turns":4},"token_usage":{"input":985,"output":201},"duration_ms":2679,"started_at":"2026-03-09T09:21:53.986Z","ended_at":"2026-03-09T09:21:56.665Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic, clear, and skips router restart as instructed","Turn 2: explains orange light in simple terms and gives an easy step","Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions"],"misses":[],"reasoning":"All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":999,"output":164},"duration_ms":3613,"started_at":"2026-03-09T09:21:56.665Z","ended_at":"2026-03-09T09:22:00.278Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:22:00.670Z","test_id":"support-context-retention","dataset":"dataset.eval","score":1,"hits":["Turn 3: addressed Sarah by name, referenced order #98765","Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765","Turn 5: proposed solutions relevant to urgency and birthday gift context","Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message","Turn 3: directly addressed delivery delay and offered to check expedited options","Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name","Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message","Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number","Turn 2: maintains tone, acknowledges urgency, uses 
customer's name, offers clear solutions","Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message","Output contains \"#98765\""],"misses":[],"target":"default","reasoning":"context_retention: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout. | conversation_relevancy: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information. | role_adherence: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role. | contains-#98765: Output contains \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. 
I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: addressed Sarah by name, referenced order #98765","Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765","Turn 5: proposed solutions relevant to urgency and birthday gift context","Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message"],"misses":[],"reasoning":"All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout.","details":{"scores_per_turn":[1,1,1,1],"relevant_turns":4,"total_turns":4},"token_usage":{"input":873,"output":193},"duration_ms":1625,"started_at":"2026-03-09T09:21:51.635Z","ended_at":"2026-03-09T09:21:53.260Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 3: directly addressed delivery delay and offered to check expedited options","Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name","Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message"],"misses":[],"reasoning":"All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic 
information.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":894,"output":170},"duration_ms":3433,"started_at":"2026-03-09T09:21:53.260Z","ended_at":"2026-03-09T09:21:56.693Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number","Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions","Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message"],"misses":[],"reasoning":"The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":908,"output":179},"duration_ms":3976,"started_at":"2026-03-09T09:21:56.693Z","ended_at":"2026-03-09T09:22:00.669Z"},{"name":"contains-#98765","type":"contains","score":1,"weight":1,"verdict":"pass","hits":["Output contains \"#98765\""],"misses":[],"reasoning":"Output contains \"#98765\"","duration_ms":1,"started_at":"2026-03-09T09:22:00.669Z","ended_at":"2026-03-09T09:22:00.670Z"}],"execution_status":"ok"} diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md index 83bdafbc..7a0d2dd6 100644 --- a/examples/features/multi-turn-conversation/judges/context-retention.md +++ b/examples/features/multi-turn-conversation/judges/context-retention.md @@ -1,9 +1,14 @@ You are evaluating whether an AI assistant retains context from earlier turns in a multi-turn conversation. -Analyze the full conversation output below. 
For each assistant turn, check -whether the assistant correctly references or builds on information introduced -in previous turns (e.g., names, numbers, constraints, preferences). +Below you will see the full conversation: the conversation history contains +prior user and assistant turns, and the agent response is the final assistant +turn. Evaluate ALL assistant turns across the entire conversation (both +history and final response). Number turns sequentially starting from Turn 1. + +For each assistant turn, check whether the assistant correctly references or +builds on information introduced in previous turns (e.g., names, numbers, +constraints, preferences). Score each assistant turn: - 1.0 if the turn demonstrates awareness of relevant earlier context @@ -17,7 +22,7 @@ In your `misses`, reference specific turns where context was lost (e.g., "Turn 4: forgot delivery deadline from turn 1"). In your `details`, return: -- `scores_per_turn`: array of scores (0.0–1.0) for each assistant turn +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn - `relevant_turns`: count of turns that demonstrated context retention - `total_turns`: total number of assistant turns evaluated @@ -29,8 +34,5 @@ Your overall `score` should be the average of per-turn scores. 
[[ ## conversation history (prior turns) ## ]] {{ input }} -[[ ## agent response (new output) ## ]] +[[ ## agent response (final turn) ## ]] {{ answer }} - -[[ ## full output messages ## ]] -{{ output }} diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md index be512abf..0cd75b2f 100644 --- a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md +++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md @@ -1,7 +1,12 @@ You are evaluating whether each assistant response in a multi-turn conversation is relevant to the user's current request AND the broader conversation context. -Analyze the full conversation output below. For each assistant turn, assess: +Below you will see the full conversation: the conversation history contains +prior user and assistant turns, and the agent response is the final assistant +turn. Evaluate ALL assistant turns across the entire conversation (both +history and final response). Number turns sequentially starting from Turn 1. + +For each assistant turn, assess: - Does the response directly address what the user asked? - Is the response appropriate given the full conversation history? - Does the assistant avoid tangential or off-topic information? @@ -18,7 +23,7 @@ In your `misses`, note turns where relevance was lacking (e.g., "Turn 2: provided unnecessary technical details"). In your `details`, return: -- `scores_per_turn`: array of scores (0.0–1.0) for each assistant turn +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn - `on_topic_turns`: count of turns scored 1.0 - `total_turns`: total number of assistant turns evaluated @@ -30,8 +35,5 @@ Your overall `score` should be the average of per-turn scores. 
[[ ## conversation history (prior turns) ## ]] {{ input }} -[[ ## agent response (new output) ## ]] +[[ ## agent response (final turn) ## ]] {{ answer }} - -[[ ## full output messages ## ]] -{{ output }} diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md index b832d72a..ffab1b84 100644 --- a/examples/features/multi-turn-conversation/judges/role-adherence.md +++ b/examples/features/multi-turn-conversation/judges/role-adherence.md @@ -1,8 +1,13 @@ You are evaluating whether an AI assistant maintains its assigned persona and role consistently across a multi-turn conversation. -Analyze the full conversation output below. Consider the system prompt (if -present) as the role definition. For each assistant turn, assess: +Below you will see the full conversation: the conversation history contains +prior user and assistant turns, and the agent response is the final assistant +turn. Evaluate ALL assistant turns across the entire conversation (both +history and final response). Number turns sequentially starting from Turn 1. + +Consider the system prompt (if present) as the role definition. For each +assistant turn, assess: - Does the assistant stay in character? - Is the tone consistent with the assigned role? - Does the assistant avoid breaking character or introducing behavior @@ -20,7 +25,7 @@ In your `misses`, note turns where role slipped (e.g., "Turn 3: used overly casual language inconsistent with role"). In your `details`, return: -- `scores_per_turn`: array of scores (0.0–1.0) for each assistant turn +- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn - `consistent_turns`: count of turns scored 1.0 - `total_turns`: total number of assistant turns evaluated @@ -32,8 +37,5 @@ Your overall `score` should be the average of per-turn scores. 
[[ ## conversation history (prior turns) ## ]] {{ input }} -[[ ## agent response (new output) ## ]] +[[ ## agent response (final turn) ## ]] {{ answer }} - -[[ ## full output messages ## ]] -{{ output }} From dee4f70130398ab169b0036a2173de43da050ba2 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 09:33:34 +0000 Subject: [PATCH 10/13] feat: add {{ conversation }} template variable for role-annotated input The existing {{ input }} variable maps to input_segments (flattened text without role annotations). For multi-turn evaluation, judges need to distinguish user turns from assistant turns. Added {{ conversation }} which serializes evalCase.input (TestMessage[] with role fields). Updated judge templates to use {{ conversation }} so judges see the full conversation with system/user/assistant role annotations. Judges now correctly identify and score individual assistant turns (Turn 1, Turn 2, Turn 3) across the conversation history. Co-Authored-By: Claude Opus 4.6 --- .../evals/dataset.eval.baseline.jsonl | 4 +- .../evals/dataset.eval.baseline.yaml | 260 ++++++++++++++++++ .../judges/context-retention.md | 12 +- .../judges/conversation-relevancy.md | 12 +- .../judges/role-adherence.md | 12 +- .../evaluation/evaluators/llm-judge-prompt.ts | 1 + .../src/evaluation/evaluators/llm-judge.ts | 1 + .../core/src/evaluation/template-variables.ts | 1 + 8 files changed, 283 insertions(+), 20 deletions(-) create mode 100644 examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl index b7b509ed..a1ef1340 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ 
-{"timestamp":"2026-03-09T09:22:00.279Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 3: skipped router restart, referencing user's prior attempt","Turn 5: provided instructions suited to technical beginner, no repeat steps","Turn 6: referenced prior cable fix and 'simple' instructions","Turn 6: continued to avoid repeated router restart, encouraged easy actions","Turn 3: Directly asked about router light status to diagnose further","Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step","Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner","Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple","Turn 1: empathetic, clear, and skips router restart as instructed","Turn 2: explains orange light in simple terms and gives an easy step","Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions"],"misses":[],"target":"default","reasoning":"context_retention: All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps. | conversation_relevancy: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced. | role_adherence: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. 
I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: skipped router restart, referencing user's prior attempt","Turn 5: provided instructions suited to technical beginner, no repeat steps","Turn 6: referenced prior cable fix and 'simple' instructions","Turn 6: continued to avoid repeated router restart, encouraged easy actions"],"misses":[],"reasoning":"All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps.","details":{"scores_per_turn":[1,1,1,1],"relevant_turns":4,"total_turns":4},"token_usage":{"input":964,"output":174},"duration_ms":1563,"started_at":"2026-03-09T09:21:52.423Z","ended_at":"2026-03-09T09:21:53.986Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 3: Directly asked about router light status to diagnose further","Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of 
restart step","Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner","Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple"],"misses":[],"reasoning":"All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced.","details":{"scores_per_turn":[1,1,1,1],"on_topic_turns":4,"total_turns":4},"token_usage":{"input":985,"output":201},"duration_ms":2679,"started_at":"2026-03-09T09:21:53.986Z","ended_at":"2026-03-09T09:21:56.665Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic, clear, and skips router restart as instructed","Turn 2: explains orange light in simple terms and gives an easy step","Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions"],"misses":[],"reasoning":"All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":999,"output":164},"duration_ms":3613,"started_at":"2026-03-09T09:21:56.665Z","ended_at":"2026-03-09T09:22:00.278Z"}],"execution_status":"ok"} -{"timestamp":"2026-03-09T09:22:00.670Z","test_id":"support-context-retention","dataset":"dataset.eval","score":1,"hits":["Turn 3: addressed Sarah by name, referenced order #98765","Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765","Turn 5: proposed solutions relevant to urgency and birthday gift context","Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message","Turn 3: directly addressed delivery delay and offered to 
check expedited options","Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name","Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message","Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number","Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions","Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message","Output contains \"#98765\""],"misses":[],"target":"default","reasoning":"context_retention: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout. | conversation_relevancy: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information. | role_adherence: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role. | contains-#98765: Output contains \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. 
I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: addressed Sarah by name, referenced order #98765","Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765","Turn 5: proposed solutions relevant to urgency and birthday gift context","Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message"],"misses":[],"reasoning":"All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout.","details":{"scores_per_turn":[1,1,1,1],"relevant_turns":4,"total_turns":4},"token_usage":{"input":873,"output":193},"duration_ms":1625,"started_at":"2026-03-09T09:21:51.635Z","ended_at":"2026-03-09T09:21:53.260Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 3: directly addressed delivery delay and offered to check expedited options","Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name","Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message"],"misses":[],"reasoning":"All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user 
requests directly without off-topic information.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":894,"output":170},"duration_ms":3433,"started_at":"2026-03-09T09:21:53.260Z","ended_at":"2026-03-09T09:21:56.693Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number","Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions","Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message"],"misses":[],"reasoning":"The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":908,"output":179},"duration_ms":3976,"started_at":"2026-03-09T09:21:56.693Z","ended_at":"2026-03-09T09:22:00.669Z"},{"name":"contains-#98765","type":"contains","score":1,"weight":1,"verdict":"pass","hits":["Output contains \"#98765\""],"misses":[],"reasoning":"Output contains \"#98765\"","duration_ms":1,"started_at":"2026-03-09T09:22:00.669Z","ended_at":"2026-03-09T09:22:00.670Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:32:59.933Z","test_id":"support-context-retention","dataset":"dataset.eval","score":1,"hits":["Turn 1: greeted and addressed the customer as Sarah","Turn 1: correctly referenced order #98765","Turn 2: acknowledged Friday deadline urgency and birthday context","Turn 3: confirmed express shipping upgrade and granted gift wrapping request","Turn 1: directly addressed order status and offered relevant solutions","Turn 2: responded to urgency, provided options tailored to delivery deadline","Turn 3: confirmed express shipping upgrade and gift wrapping addition, 
maintained context with customer name and order number","Turn 1: professional, empathetic greeting and prompt action referencing Sarah and order #98765","Turn 2: maintains professional tone, acknowledges urgency, offers tailored solutions and addresses Sarah by name","Turn 3: continues professional, solution-oriented persona, references Sarah and order details, provides clear next steps","Output contains \"#98765\""],"misses":[],"target":"default","reasoning":"context_retention: The assistant demonstrated consistent retention of key context—name, order number, deadline urgency, and new gift wrapping request—across all turns. All details were handled professionally and supportively. | conversation_relevancy: All assistant turns directly addressed the user's requests, maintained context (Sarah, #98765, delivery by Friday), and remained on-topic and helpful throughout. | role_adherence: The assistant consistently maintains a professional, empathetic, and solution-oriented support persona, accurately addressing Sarah by name, referencing her order number, and staying on topic in every turn. | contains-#98765: Output contains \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. 
A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: greeted and addressed the customer as Sarah","Turn 1: correctly referenced order #98765","Turn 2: acknowledged Friday deadline urgency and birthday context","Turn 3: confirmed express shipping upgrade and granted gift wrapping request"],"misses":[],"reasoning":"The assistant demonstrated consistent retention of key context—name, order number, deadline urgency, and new gift wrapping request—across all turns. All details were handled professionally and supportively.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":896,"output":172},"duration_ms":7625,"started_at":"2026-03-09T09:32:48.409Z","ended_at":"2026-03-09T09:32:56.034Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: directly addressed order status and offered relevant solutions","Turn 2: responded to urgency, provided options tailored to delivery deadline","Turn 3: confirmed express shipping upgrade and gift wrapping addition, maintained context with customer name and order number"],"misses":[],"reasoning":"All assistant turns directly addressed the user's requests, maintained context (Sarah, #98765, delivery by Friday), and remained on-topic and helpful 
throughout.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":917,"output":164},"duration_ms":2039,"started_at":"2026-03-09T09:32:56.034Z","ended_at":"2026-03-09T09:32:58.073Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional, empathetic greeting and prompt action referencing Sarah and order #98765","Turn 2: maintains professional tone, acknowledges urgency, offers tailored solutions and addresses Sarah by name","Turn 3: continues professional, solution-oriented persona, references Sarah and order details, provides clear next steps"],"misses":[],"reasoning":"The assistant consistently maintains a professional, empathetic, and solution-oriented support persona, accurately addressing Sarah by name, referencing her order number, and staying on topic in every turn.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":931,"output":180},"duration_ms":1859,"started_at":"2026-03-09T09:32:58.073Z","ended_at":"2026-03-09T09:32:59.932Z"},{"name":"contains-#98765","type":"contains","score":1,"weight":1,"verdict":"pass","hits":["Output contains \"#98765\""],"misses":[],"reasoning":"Output contains \"#98765\"","duration_ms":0,"started_at":"2026-03-09T09:32:59.932Z","ended_at":"2026-03-09T09:32:59.932Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:33:04.290Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 3: did not repeat router restart, acknowledging it was already tried","Turn 3: asked about router light color per troubleshooting escalation","Turn 5: gave a simple cable check, aligned with user's stated comfort level","Turn 6: referenced earlier troubleshooting steps and kept instructions clear","Turn 1: responded empathetically, acknowledged previous router restart and user’s technical level","Turn 2: matched troubleshooting to user's description, avoided repeating 
router restart","Turn 3: gave clear next steps, offered line tests or technician, stayed simple","Turn 1: empathetic and clear tone, acknowledges user's prior actions and beginner level","Turn 2: explanation remains simple and avoids repeating router restart, maintains technical support persona","Turn 3: continues with easy-to-follow instructions, offers next steps in plain language, stays within support role"],"misses":[],"target":"default","reasoning":"context_retention: The assistant consistently tracks the user's prior actions (router restart, cable check), avoids repeat suggestions, and adapts explanations for a beginner. All assistant turns show strong context retention. | conversation_relevancy: Every assistant turn directly addressed the user's current questions, tracked previous actions, and kept explanations simple as requested, maintaining full relevance to context and user needs. | role_adherence: The assistant consistently adapts explanations to the user's beginner level, avoids redundant suggestions, tracks context, and maintains a supportive technical support persona throughout all turns.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. 
Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: did not repeat router restart, acknowledging it was already tried","Turn 3: asked about router light color per troubleshooting escalation","Turn 5: gave a simple cable check, aligned with user's stated comfort level","Turn 6: referenced earlier troubleshooting steps and kept instructions clear"],"misses":[],"reasoning":"The assistant consistently tracks the user's prior actions (router restart, cable check), avoids repeat suggestions, and adapts explanations for a beginner. 
All assistant turns show strong context retention.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":931,"output":180},"duration_ms":3824,"started_at":"2026-03-09T09:32:57.750Z","ended_at":"2026-03-09T09:33:01.574Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: responded empathetically, acknowledged previous router restart and user’s technical level","Turn 2: matched troubleshooting to user's description, avoided repeating router restart","Turn 3: gave clear next steps, offered line tests or technician, stayed simple"],"misses":[],"reasoning":"Every assistant turn directly addressed the user's current questions, tracked previous actions, and kept explanations simple as requested, maintaining full relevance to context and user needs.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":952,"output":165},"duration_ms":1371,"started_at":"2026-03-09T09:33:01.574Z","ended_at":"2026-03-09T09:33:02.945Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic and clear tone, acknowledges user's prior actions and beginner level","Turn 2: explanation remains simple and avoids repeating router restart, maintains technical support persona","Turn 3: continues with easy-to-follow instructions, offers next steps in plain language, stays within support role"],"misses":[],"reasoning":"The assistant consistently adapts explanations to the user's beginner level, avoids redundant suggestions, tracks context, and maintains a supportive technical support persona throughout all turns.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":966,"output":172},"duration_ms":1345,"started_at":"2026-03-09T09:33:02.945Z","ended_at":"2026-03-09T09:33:04.290Z"}],"execution_status":"ok"} diff --git 
a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml new file mode 100644 index 00000000..514df1be --- /dev/null +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml @@ -0,0 +1,260 @@ +--- +timestamp: 2026-03-09T09:22:00.279Z +test_id: support-troubleshooting-flow +dataset: dataset.eval +score: 1 +hits: + - "Turn 3: skipped router restart, referencing user's prior attempt" + - "Turn 5: provided instructions suited to technical beginner, no repeat steps" + - "Turn 6: referenced prior cable fix and 'simple' instructions" + - "Turn 6: continued to avoid repeated router restart, encouraged easy actions" + - "Turn 3: Directly asked about router light status to diagnose further" + - "Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step" + - "Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner" + - "Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple" + - "Turn 1: empathetic, clear, and skips router restart as instructed" + - "Turn 2: explains orange light in simple terms and gives an easy step" + - "Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions" +misses: [] +target: default +reasoning: "context_retention: All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps. | conversation_relevancy: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced. 
| role_adherence: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps." +input: + - role: system + content: |- + You are a technical support agent for NetConnect ISP. Adapt your + explanations to the customer's technical level. Never suggest steps + the customer has already tried. + - role: user + content: |- + My WiFi keeps dropping every 10 minutes. I already restarted my router + twice today. I'm not very tech-savvy so please keep it simple. + - role: assistant + content: |- + I'm sorry you're dealing with that — WiFi dropping out is really + frustrating! Since you've already restarted the router, let's skip that + step. Could you tell me: does the light on the front of your router stay + solid green, or does it blink or turn a different color when the WiFi drops? + - role: user + content: It blinks orange for a few seconds then goes back to green. + - role: assistant + content: |- + That orange blinking usually means the router is briefly losing its + connection to our network. Let's try something simple: can you find the + small cable plugged into the back of your router labeled "WAN" or + "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly. + Sometimes it just needs a better connection. + - role: user + content: OK I did that. What if it happens again? 
+scores: + - name: context_retention + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 3: skipped router restart, referencing user's prior attempt" + - "Turn 5: provided instructions suited to technical beginner, no repeat steps" + - "Turn 6: referenced prior cable fix and 'simple' instructions" + - "Turn 6: continued to avoid repeated router restart, encouraged easy actions" + misses: [] + reasoning: "All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps." + details: + scores_per_turn: + - 1 + - 1 + - 1 + - 1 + relevant_turns: 4 + total_turns: 4 + token_usage: + input: 964 + output: 174 + duration_ms: 1563 + started_at: 2026-03-09T09:21:52.423Z + ended_at: 2026-03-09T09:21:53.986Z + - name: conversation_relevancy + type: llm-judge + score: 1 + weight: 2 + verdict: pass + hits: + - "Turn 3: Directly asked about router light status to diagnose further" + - "Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step" + - "Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner" + - "Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple" + misses: [] + reasoning: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced. 
+ details: + scores_per_turn: + - 1 + - 1 + - 1 + - 1 + on_topic_turns: 4 + total_turns: 4 + token_usage: + input: 985 + output: 201 + duration_ms: 2679 + started_at: 2026-03-09T09:21:53.986Z + ended_at: 2026-03-09T09:21:56.665Z + - name: role_adherence + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 1: empathetic, clear, and skips router restart as instructed" + - "Turn 2: explains orange light in simple terms and gives an easy step" + - "Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions" + misses: [] + reasoning: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps. + details: + scores_per_turn: + - 1 + - 1 + - 1 + consistent_turns: 3 + total_turns: 3 + token_usage: + input: 999 + output: 164 + duration_ms: 3613 + started_at: 2026-03-09T09:21:56.665Z + ended_at: 2026-03-09T09:22:00.278Z +execution_status: ok + +--- +timestamp: 2026-03-09T09:22:00.670Z +test_id: support-context-retention +dataset: dataset.eval +score: 1 +hits: + - "Turn 3: addressed Sarah by name, referenced order #98765" + - "Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765" + - "Turn 5: proposed solutions relevant to urgency and birthday gift context" + - "Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message" + - "Turn 3: directly addressed delivery delay and offered to check expedited options" + - "Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name" + - "Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message" + - "Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number" + - "Turn 2: maintains tone, acknowledges urgency, uses 
customer's name, offers clear solutions" + - "Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message" + - Output contains "#98765" +misses: [] +target: default +reasoning: "context_retention: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout. | conversation_relevancy: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information. | role_adherence: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role. | contains-#98765: Output contains \"#98765\"" +input: + - role: system + content: |- + You are a customer support agent for ShopFast. Be professional, empathetic, + and solution-oriented. Always address customers by name once introduced. + - role: user + content: |- + Hi, my name is Sarah. I placed an order last week and it still hasn't arrived. + My order number is #98765. + - role: assistant + content: |- + Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up + order #98765 for you right away. I can see it was shipped on March 3rd via + standard delivery. It appears there's been a delay at the regional sorting + facility. Would you like me to check on expedited options? + - role: user + content: Yes please. I really need it by Friday — it's a birthday gift for my daughter. + - role: assistant + content: |- + I completely understand the urgency, Sarah. A birthday gift definitely can't + be late! I've flagged order #98765 for priority handling. 
I can offer you two + options: (1) upgrade to express shipping at no extra cost, estimated delivery + Thursday, or (2) reroute to our nearest pickup location where it could be + available as early as Wednesday. + - role: user + content: The express shipping sounds great. Also, can you add gift wrapping? +scores: + - name: context_retention + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 3: addressed Sarah by name, referenced order #98765" + - "Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765" + - "Turn 5: proposed solutions relevant to urgency and birthday gift context" + - "Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message" + misses: [] + reasoning: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout. + details: + scores_per_turn: + - 1 + - 1 + - 1 + - 1 + relevant_turns: 4 + total_turns: 4 + token_usage: + input: 873 + output: 193 + duration_ms: 1625 + started_at: 2026-03-09T09:21:51.635Z + ended_at: 2026-03-09T09:21:53.260Z + - name: conversation_relevancy + type: llm-judge + score: 1 + weight: 2 + verdict: pass + hits: + - "Turn 3: directly addressed delivery delay and offered to check expedited options" + - "Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name" + - "Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message" + misses: [] + reasoning: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information. 
+ details: + scores_per_turn: + - 1 + - 1 + - 1 + on_topic_turns: 3 + total_turns: 3 + token_usage: + input: 894 + output: 170 + duration_ms: 3433 + started_at: 2026-03-09T09:21:53.260Z + ended_at: 2026-03-09T09:21:56.693Z + - name: role_adherence + type: llm-judge + score: 1 + weight: 1 + verdict: pass + hits: + - "Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number" + - "Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions" + - "Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message" + misses: [] + reasoning: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role. + details: + scores_per_turn: + - 1 + - 1 + - 1 + consistent_turns: 3 + total_turns: 3 + token_usage: + input: 908 + output: 179 + duration_ms: 3976 + started_at: 2026-03-09T09:21:56.693Z + ended_at: 2026-03-09T09:22:00.669Z + - name: contains-#98765 + type: contains + score: 1 + weight: 1 + verdict: pass + hits: + - Output contains "#98765" + misses: [] + reasoning: Output contains "#98765" + duration_ms: 1 + started_at: 2026-03-09T09:22:00.669Z + ended_at: 2026-03-09T09:22:00.670Z +execution_status: ok diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md index 7a0d2dd6..3191d245 100644 --- a/examples/features/multi-turn-conversation/judges/context-retention.md +++ b/examples/features/multi-turn-conversation/judges/context-retention.md @@ -1,10 +1,10 @@ You are evaluating whether an AI assistant retains context from earlier turns in a multi-turn conversation. 
-Below you will see the full conversation: the conversation history contains -prior user and assistant turns, and the agent response is the final assistant -turn. Evaluate ALL assistant turns across the entire conversation (both -history and final response). Number turns sequentially starting from Turn 1. +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate ALL assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. For each assistant turn, check whether the assistant correctly references or builds on information introduced in previous turns (e.g., names, numbers, @@ -31,8 +31,8 @@ Your overall `score` should be the average of per-turn scores. [[ ## criteria ## ]] {{ criteria }} -[[ ## conversation history (prior turns) ## ]] -{{ input }} +[[ ## conversation (all turns with roles) ## ]] +{{ conversation }} [[ ## agent response (final turn) ## ]] {{ answer }} diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md index 0cd75b2f..6c86f2a2 100644 --- a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md +++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md @@ -1,10 +1,10 @@ You are evaluating whether each assistant response in a multi-turn conversation is relevant to the user's current request AND the broader conversation context. -Below you will see the full conversation: the conversation history contains -prior user and assistant turns, and the agent response is the final assistant -turn. Evaluate ALL assistant turns across the entire conversation (both -history and final response). Number turns sequentially starting from Turn 1. 
+Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate ALL assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. For each assistant turn, assess: - Does the response directly address what the user asked? @@ -32,8 +32,8 @@ Your overall `score` should be the average of per-turn scores. [[ ## criteria ## ]] {{ criteria }} -[[ ## conversation history (prior turns) ## ]] -{{ input }} +[[ ## conversation (all turns with roles) ## ]] +{{ conversation }} [[ ## agent response (final turn) ## ]] {{ answer }} diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md index ffab1b84..74203704 100644 --- a/examples/features/multi-turn-conversation/judges/role-adherence.md +++ b/examples/features/multi-turn-conversation/judges/role-adherence.md @@ -1,10 +1,10 @@ You are evaluating whether an AI assistant maintains its assigned persona and role consistently across a multi-turn conversation. -Below you will see the full conversation: the conversation history contains -prior user and assistant turns, and the agent response is the final assistant -turn. Evaluate ALL assistant turns across the entire conversation (both -history and final response). Number turns sequentially starting from Turn 1. +Below you will see the full conversation with role annotations (system, user, +assistant). The conversation history contains prior turns, and the agent +response is the final assistant turn. Evaluate ALL assistant turns across the +entire conversation. Number assistant turns sequentially starting from Turn 1. Consider the system prompt (if present) as the role definition. For each assistant turn, assess: @@ -34,8 +34,8 @@ Your overall `score` should be the average of per-turn scores. 
[[ ## criteria ## ]] {{ criteria }} -[[ ## conversation history (prior turns) ## ]] -{{ input }} +[[ ## conversation (all turns with roles) ## ]] +{{ conversation }} [[ ## agent response (final turn) ## ]] {{ answer }} diff --git a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts index 984fb8cf..1fc61a44 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts @@ -76,6 +76,7 @@ function assembleFreeform( [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '', + [TEMPLATE_VARIABLES.CONVERSATION]: JSON.stringify(evalCase.input, null, 2), }; const systemPrompt = buildOutputSchema(); diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 1fb81e71..9817b6b3 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -131,6 +131,7 @@ export class LlmJudgeEvaluator implements Evaluator { [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? 
'', + [TEMPLATE_VARIABLES.CONVERSATION]: JSON.stringify(context.evalCase.input, null, 2), }; // Build system prompt (only the mandatory output schema) diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index fba54c85..9084b47e 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -11,6 +11,7 @@ export const TEMPLATE_VARIABLES = { INPUT: 'input', OUTPUT: 'output', FILE_CHANGES: 'file_changes', + CONVERSATION: 'conversation', } as const; /** From eb05ed504cd05fc501008ee09d675fafe9be5937 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 09:36:47 +0000 Subject: [PATCH 11/13] feat: include role annotations in {{ input }} template variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changed {{ input }} to serialize evalCase.input (TestMessage[] with role fields) instead of input_segments (flattened text without roles). This lets llm-judge templates distinguish user/assistant/system turns, which is essential for multi-turn conversation evaluation. Removed the {{ conversation }} variable added in the previous commit — reusing {{ input }} is simpler and more consistent. 
Co-Authored-By: Claude Opus 4.6 --- .../evals/dataset.eval.baseline.jsonl | 4 ++-- .../multi-turn-conversation/judges/context-retention.md | 2 +- .../judges/conversation-relevancy.md | 2 +- .../multi-turn-conversation/judges/role-adherence.md | 2 +- .../core/src/evaluation/evaluators/llm-judge-prompt.ts | 3 +-- packages/core/src/evaluation/evaluators/llm-judge.ts | 3 +-- packages/core/src/evaluation/template-variables.ts | 1 - .../core/test/evaluation/evaluators_variables.test.ts | 9 +++++---- 8 files changed, 12 insertions(+), 14 deletions(-) diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl index a1ef1340..e2e12b5e 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-03-09T09:32:59.933Z","test_id":"support-context-retention","dataset":"dataset.eval","score":1,"hits":["Turn 1: greeted and addressed the customer as Sarah","Turn 1: correctly referenced order #98765","Turn 2: acknowledged Friday deadline urgency and birthday context","Turn 3: confirmed express shipping upgrade and granted gift wrapping request","Turn 1: directly addressed order status and offered relevant solutions","Turn 2: responded to urgency, provided options tailored to delivery deadline","Turn 3: confirmed express shipping upgrade and gift wrapping addition, maintained context with customer name and order number","Turn 1: professional, empathetic greeting and prompt action referencing Sarah and order #98765","Turn 2: maintains professional tone, acknowledges urgency, offers tailored solutions and addresses Sarah by name","Turn 3: continues professional, solution-oriented persona, references Sarah and order details, provides clear next steps","Output contains 
\"#98765\""],"misses":[],"target":"default","reasoning":"context_retention: The assistant demonstrated consistent retention of key context—name, order number, deadline urgency, and new gift wrapping request—across all turns. All details were handled professionally and supportively. | conversation_relevancy: All assistant turns directly addressed the user's requests, maintained context (Sarah, #98765, delivery by Friday), and remained on-topic and helpful throughout. | role_adherence: The assistant consistently maintains a professional, empathetic, and solution-oriented support persona, accurately addressing Sarah by name, referencing her order number, and staying on topic in every turn. | contains-#98765: Output contains \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. 
Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: greeted and addressed the customer as Sarah","Turn 1: correctly referenced order #98765","Turn 2: acknowledged Friday deadline urgency and birthday context","Turn 3: confirmed express shipping upgrade and granted gift wrapping request"],"misses":[],"reasoning":"The assistant demonstrated consistent retention of key context—name, order number, deadline urgency, and new gift wrapping request—across all turns. All details were handled professionally and supportively.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":896,"output":172},"duration_ms":7625,"started_at":"2026-03-09T09:32:48.409Z","ended_at":"2026-03-09T09:32:56.034Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: directly addressed order status and offered relevant solutions","Turn 2: responded to urgency, provided options tailored to delivery deadline","Turn 3: confirmed express shipping upgrade and gift wrapping addition, maintained context with customer name and order number"],"misses":[],"reasoning":"All assistant turns directly addressed the user's requests, maintained context (Sarah, #98765, delivery by Friday), and remained on-topic and helpful throughout.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":917,"output":164},"duration_ms":2039,"started_at":"2026-03-09T09:32:56.034Z","ended_at":"2026-03-09T09:32:58.073Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional, empathetic greeting and prompt action referencing Sarah and order #98765","Turn 2: maintains professional tone, acknowledges urgency, offers tailored solutions and addresses Sarah by name","Turn 3: continues professional, solution-oriented persona, references Sarah and order 
details, provides clear next steps"],"misses":[],"reasoning":"The assistant consistently maintains a professional, empathetic, and solution-oriented support persona, accurately addressing Sarah by name, referencing her order number, and staying on topic in every turn.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":931,"output":180},"duration_ms":1859,"started_at":"2026-03-09T09:32:58.073Z","ended_at":"2026-03-09T09:32:59.932Z"},{"name":"contains-#98765","type":"contains","score":1,"weight":1,"verdict":"pass","hits":["Output contains \"#98765\""],"misses":[],"reasoning":"Output contains \"#98765\"","duration_ms":0,"started_at":"2026-03-09T09:32:59.932Z","ended_at":"2026-03-09T09:32:59.932Z"}],"execution_status":"ok"} -{"timestamp":"2026-03-09T09:33:04.290Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 3: did not repeat router restart, acknowledging it was already tried","Turn 3: asked about router light color per troubleshooting escalation","Turn 5: gave a simple cable check, aligned with user's stated comfort level","Turn 6: referenced earlier troubleshooting steps and kept instructions clear","Turn 1: responded empathetically, acknowledged previous router restart and user’s technical level","Turn 2: matched troubleshooting to user's description, avoided repeating router restart","Turn 3: gave clear next steps, offered line tests or technician, stayed simple","Turn 1: empathetic and clear tone, acknowledges user's prior actions and beginner level","Turn 2: explanation remains simple and avoids repeating router restart, maintains technical support persona","Turn 3: continues with easy-to-follow instructions, offers next steps in plain language, stays within support role"],"misses":[],"target":"default","reasoning":"context_retention: The assistant consistently tracks the user's prior actions (router restart, cable check), avoids repeat suggestions, and adapts 
explanations for a beginner. All assistant turns show strong context retention. | conversation_relevancy: Every assistant turn directly addressed the user's current questions, tracked previous actions, and kept explanations simple as requested, maintaining full relevance to context and user needs. | role_adherence: The assistant consistently adapts explanations to the user's beginner level, avoids redundant suggestions, tracks context, and maintains a supportive technical support persona throughout all turns.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. 
What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: did not repeat router restart, acknowledging it was already tried","Turn 3: asked about router light color per troubleshooting escalation","Turn 5: gave a simple cable check, aligned with user's stated comfort level","Turn 6: referenced earlier troubleshooting steps and kept instructions clear"],"misses":[],"reasoning":"The assistant consistently tracks the user's prior actions (router restart, cable check), avoids repeat suggestions, and adapts explanations for a beginner. All assistant turns show strong context retention.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":931,"output":180},"duration_ms":3824,"started_at":"2026-03-09T09:32:57.750Z","ended_at":"2026-03-09T09:33:01.574Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: responded empathetically, acknowledged previous router restart and user’s technical level","Turn 2: matched troubleshooting to user's description, avoided repeating router restart","Turn 3: gave clear next steps, offered line tests or technician, stayed simple"],"misses":[],"reasoning":"Every assistant turn directly addressed the user's current questions, tracked previous actions, and kept explanations simple as requested, maintaining full relevance to context and user needs.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":952,"output":165},"duration_ms":1371,"started_at":"2026-03-09T09:33:01.574Z","ended_at":"2026-03-09T09:33:02.945Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic and clear tone, acknowledges user's prior actions and beginner level","Turn 2: explanation remains simple and avoids repeating router restart, maintains technical support persona","Turn 3: continues with 
easy-to-follow instructions, offers next steps in plain language, stays within support role"],"misses":[],"reasoning":"The assistant consistently adapts explanations to the user's beginner level, avoids redundant suggestions, tracks context, and maintains a supportive technical support persona throughout all turns.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":966,"output":172},"duration_ms":1345,"started_at":"2026-03-09T09:33:02.945Z","ended_at":"2026-03-09T09:33:04.290Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:36:26.990Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 3: addressed customer by name (Sarah) and referenced order #98765","Turn 4: acknowledged urgency for Friday birthday deadline","Turn 5 (final): confirmed express shipping upgrade and gift wrapping request","Turn 5 (final): maintained professional, empathetic support persona throughout","Turn 1: acknowledged Sarah by name and addressed the order delay","Turn 2: responded appropriately to the Friday deadline and offered relevant shipping options","Turn 3: confirmed express shipping upgrade and added gift wrapping as requested","Turn 1: professional and empathetic tone matching support role","Turn 2: addressed customer by name, acknowledged urgency, provided solution options","Turn 3: addressed Sarah by name, provided solution-oriented update, fulfilled both requests professionally"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant retained all relevant information: customer name, order number, Friday deadline, and gift wrapping request, providing responses that build on this context in every turn. | conversation_relevancy: All assistant turns were directly relevant to Sarah's requests, maintained context (name, order number, delivery urgency), and addressed her needs professionally without unnecessary information. 
| role_adherence: The assistant consistently addressed the customer by name, maintained a professional and empathetic tone, remembered context (order number, delivery deadline, customer needs), and provided helpful, relevant responses throughout all turns. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. 
Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: addressed customer by name (Sarah) and referenced order #98765","Turn 4: acknowledged urgency for Friday birthday deadline","Turn 5 (final): confirmed express shipping upgrade and gift wrapping request","Turn 5 (final): maintained professional, empathetic support persona throughout"],"misses":[],"reasoning":"The assistant retained all relevant information: customer name, order number, Friday deadline, and gift wrapping request, providing responses that build on this context in every turn.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":892,"output":177},"duration_ms":3817,"started_at":"2026-03-09T09:36:20.589Z","ended_at":"2026-03-09T09:36:24.406Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: acknowledged Sarah by name and addressed the order delay","Turn 2: responded appropriately to the Friday deadline and offered relevant shipping options","Turn 3: confirmed express shipping upgrade and added gift wrapping as requested"],"misses":[],"reasoning":"All assistant turns were directly relevant to Sarah's requests, maintained context (name, order number, delivery urgency), and addressed her needs professionally without unnecessary information.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":913,"output":160},"duration_ms":1315,"started_at":"2026-03-09T09:36:24.406Z","ended_at":"2026-03-09T09:36:25.721Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional and empathetic tone matching support role","Turn 2: addressed customer by name, acknowledged urgency, provided solution options","Turn 3: addressed Sarah by name, provided solution-oriented update, fulfilled both requests 
professionally"],"misses":[],"reasoning":"The assistant consistently addressed the customer by name, maintained a professional and empathetic tone, remembered context (order number, delivery deadline, customer needs), and provided helpful, relevant responses throughout all turns.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":927,"output":170},"duration_ms":1268,"started_at":"2026-03-09T09:36:25.721Z","ended_at":"2026-03-09T09:36:26.989Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":1,"started_at":"2026-03-09T09:36:26.989Z","ended_at":"2026-03-09T09:36:26.990Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T09:36:29.463Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 1: Acknowledged router restart was already tried and avoided repeating that suggestion","Turn 1 & 2: Matched explanation complexity to user's 'not very tech-savvy' preference","Turn 3: Skipped redundant suggestions and adapted next-step instructions clearly for a beginner","Turn 3: Referenced prior attempts and built on info about orange blinking router light","Turn 1: skipped router restart and asked about router lights","Turn 2: correctly explained meaning of orange blinking and suggested cable check","Turn 3: provided next simple steps and options for further support if issue persists","Turn 1: empathetic, professional tone; remembers router restart and skips that step","Turn 2: explains technical information simply, focused on observable router behavior","Turn 3: avoids repeating previous steps, offers clear, simple next steps tailored to beginner user"],"misses":[],"target":"default","reasoning":"context_retention: The assistant consistently retained context about the user's technical comfort level, the previously attempted router restart, 
and new information about the orange blinking light. All instructions and next steps were adapted appropriately without redundantly suggesting steps the user had already tried. | conversation_relevancy: Each assistant turn tracked the user's WiFi issue, avoided repeating prior steps, matched explanation complexity to beginner level, and appropriately advanced troubleshooting with relevant guidance. | role_adherence: The assistant maintained a consistent technical support persona and adapted explanations to the user's beginner level throughout all turns, tracking prior actions and not repeating already-tried steps.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. 
What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Acknowledged router restart was already tried and avoided repeating that suggestion","Turn 1 & 2: Matched explanation complexity to user's 'not very tech-savvy' preference","Turn 3: Skipped redundant suggestions and adapted next-step instructions clearly for a beginner","Turn 3: Referenced prior attempts and built on info about orange blinking router light"],"misses":[],"reasoning":"The assistant consistently retained context about the user's technical comfort level, the previously attempted router restart, and new information about the orange blinking light. All instructions and next steps were adapted appropriately without redundantly suggesting steps the user had already tried.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":958,"output":208},"duration_ms":5502,"started_at":"2026-03-09T09:36:21.080Z","ended_at":"2026-03-09T09:36:26.582Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: skipped router restart and asked about router lights","Turn 2: correctly explained meaning of orange blinking and suggested cable check","Turn 3: provided next simple steps and options for further support if issue persists"],"misses":[],"reasoning":"Each assistant turn tracked the user's WiFi issue, avoided repeating prior steps, matched explanation complexity to beginner level, and appropriately advanced troubleshooting with relevant guidance.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":979,"output":159},"duration_ms":1383,"started_at":"2026-03-09T09:36:26.582Z","ended_at":"2026-03-09T09:36:27.965Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic, professional tone; remembers router restart and skips that step","Turn 2: 
explains technical information simply, focused on observable router behavior","Turn 3: avoids repeating previous steps, offers clear, simple next steps tailored to beginner user"],"misses":[],"reasoning":"The assistant maintained a consistent technical support persona and adapted explanations to the user's beginner level throughout all turns, tracking prior actions and not repeating already-tried steps.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":993,"output":167},"duration_ms":1498,"started_at":"2026-03-09T09:36:27.965Z","ended_at":"2026-03-09T09:36:29.463Z"}],"execution_status":"ok"} diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md index 3191d245..f802ca24 100644 --- a/examples/features/multi-turn-conversation/judges/context-retention.md +++ b/examples/features/multi-turn-conversation/judges/context-retention.md @@ -32,7 +32,7 @@ Your overall `score` should be the average of per-turn scores. {{ criteria }} [[ ## conversation (all turns with roles) ## ]] -{{ conversation }} +{{ input }} [[ ## agent response (final turn) ## ]] {{ answer }} diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md index 6c86f2a2..eb527afe 100644 --- a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md +++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md @@ -33,7 +33,7 @@ Your overall `score` should be the average of per-turn scores. 
{{ criteria }} [[ ## conversation (all turns with roles) ## ]] -{{ conversation }} +{{ input }} [[ ## agent response (final turn) ## ]] {{ answer }} diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md index 74203704..fa7dc838 100644 --- a/examples/features/multi-turn-conversation/judges/role-adherence.md +++ b/examples/features/multi-turn-conversation/judges/role-adherence.md @@ -35,7 +35,7 @@ Your overall `score` should be the average of per-turn scores. {{ criteria }} [[ ## conversation (all turns with roles) ## ]] -{{ conversation }} +{{ input }} [[ ## agent response (final turn) ## ]] {{ answer }} diff --git a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts index 1fc61a44..14af7f25 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts @@ -68,7 +68,7 @@ function assembleFreeform( : evalCase.question; const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2), + [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2), [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2), [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(), @@ -76,7 +76,6 @@ function assembleFreeform( [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? 
'', - [TEMPLATE_VARIABLES.CONVERSATION]: JSON.stringify(evalCase.input, null, 2), }; const systemPrompt = buildOutputSchema(); diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 9817b6b3..76d63de0 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -119,7 +119,7 @@ export class LlmJudgeEvaluator implements Evaluator { // Prepare template variables for substitution const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input_segments, null, 2), + [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify( context.evalCase.expected_output, null, @@ -131,7 +131,6 @@ export class LlmJudgeEvaluator implements Evaluator { [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? 
'', - [TEMPLATE_VARIABLES.CONVERSATION]: JSON.stringify(context.evalCase.input, null, 2), }; // Build system prompt (only the mandatory output schema) diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index 9084b47e..fba54c85 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -11,7 +11,6 @@ export const TEMPLATE_VARIABLES = { INPUT: 'input', OUTPUT: 'output', FILE_CHANGES: 'file_changes', - CONVERSATION: 'conversation', } as const; /** diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index c35643e1..df7ebb5c 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -94,9 +94,10 @@ File Changes: {{file_changes}} expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify input JSON stringification + // Verify input JSON stringification (includes role annotations) expect(request?.question).toContain('Input Messages: ['); - expect(request?.question).toContain('"value": "Input Message"'); + expect(request?.question).toContain('"role": "user"'); + expect(request?.question).toContain('"content": "User Input Message"'); // Verify expected_output JSON stringification expect(request?.question).toContain('Expected Messages: ['); @@ -190,9 +191,9 @@ Expected Messages: {{ expected_output }} expect(request?.question).toContain('Reference: Reference Answer Text'); expect(request?.question).toContain('Candidate: Candidate Answer Text'); - // Verify JSON stringified variables were also substituted + // Verify JSON stringified variables were also substituted (includes role annotations) expect(request?.question).toContain('Input Messages: ['); - 
expect(request?.question).toContain('"value": "Input Message"'); + expect(request?.question).toContain('"content": "User Input Message"'); expect(request?.question).toContain('Expected Messages: ['); expect(request?.question).toContain('"value": "Expected Output Message"'); From 5f2dfe40055dc0bed5bcadfda084e6eae6b0b3cd Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 10:17:07 +0000 Subject: [PATCH 12/13] fix: resolve file references in {{ input }} judge template variable File segments in input messages only contained paths (e.g. { type: 'file', value: 'src/app.ts' }) without the actual file content. The resolved content existed only in input_segments. Now resolveInputWithFileContent() enriches the serialized input by looking up file text from input_segments, so judges see the full file content alongside role annotations. Co-Authored-By: Claude Opus 4.6 --- .../evals/dataset.eval.baseline.jsonl | 4 +- .../src/evaluation/evaluators/llm-judge.ts | 59 ++++++++++++++++++- .../evaluation/evaluators_variables.test.ts | 57 ++++++++++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl index e2e12b5e..4282abf3 100644 --- a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl +++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl @@ -1,2 +1,2 @@ -{"timestamp":"2026-03-09T09:36:26.990Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 3: addressed customer by name (Sarah) and referenced order #98765","Turn 4: acknowledged urgency for Friday birthday deadline","Turn 5 (final): confirmed express shipping upgrade and gift wrapping request","Turn 5 (final): maintained professional, empathetic support persona throughout","Turn 1: acknowledged Sarah by name and addressed the order delay","Turn 2: responded 
appropriately to the Friday deadline and offered relevant shipping options","Turn 3: confirmed express shipping upgrade and added gift wrapping as requested","Turn 1: professional and empathetic tone matching support role","Turn 2: addressed customer by name, acknowledged urgency, provided solution options","Turn 3: addressed Sarah by name, provided solution-oriented update, fulfilled both requests professionally"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant retained all relevant information: customer name, order number, Friday deadline, and gift wrapping request, providing responses that build on this context in every turn. | conversation_relevancy: All assistant turns were directly relevant to Sarah's requests, maintained context (name, order number, delivery urgency), and addressed her needs professionally without unnecessary information. | role_adherence: The assistant consistently addressed the customer by name, maintained a professional and empathetic tone, remembered context (order number, delivery deadline, customer needs), and provided helpful, relevant responses throughout all turns. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. 
I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: addressed customer by name (Sarah) and referenced order #98765","Turn 4: acknowledged urgency for Friday birthday deadline","Turn 5 (final): confirmed express shipping upgrade and gift wrapping request","Turn 5 (final): maintained professional, empathetic support persona throughout"],"misses":[],"reasoning":"The assistant retained all relevant information: customer name, order number, Friday deadline, and gift wrapping request, providing responses that build on this context in every turn.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":892,"output":177},"duration_ms":3817,"started_at":"2026-03-09T09:36:20.589Z","ended_at":"2026-03-09T09:36:24.406Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: acknowledged Sarah by name and addressed the order delay","Turn 2: responded appropriately to the Friday deadline and offered relevant shipping options","Turn 3: confirmed express shipping upgrade and added gift wrapping as requested"],"misses":[],"reasoning":"All assistant turns were directly relevant to Sarah's requests, maintained context (name, order number, delivery urgency), and addressed her needs professionally without unnecessary 
information.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":913,"output":160},"duration_ms":1315,"started_at":"2026-03-09T09:36:24.406Z","ended_at":"2026-03-09T09:36:25.721Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional and empathetic tone matching support role","Turn 2: addressed customer by name, acknowledged urgency, provided solution options","Turn 3: addressed Sarah by name, provided solution-oriented update, fulfilled both requests professionally"],"misses":[],"reasoning":"The assistant consistently addressed the customer by name, maintained a professional and empathetic tone, remembered context (order number, delivery deadline, customer needs), and provided helpful, relevant responses throughout all turns.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":927,"output":170},"duration_ms":1268,"started_at":"2026-03-09T09:36:25.721Z","ended_at":"2026-03-09T09:36:26.989Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":1,"started_at":"2026-03-09T09:36:26.989Z","ended_at":"2026-03-09T09:36:26.990Z"}],"execution_status":"ok"} -{"timestamp":"2026-03-09T09:36:29.463Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 1: Acknowledged router restart was already tried and avoided repeating that suggestion","Turn 1 & 2: Matched explanation complexity to user's 'not very tech-savvy' preference","Turn 3: Skipped redundant suggestions and adapted next-step instructions clearly for a beginner","Turn 3: Referenced prior attempts and built on info about orange blinking router light","Turn 1: skipped router restart and asked about router lights","Turn 2: correctly explained meaning of orange blinking and suggested cable 
check","Turn 3: provided next simple steps and options for further support if issue persists","Turn 1: empathetic, professional tone; remembers router restart and skips that step","Turn 2: explains technical information simply, focused on observable router behavior","Turn 3: avoids repeating previous steps, offers clear, simple next steps tailored to beginner user"],"misses":[],"target":"default","reasoning":"context_retention: The assistant consistently retained context about the user's technical comfort level, the previously attempted router restart, and new information about the orange blinking light. All instructions and next steps were adapted appropriately without redundantly suggesting steps the user had already tried. | conversation_relevancy: Each assistant turn tracked the user's WiFi issue, avoided repeating prior steps, matched explanation complexity to beginner level, and appropriately advanced troubleshooting with relevant guidance. | role_adherence: The assistant maintained a consistent technical support persona and adapted explanations to the user's beginner level throughout all turns, tracking prior actions and not repeating already-tried steps.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. 
Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Acknowledged router restart was already tried and avoided repeating that suggestion","Turn 1 & 2: Matched explanation complexity to user's 'not very tech-savvy' preference","Turn 3: Skipped redundant suggestions and adapted next-step instructions clearly for a beginner","Turn 3: Referenced prior attempts and built on info about orange blinking router light"],"misses":[],"reasoning":"The assistant consistently retained context about the user's technical comfort level, the previously attempted router restart, and new information about the orange blinking light. 
All instructions and next steps were adapted appropriately without redundantly suggesting steps the user had already tried.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":958,"output":208},"duration_ms":5502,"started_at":"2026-03-09T09:36:21.080Z","ended_at":"2026-03-09T09:36:26.582Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: skipped router restart and asked about router lights","Turn 2: correctly explained meaning of orange blinking and suggested cable check","Turn 3: provided next simple steps and options for further support if issue persists"],"misses":[],"reasoning":"Each assistant turn tracked the user's WiFi issue, avoided repeating prior steps, matched explanation complexity to beginner level, and appropriately advanced troubleshooting with relevant guidance.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":979,"output":159},"duration_ms":1383,"started_at":"2026-03-09T09:36:26.582Z","ended_at":"2026-03-09T09:36:27.965Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic, professional tone; remembers router restart and skips that step","Turn 2: explains technical information simply, focused on observable router behavior","Turn 3: avoids repeating previous steps, offers clear, simple next steps tailored to beginner user"],"misses":[],"reasoning":"The assistant maintained a consistent technical support persona and adapted explanations to the user's beginner level throughout all turns, tracking prior actions and not repeating already-tried steps.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":993,"output":167},"duration_ms":1498,"started_at":"2026-03-09T09:36:27.965Z","ended_at":"2026-03-09T09:36:29.463Z"}],"execution_status":"ok"} 
+{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 3: remembers and uses customer name Sarah","Turn 3: correctly references order number #98765","Turn 4: acknowledges urgency and Friday deadline","Final Turn: confirms express shipping upgrade and adds gift wrapping as requested","Turn 1: directly acknowledged Sarah by name and referenced her order number","Turn 2: offered relevant expedited shipping solutions aligned with the Friday deadline","Turn 3: confirmed express shipping upgrade and gift wrapping as requested","Turn 3: invited further input for a gift message and additional support","Turn 1: professional and empathetic tone matching support agent role","Turn 2: maintains context—addresses Sarah by name and references order #98765, delivers solution-focused options","Turn 3: addresses Sarah by name, updates her on order status, and offers further personalized support","Turn 3: maintains professional, empathetic, and solution-oriented persona throughout"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant consistently recalls all key context throughout the conversation: Sarah's name, order number, delivery deadline, and gift wrapping request. Professional persona maintained across all turns. | conversation_relevancy: Every assistant turn is contextually relevant, addresses Sarah's immediate needs, and maintains a professional, empathetic support persona. The agent consistently remembers key details and offers solutions directly tied to the conversation. | role_adherence: All assistant turns consistently follow the assigned professional and empathetic support persona, maintain context, and reference customer name, order number, and deadline without breaking character. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. 
Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: remembers and uses customer name Sarah","Turn 3: correctly references order number #98765","Turn 4: acknowledges urgency and Friday deadline","Final Turn: confirms express shipping upgrade and adds gift wrapping as requested"],"misses":[],"reasoning":"The assistant consistently recalls all key context throughout the conversation: Sarah's name, order number, delivery deadline, and gift wrapping request. 
Professional persona maintained across all turns.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":904,"output":166},"duration_ms":3625,"started_at":"2026-03-09T10:16:26.989Z","ended_at":"2026-03-09T10:16:30.614Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: directly acknowledged Sarah by name and referenced her order number","Turn 2: offered relevant expedited shipping solutions aligned with the Friday deadline","Turn 3: confirmed express shipping upgrade and gift wrapping as requested","Turn 3: invited further input for a gift message and additional support"],"misses":[],"reasoning":"Every assistant turn is contextually relevant, addresses Sarah's immediate needs, and maintains a professional, empathetic support persona. The agent consistently remembers key details and offers solutions directly tied to the conversation.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":925,"output":184},"duration_ms":1456,"started_at":"2026-03-09T10:16:30.614Z","ended_at":"2026-03-09T10:16:32.070Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional and empathetic tone matching support agent role","Turn 2: maintains context—addresses Sarah by name and references order #98765, delivers solution-focused options","Turn 3: addresses Sarah by name, updates her on order status, and offers further personalized support","Turn 3: maintains professional, empathetic, and solution-oriented persona throughout"],"misses":[],"reasoning":"All assistant turns consistently follow the assigned professional and empathetic support persona, maintain context, and reference customer name, order number, and deadline without breaking 
character.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":939,"output":190},"duration_ms":1438,"started_at":"2026-03-09T10:16:32.070Z","ended_at":"2026-03-09T10:16:33.508Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":0,"started_at":"2026-03-09T10:16:33.508Z","ended_at":"2026-03-09T10:16:33.508Z"}],"execution_status":"ok"} +{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 1: Acknowledged prior router restart, skipped that step","Turn 2: Referred to orange blinking light user described in turn 4","Turn 2: Kept instructions simple, matching stated technical comfort level","Turn 2: Suggested watching WiFi and router lights, building on earlier troubleshooting steps","Turn 1: acknowledged router restart, avoided suggesting a repeat, and asked a clear follow-up for diagnostics","Turn 2: matched user's technical level and provided a simple, new troubleshooting step focusing on cable seating","Turn 3: offered next steps if issue recurs, avoided repeating restarted router, gave beginner-friendly instructions","Turn 1: empathetic, professional tone matching technical support role and adapted questions to beginner level","Turn 2: offered an easy next step and gave clear, simple instructions without repeating router restart","Turn 3: maintained helpful, patient tone; provided simple explanations and continued to avoid repeated steps"],"misses":[],"target":"default","reasoning":"context_retention: The assistant consistently retains and builds on context: it avoids repeating already-tried solutions, references the user's technical comfort, tracks symptoms, and provides sequenced, simple next steps. 
| conversation_relevancy: All assistant turns respond directly to the user's request, build appropriately on prior troubleshooting, match the user's technical comfort, and do not repeat already-tried solutions. | role_adherence: The assistant maintained the technical support persona consistently, adjusted complexity for the user's beginner level, and never repeated previously attempted steps across all turns.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. 
What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Acknowledged prior router restart, skipped that step","Turn 2: Referred to orange blinking light user described in turn 4","Turn 2: Kept instructions simple, matching stated technical comfort level","Turn 2: Suggested watching WiFi and router lights, building on earlier troubleshooting steps"],"misses":[],"reasoning":"The assistant consistently retains and builds on context: it avoids repeating already-tried solutions, references the user's technical comfort, tracks symptoms, and provides sequenced, simple next steps.","details":{"scores_per_turn":[1,1],"relevant_turns":2,"total_turns":2},"token_usage":{"input":1004,"output":181},"duration_ms":2908,"started_at":"2026-03-09T10:16:28.589Z","ended_at":"2026-03-09T10:16:31.497Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: acknowledged router restart, avoided suggesting a repeat, and asked a clear follow-up for diagnostics","Turn 2: matched user's technical level and provided a simple, new troubleshooting step focusing on cable seating","Turn 3: offered next steps if issue recurs, avoided repeating restarted router, gave beginner-friendly instructions"],"misses":[],"reasoning":"All assistant turns respond directly to the user's request, build appropriately on prior troubleshooting, match the user's technical comfort, and do not repeat already-tried solutions.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":1025,"output":180},"duration_ms":1801,"started_at":"2026-03-09T10:16:31.497Z","ended_at":"2026-03-09T10:16:33.298Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic, professional tone matching technical support role and adapted questions to beginner level","Turn 2: offered an easy next step and gave clear, 
simple instructions without repeating router restart","Turn 3: maintained helpful, patient tone; provided simple explanations and continued to avoid repeated steps"],"misses":[],"reasoning":"The assistant maintained the technical support persona consistently, adjusted complexity for the user's beginner level, and never repeated previously attempted steps across all turns.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":1039,"output":170},"duration_ms":1399,"started_at":"2026-03-09T10:16:33.298Z","ended_at":"2026-03-09T10:16:34.697Z"}],"execution_status":"ok"} diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 76d63de0..4bd97b6a 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -6,6 +6,7 @@ import { extractLastAssistantContent } from '../providers/types.js'; import { TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; import type { JsonObject, RubricItem } from '../types.js'; +import { isJsonObject } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; @@ -119,7 +120,11 @@ export class LlmJudgeEvaluator implements Evaluator { // Prepare template variables for substitution const variables = { - [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2), + [TEMPLATE_VARIABLES.INPUT]: JSON.stringify( + resolveInputWithFileContent(context.evalCase.input, context.evalCase.input_segments), + null, + 2, + ), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify( context.evalCase.expected_output, null, @@ -662,3 +667,55 @@ function calculateScoreRangeResult( }, }; } + +/** + * Resolve file references in input messages using resolved content from input_segments. 
+ * + * `input` (TestMessage[]) contains raw file references with paths but no content. + * `input_segments` (JsonObject[]) contains resolved file segments with actual file text. + * This function produces a copy of `input` where file segments include the resolved `text`. + */ +function resolveInputWithFileContent( + input: readonly JsonObject[], + inputSegments: readonly JsonObject[], +): readonly JsonObject[] { + // Build a map of file path -> resolved text from input_segments + const fileContentsByPath = new Map(); + for (const segment of inputSegments) { + if ( + segment.type === 'file' && + typeof segment.path === 'string' && + typeof segment.text === 'string' + ) { + fileContentsByPath.set(segment.path, segment.text); + } + } + + if (fileContentsByPath.size === 0) { + return input; + } + + return input.map((message) => { + const content = message.content; + if (!Array.isArray(content)) { + return message; + } + + let modified = false; + const resolvedContent = (content as readonly JsonObject[]).map((segment) => { + if (!isJsonObject(segment)) { + return segment; + } + if (segment.type === 'file' && typeof segment.value === 'string') { + const fileText = fileContentsByPath.get(segment.value); + if (fileText !== undefined) { + modified = true; + return { ...segment, text: fileText }; + } + } + return segment; + }); + + return modified ? 
{ ...message, content: resolvedContent } : message; + }); +} diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index df7ebb5c..f0bc90b3 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -200,4 +200,61 @@ Expected Messages: {{ expected_output }} // Verify no unreplaced template markers remain expect(request?.question).not.toMatch(/\{\{\s*\w+\s*\}\}/); }); + + it('resolves file references in {{ input }} using input_segments content', async () => { + const testCaseWithFiles: EvalTest = { + ...baseTestCase, + input: [ + { + role: 'user', + content: [ + { type: 'file', value: 'src/app.ts' }, + { type: 'text', value: 'Review this code' }, + ], + }, + ], + input_segments: [ + { type: 'file', path: 'src/app.ts', text: 'console.log("hello world");' }, + { type: 'text', value: 'Review this code' }, + ], + }; + + const customPrompt = `Input: {{ input }}`; + + const judgeProvider = new CapturingProvider({ + text: JSON.stringify({ + score: 0.9, + hits: ['Good'], + misses: [], + reasoning: 'OK', + }), + }); + + const evaluator = new LlmJudgeEvaluator({ + resolveJudgeProvider: async () => judgeProvider, + evaluatorTemplate: customPrompt, + }); + + await evaluator.evaluate({ + evalCase: { ...testCaseWithFiles, evaluator: 'llm-judge' }, + candidate: 'Looks good', + target: baseTarget, + provider: judgeProvider, + attempt: 0, + promptInputs: { question: 'Review this code', guidelines: '' }, + now: new Date(), + }); + + const request = judgeProvider.lastRequest; + expect(request).toBeDefined(); + + // File content from input_segments should be resolved into the {{ input }} variable + // Content is JSON-stringified so quotes are escaped + expect(request?.question).toContain('console.log'); + expect(request?.question).toContain('hello world'); + // The resolved segment should have a "text" field with the file 
content + expect(request?.question).toContain('"text"'); + // Original file path should still be present + expect(request?.question).toContain('src/app.ts'); + }); }); From c22b2a143e455c55bb2a0584048223cc60360f20 Mon Sep 17 00:00:00 2001 From: Christopher Date: Mon, 9 Mar 2026 10:17:52 +0000 Subject: [PATCH 13/13] fix: use string literal instead of template literal in test Co-Authored-By: Claude Opus 4.6 --- packages/core/test/evaluation/evaluators_variables.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts index f0bc90b3..fb21b0f2 100644 --- a/packages/core/test/evaluation/evaluators_variables.test.ts +++ b/packages/core/test/evaluation/evaluators_variables.test.ts @@ -219,7 +219,7 @@ Expected Messages: {{ expected_output }} ], }; - const customPrompt = `Input: {{ input }}`; + const customPrompt = 'Input: {{ input }}'; const judgeProvider = new CapturingProvider({ text: JSON.stringify({