diff --git a/examples/features/multi-turn-conversation/README.md b/examples/features/multi-turn-conversation/README.md
new file mode 100644
index 00000000..aa18b94e
--- /dev/null
+++ b/examples/features/multi-turn-conversation/README.md
@@ -0,0 +1,37 @@
+# Multi-Turn Conversation Evaluation
+
+Demonstrates evaluating multi-turn conversation quality using composable
+`llm-judge` prompt templates with per-turn score breakdowns.
+
+## What this shows
+
+1. **Multi-turn input** — conversations with 4+ user/assistant turns where
+   context retention matters
+2. **Conversation-aware judge prompts** — markdown templates that receive the
+   full `{{output}}` Message[] array and evaluate conversation-level qualities
+3. **Per-turn score breakdown** — judges return structured `details` with
+   per-turn scores, not just a flat conversation score
+4. **Composability** — multiple llm-judge evaluators combined with
+   deterministic assertions (e.g., `contains`)
+
+## Judge dimensions
+
+| Judge | What it evaluates |
+|-------|-------------------|
+| `context-retention.md` | Does the agent remember information from earlier turns? |
+| `conversation-relevancy.md` | Are responses relevant to the current request and conversation? |
+| `role-adherence.md` | Does the agent maintain its assigned persona? |
+
+## Running
+
+```bash
+agentv run default --filter multi-turn-conversation
+```
+
+## Creating your own conversation evaluator
+
+1. Create a markdown file in `judges/`
+2. Use `{{ output }}` to receive the full conversation Message[] array
+3. Use `{{ criteria }}` for the test-specific evaluation criteria
+4. Instruct the judge to return `details` with per-turn metrics
+5. Reference it in your YAML: `type: llm-judge` / `prompt: ./judges/your-judge.md`
diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl
new file mode 100644
index 00000000..4282abf3
--- /dev/null
+++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.jsonl
@@ -0,0 +1,2 @@
+{"timestamp":"2026-03-09T10:16:33.509Z","test_id":"support-context-retention","dataset":"dataset.eval","score":0.8,"hits":["Turn 3: remembers and uses customer name Sarah","Turn 3: correctly references order number #98765","Turn 4: acknowledges urgency and Friday deadline","Final Turn: confirms express shipping upgrade and adds gift wrapping as requested","Turn 1: directly acknowledged Sarah by name and referenced her order number","Turn 2: offered relevant expedited shipping solutions aligned with the Friday deadline","Turn 3: confirmed express shipping upgrade and gift wrapping as requested","Turn 3: invited further input for a gift message and additional support","Turn 1: professional and empathetic tone matching support agent role","Turn 2: maintains context—addresses Sarah by name and references order #98765, delivers solution-focused options","Turn 3: addresses Sarah by name, updates her on order status, and offers further personalized support","Turn 3: maintains professional, empathetic, and solution-oriented persona throughout"],"misses":["Output does not contain \"#98765\""],"target":"default","reasoning":"context_retention: The assistant consistently recalls all key context throughout the conversation: Sarah's name, order number, delivery deadline, and gift wrapping request. Professional persona maintained across all turns. | conversation_relevancy: Every assistant turn is contextually relevant, addresses Sarah's immediate needs, and maintains a professional, empathetic support persona. The agent consistently remembers key details and offers solutions directly tied to the conversation. | role_adherence: All assistant turns consistently follow the assigned professional and empathetic support persona, maintain context, and reference customer name, order number, and deadline without breaking character. | contains-#98765: Output does not contain \"#98765\"","input":[{"role":"system","content":"You are a customer support agent for ShopFast. Be professional, empathetic,\nand solution-oriented. Always address customers by name once introduced."},{"role":"user","content":"Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.\nMy order number is #98765."},{"role":"assistant","content":"Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up\norder #98765 for you right away. I can see it was shipped on March 3rd via\nstandard delivery. It appears there's been a delay at the regional sorting\nfacility. Would you like me to check on expedited options?"},{"role":"user","content":"Yes please. I really need it by Friday — it's a birthday gift for my daughter."},{"role":"assistant","content":"I completely understand the urgency, Sarah. A birthday gift definitely can't\nbe late! I've flagged order #98765 for priority handling. I can offer you two\noptions: (1) upgrade to express shipping at no extra cost, estimated delivery\nThursday, or (2) reroute to our nearest pickup location where it could be\navailable as early as Wednesday."},{"role":"user","content":"The express shipping sounds great. Also, can you add gift wrapping?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 3: remembers and uses customer name Sarah","Turn 3: correctly references order number #98765","Turn 4: acknowledges urgency and Friday deadline","Final Turn: confirms express shipping upgrade and adds gift wrapping as requested"],"misses":[],"reasoning":"The assistant consistently recalls all key context throughout the conversation: Sarah's name, order number, delivery deadline, and gift wrapping request. Professional persona maintained across all turns.","details":{"scores_per_turn":[1,1,1],"relevant_turns":3,"total_turns":3},"token_usage":{"input":904,"output":166},"duration_ms":3625,"started_at":"2026-03-09T10:16:26.989Z","ended_at":"2026-03-09T10:16:30.614Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: directly acknowledged Sarah by name and referenced her order number","Turn 2: offered relevant expedited shipping solutions aligned with the Friday deadline","Turn 3: confirmed express shipping upgrade and gift wrapping as requested","Turn 3: invited further input for a gift message and additional support"],"misses":[],"reasoning":"Every assistant turn is contextually relevant, addresses Sarah's immediate needs, and maintains a professional, empathetic support persona. The agent consistently remembers key details and offers solutions directly tied to the conversation.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":925,"output":184},"duration_ms":1456,"started_at":"2026-03-09T10:16:30.614Z","ended_at":"2026-03-09T10:16:32.070Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: professional and empathetic tone matching support agent role","Turn 2: maintains context—addresses Sarah by name and references order #98765, delivers solution-focused options","Turn 3: addresses Sarah by name, updates her on order status, and offers further personalized support","Turn 3: maintains professional, empathetic, and solution-oriented persona throughout"],"misses":[],"reasoning":"All assistant turns consistently follow the assigned professional and empathetic support persona, maintain context, and reference customer name, order number, and deadline without breaking character.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":939,"output":190},"duration_ms":1438,"started_at":"2026-03-09T10:16:32.070Z","ended_at":"2026-03-09T10:16:33.508Z"},{"name":"contains-#98765","type":"contains","score":0,"weight":1,"verdict":"fail","hits":[],"misses":["Output does not contain \"#98765\""],"reasoning":"Output does not contain \"#98765\"","duration_ms":0,"started_at":"2026-03-09T10:16:33.508Z","ended_at":"2026-03-09T10:16:33.508Z"}],"execution_status":"ok"}
+{"timestamp":"2026-03-09T10:16:34.697Z","test_id":"support-troubleshooting-flow","dataset":"dataset.eval","score":1,"hits":["Turn 1: Acknowledged prior router restart, skipped that step","Turn 2: Referred to orange blinking light user described in turn 4","Turn 2: Kept instructions simple, matching stated technical comfort level","Turn 2: Suggested watching WiFi and router lights, building on earlier troubleshooting steps","Turn 1: acknowledged router restart, avoided suggesting a repeat, and asked a clear follow-up for diagnostics","Turn 2: matched user's technical level and provided a simple, new troubleshooting step focusing on cable seating","Turn 3: offered next steps if issue recurs, avoided repeating restarted router, gave beginner-friendly instructions","Turn 1: empathetic, professional tone matching technical support role and adapted questions to beginner level","Turn 2: offered an easy next step and gave clear, simple instructions without repeating router restart","Turn 3: maintained helpful, patient tone; provided simple explanations and continued to avoid repeated steps"],"misses":[],"target":"default","reasoning":"context_retention: The assistant consistently retains and builds on context: it avoids repeating already-tried solutions, references the user's technical comfort, tracks symptoms, and provides sequenced, simple next steps. | conversation_relevancy: All assistant turns respond directly to the user's request, build appropriately on prior troubleshooting, match the user's technical comfort, and do not repeat already-tried solutions. | role_adherence: The assistant maintained the technical support persona consistently, adjusted complexity for the user's beginner level, and never repeated previously attempted steps across all turns.","input":[{"role":"system","content":"You are a technical support agent for NetConnect ISP. Adapt your\nexplanations to the customer's technical level. Never suggest steps\nthe customer has already tried."},{"role":"user","content":"My WiFi keeps dropping every 10 minutes. I already restarted my router\ntwice today. I'm not very tech-savvy so please keep it simple."},{"role":"assistant","content":"I'm sorry you're dealing with that — WiFi dropping out is really\nfrustrating! Since you've already restarted the router, let's skip that\nstep. Could you tell me: does the light on the front of your router stay\nsolid green, or does it blink or turn a different color when the WiFi drops?"},{"role":"user","content":"It blinks orange for a few seconds then goes back to green."},{"role":"assistant","content":"That orange blinking usually means the router is briefly losing its\nconnection to our network. Let's try something simple: can you find the\nsmall cable plugged into the back of your router labeled \"WAN\" or\n\"Internet\"? Just unplug it, wait 10 seconds, and plug it back in firmly.\nSometimes it just needs a better connection."},{"role":"user","content":"OK I did that. What if it happens again?"}],"scores":[{"name":"context_retention","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: Acknowledged prior router restart, skipped that step","Turn 2: Referred to orange blinking light user described in turn 4","Turn 2: Kept instructions simple, matching stated technical comfort level","Turn 2: Suggested watching WiFi and router lights, building on earlier troubleshooting steps"],"misses":[],"reasoning":"The assistant consistently retains and builds on context: it avoids repeating already-tried solutions, references the user's technical comfort, tracks symptoms, and provides sequenced, simple next steps.","details":{"scores_per_turn":[1,1],"relevant_turns":2,"total_turns":2},"token_usage":{"input":1004,"output":181},"duration_ms":2908,"started_at":"2026-03-09T10:16:28.589Z","ended_at":"2026-03-09T10:16:31.497Z"},{"name":"conversation_relevancy","type":"llm-judge","score":1,"weight":2,"verdict":"pass","hits":["Turn 1: acknowledged router restart, avoided suggesting a repeat, and asked a clear follow-up for diagnostics","Turn 2: matched user's technical level and provided a simple, new troubleshooting step focusing on cable seating","Turn 3: offered next steps if issue recurs, avoided repeating restarted router, gave beginner-friendly instructions"],"misses":[],"reasoning":"All assistant turns respond directly to the user's request, build appropriately on prior troubleshooting, match the user's technical comfort, and do not repeat already-tried solutions.","details":{"scores_per_turn":[1,1,1],"on_topic_turns":3,"total_turns":3},"token_usage":{"input":1025,"output":180},"duration_ms":1801,"started_at":"2026-03-09T10:16:31.497Z","ended_at":"2026-03-09T10:16:33.298Z"},{"name":"role_adherence","type":"llm-judge","score":1,"weight":1,"verdict":"pass","hits":["Turn 1: empathetic, professional tone matching technical support role and adapted questions to beginner level","Turn 2: offered an easy next step and gave clear, simple instructions without repeating router restart","Turn 3: maintained helpful, patient tone; provided simple explanations and continued to avoid repeated steps"],"misses":[],"reasoning":"The assistant maintained the technical support persona consistently, adjusted complexity for the user's beginner level, and never repeated previously attempted steps across all turns.","details":{"scores_per_turn":[1,1,1],"consistent_turns":3,"total_turns":3},"token_usage":{"input":1039,"output":170},"duration_ms":1399,"started_at":"2026-03-09T10:16:33.298Z","ended_at":"2026-03-09T10:16:34.697Z"}],"execution_status":"ok"}
diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml
new file mode 100644
index 00000000..514df1be
--- /dev/null
+++ b/examples/features/multi-turn-conversation/evals/dataset.eval.baseline.yaml
@@ -0,0 +1,260 @@
+---
+timestamp: 2026-03-09T09:22:00.279Z
+test_id: support-troubleshooting-flow
+dataset: dataset.eval
+score: 1
+hits:
+  - "Turn 3: skipped router restart, referencing user's prior attempt"
+  - "Turn 5: provided instructions suited to technical beginner, no repeat steps"
+  - "Turn 6: referenced prior cable fix and 'simple' instructions"
+  - "Turn 6: continued to avoid repeated router restart, encouraged easy actions"
+  - "Turn 3: Directly asked about router light status to diagnose further"
+  - "Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step"
+  - "Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner"
+  - "Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple"
+  - "Turn 1: empathetic, clear, and skips router restart as instructed"
+  - "Turn 2: explains orange light in simple terms and gives an easy step"
+  - "Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions"
+misses: []
+target: default
+reasoning: "context_retention: All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps. | conversation_relevancy: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced. | role_adherence: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps."
+input:
+  - role: system
+    content: |-
+      You are a technical support agent for NetConnect ISP. Adapt your
+      explanations to the customer's technical level. Never suggest steps
+      the customer has already tried.
+  - role: user
+    content: |-
+      My WiFi keeps dropping every 10 minutes. I already restarted my router
+      twice today. I'm not very tech-savvy so please keep it simple.
+  - role: assistant
+    content: |-
+      I'm sorry you're dealing with that — WiFi dropping out is really
+      frustrating! Since you've already restarted the router, let's skip that
+      step. Could you tell me: does the light on the front of your router stay
+      solid green, or does it blink or turn a different color when the WiFi drops?
+  - role: user
+    content: It blinks orange for a few seconds then goes back to green.
+  - role: assistant
+    content: |-
+      That orange blinking usually means the router is briefly losing its
+      connection to our network. Let's try something simple: can you find the
+      small cable plugged into the back of your router labeled "WAN" or
+      "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly.
+      Sometimes it just needs a better connection.
+  - role: user
+    content: OK I did that. What if it happens again?
+scores:
+  - name: context_retention
+    type: llm-judge
+    score: 1
+    weight: 1
+    verdict: pass
+    hits:
+      - "Turn 3: skipped router restart, referencing user's prior attempt"
+      - "Turn 5: provided instructions suited to technical beginner, no repeat steps"
+      - "Turn 6: referenced prior cable fix and 'simple' instructions"
+      - "Turn 6: continued to avoid repeated router restart, encouraged easy actions"
+    misses: []
+    reasoning: "All assistant turns demonstrate clear context retention: avoids repeating router restart, adapts explanations for beginner, and builds upon prior troubleshooting steps."
+    details:
+      scores_per_turn:
+        - 1
+        - 1
+        - 1
+        - 1
+      relevant_turns: 4
+      total_turns: 4
+    token_usage:
+      input: 964
+      output: 174
+    duration_ms: 1563
+    started_at: 2026-03-09T09:21:52.423Z
+    ended_at: 2026-03-09T09:21:53.986Z
+  - name: conversation_relevancy
+    type: llm-judge
+    score: 1
+    weight: 2
+    verdict: pass
+    hits:
+      - "Turn 3: Directly asked about router light status to diagnose further"
+      - "Turn 5: Suggested checking and reseating WAN/Internet cable, avoiding repetition of restart step"
+      - "Turn 6: Provided clear instructions for next steps and environment checks suitable for a beginner"
+      - "Turn 6: Offered to perform remote tests/escalate if issue persists, keeping steps simple"
+    misses: []
+    reasoning: All assistant turns stayed highly relevant to the user's WiFi issue, tracked previous steps, and matched instructions to the user's technical comfort level. No unnecessary repetition or off-topic details were introduced.
+    details:
+      scores_per_turn:
+        - 1
+        - 1
+        - 1
+        - 1
+      on_topic_turns: 4
+      total_turns: 4
+    token_usage:
+      input: 985
+      output: 201
+    duration_ms: 2679
+    started_at: 2026-03-09T09:21:53.986Z
+    ended_at: 2026-03-09T09:21:56.665Z
+  - name: role_adherence
+    type: llm-judge
+    score: 1
+    weight: 1
+    verdict: pass
+    hits:
+      - "Turn 1: empathetic, clear, and skips router restart as instructed"
+      - "Turn 2: explains orange light in simple terms and gives an easy step"
+      - "Turn 3 (final): maintains reassuring, beginner-friendly tone and avoids repeating past instructions"
+    misses: []
+    reasoning: All assistant turns are consistent with the technical support role defined in the system prompt, adapt to the user's technical level, and do not repeat previously attempted steps.
+    details:
+      scores_per_turn:
+        - 1
+        - 1
+        - 1
+      consistent_turns: 3
+      total_turns: 3
+    token_usage:
+      input: 999
+      output: 164
+    duration_ms: 3613
+    started_at: 2026-03-09T09:21:56.665Z
+    ended_at: 2026-03-09T09:22:00.278Z
+execution_status: ok
+
+---
+timestamp: 2026-03-09T09:22:00.670Z
+test_id: support-context-retention
+dataset: dataset.eval
+score: 1
+hits:
+  - "Turn 3: addressed Sarah by name, referenced order #98765"
+  - "Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765"
+  - "Turn 5: proposed solutions relevant to urgency and birthday gift context"
+  - "Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message"
+  - "Turn 3: directly addressed delivery delay and offered to check expedited options"
+  - "Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name"
+  - "Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message"
+  - "Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number"
+  - "Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions"
+  - "Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message"
+  - Output contains "#98765"
+misses: []
+target: default
+reasoning: "context_retention: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout. | conversation_relevancy: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information. | role_adherence: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role. | contains-#98765: Output contains \"#98765\""
+input:
+  - role: system
+    content: |-
+      You are a customer support agent for ShopFast. Be professional, empathetic,
+      and solution-oriented. Always address customers by name once introduced.
+  - role: user
+    content: |-
+      Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.
+      My order number is #98765.
+  - role: assistant
+    content: |-
+      Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up
+      order #98765 for you right away. I can see it was shipped on March 3rd via
+      standard delivery. It appears there's been a delay at the regional sorting
+      facility. Would you like me to check on expedited options?
+  - role: user
+    content: Yes please. I really need it by Friday — it's a birthday gift for my daughter.
+  - role: assistant
+    content: |-
+      I completely understand the urgency, Sarah. A birthday gift definitely can't
+      be late! I've flagged order #98765 for priority handling. I can offer you two
+      options: (1) upgrade to express shipping at no extra cost, estimated delivery
+      Thursday, or (2) reroute to our nearest pickup location where it could be
+      available as early as Wednesday.
+  - role: user
+    content: The express shipping sounds great. Also, can you add gift wrapping?
+scores:
+  - name: context_retention
+    type: llm-judge
+    score: 1
+    weight: 1
+    verdict: pass
+    hits:
+      - "Turn 3: addressed Sarah by name, referenced order #98765"
+      - "Turn 5: acknowledged delivery deadline (Friday), addressed customer by name and referenced order #98765"
+      - "Turn 5: proposed solutions relevant to urgency and birthday gift context"
+      - "Final Turn: addressed Sarah by name, referenced order #98765, confirmed express shipping, added gift wrapping and birthday message"
+    misses: []
+    reasoning: All assistant turns consistently referenced customer name, order number, delivery deadline, and relevant preferences, maintaining a professional and empathetic persona throughout.
+    details:
+      scores_per_turn:
+        - 1
+        - 1
+        - 1
+        - 1
+      relevant_turns: 4
+      total_turns: 4
+    token_usage:
+      input: 873
+      output: 193
+    duration_ms: 1625
+    started_at: 2026-03-09T09:21:51.635Z
+    ended_at: 2026-03-09T09:21:53.260Z
+  - name: conversation_relevancy
+    type: llm-judge
+    score: 1
+    weight: 2
+    verdict: pass
+    hits:
+      - "Turn 3: directly addressed delivery delay and offered to check expedited options"
+      - "Turn 5: provided two targeted solutions for faster delivery, referenced deadline, and used Sarah's name"
+      - "Turn 6: confirmed express shipping upgrade, added gift wrapping, and prompted for a custom message"
+    misses: []
+    reasoning: All assistant responses were fully relevant, professional, referenced order details, respected the Friday delivery need, and addressed all user requests directly without off-topic information.
+    details:
+      scores_per_turn:
+        - 1
+        - 1
+        - 1
+      on_topic_turns: 3
+      total_turns: 3
+    token_usage:
+      input: 894
+      output: 170
+    duration_ms: 3433
+    started_at: 2026-03-09T09:21:53.260Z
+    ended_at: 2026-03-09T09:21:56.693Z
+  - name: role_adherence
+    type: llm-judge
+    score: 1
+    weight: 1
+    verdict: pass
+    hits:
+      - "Turn 1: professional, empathetic greeting, addresses Sarah by name and references order number"
+      - "Turn 2: maintains tone, acknowledges urgency, uses customer's name, offers clear solutions"
+      - "Turn 3: confirms action, stays solution-oriented and personable, customizes support with offer for message"
+    misses: []
+    reasoning: The assistant is consistently professional, empathetic, and solution-oriented across all turns. It successfully maintains context including Sarah's name, order number, and Friday deadline, never breaking tone or role.
+    details:
+      scores_per_turn:
+        - 1
+        - 1
+        - 1
+      consistent_turns: 3
+      total_turns: 3
+    token_usage:
+      input: 908
+      output: 179
+    duration_ms: 3976
+    started_at: 2026-03-09T09:21:56.693Z
+    ended_at: 2026-03-09T09:22:00.669Z
+  - name: contains-#98765
+    type: contains
+    score: 1
+    weight: 1
+    verdict: pass
+    hits:
+      - Output contains "#98765"
+    misses: []
+    reasoning: Output contains "#98765"
+    duration_ms: 1
+    started_at: 2026-03-09T09:22:00.669Z
+    ended_at: 2026-03-09T09:22:00.670Z
+execution_status: ok
diff --git a/examples/features/multi-turn-conversation/evals/dataset.eval.yaml b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml
new file mode 100644
index 00000000..8635c656
--- /dev/null
+++ b/examples/features/multi-turn-conversation/evals/dataset.eval.yaml
@@ -0,0 +1,137 @@
+# Multi-Turn Conversation Evaluation Example
+# Demonstrates evaluating conversation-level qualities using composable llm-judge prompts
+# with structured per-turn score breakdowns via the `details` field.
+
+description: Multi-turn conversation evaluation with per-turn score breakdown
+
+execution:
+  target: default
+
+tests:
+  # ==========================================
+  # Test 1: Customer support — order issue with context retention
+  # The agent must remember the customer's name, order number, and delivery
+  # deadline across 4+ turns while maintaining a professional persona.
+  # ==========================================
+  - id: support-context-retention
+    criteria: |-
+      Agent maintains context across all turns: remembers customer name (Sarah),
+      order number (#98765), and Friday delivery deadline. Provides relevant,
+      helpful responses. Maintains professional support persona throughout.
+
+    input:
+      - role: system
+        content: |-
+          You are a customer support agent for ShopFast. Be professional, empathetic,
+          and solution-oriented. Always address customers by name once introduced.
+      - role: user
+        content: |-
+          Hi, my name is Sarah. I placed an order last week and it still hasn't arrived.
+          My order number is #98765.
+      - role: assistant
+        content: |-
+          Hello Sarah! I'm sorry to hear your order hasn't arrived yet. Let me look up
+          order #98765 for you right away. I can see it was shipped on March 3rd via
+          standard delivery. It appears there's been a delay at the regional sorting
+          facility. Would you like me to check on expedited options?
+      - role: user
+        content: |-
+          Yes please. I really need it by Friday — it's a birthday gift for my daughter.
+      - role: assistant
+        content: |-
+          I completely understand the urgency, Sarah. A birthday gift definitely can't
+          be late! I've flagged order #98765 for priority handling. I can offer you two
+          options: (1) upgrade to express shipping at no extra cost, estimated delivery
+          Thursday, or (2) reroute to our nearest pickup location where it could be
+          available as early as Wednesday.
+      - role: user
+        content: |-
+          The express shipping sounds great. Also, can you add gift wrapping?
+
+    expected_output:
+      - role: assistant
+        content: |-
+          Absolutely, Sarah! I've upgraded order #98765 to express shipping — it should
+          arrive by Thursday, well before Friday. I've also added complimentary gift
+          wrapping since this is a special occasion. You'll receive a tracking update
+          shortly. Is there anything else I can help with for your daughter's birthday?
+
+    assert:
+      # Conversation-level llm-judge evaluators with per-turn details
+      - name: context_retention
+        type: llm-judge
+        prompt: ../judges/context-retention.md
+        required: true
+      - name: conversation_relevancy
+        type: llm-judge
+        prompt: ../judges/conversation-relevancy.md
+        weight: 2
+      - name: role_adherence
+        type: llm-judge
+        prompt: ../judges/role-adherence.md
+      # Deterministic assertion — verifies order number is referenced
+      - type: contains
+        value: "#98765"
+
+  # ==========================================
+  # Test 2: Technical support — multi-step troubleshooting
+  # The agent must track the problem context, steps already tried,
+  # and the user's technical level across turns.
+  # ==========================================
+  - id: support-troubleshooting-flow
+    criteria: |-
+      Agent tracks problem context (WiFi connectivity), steps already attempted
+      (router restart), and user's stated technical comfort level (beginner)
+      across turns. Does not repeat already-tried solutions. Adjusts explanation
+      complexity to match user's level.
+
+    input:
+      - role: system
+        content: |-
+          You are a technical support agent for NetConnect ISP. Adapt your
+          explanations to the customer's technical level. Never suggest steps
+          the customer has already tried.
+      - role: user
+        content: |-
+          My WiFi keeps dropping every 10 minutes. I already restarted my router
+          twice today. I'm not very tech-savvy so please keep it simple.
+      - role: assistant
+        content: |-
+          I'm sorry you're dealing with that — WiFi dropping out is really
+          frustrating! Since you've already restarted the router, let's skip that
+          step. Could you tell me: does the light on the front of your router stay
+          solid green, or does it blink or turn a different color when the WiFi drops?
+      - role: user
+        content: |-
+          It blinks orange for a few seconds then goes back to green.
+      - role: assistant
+        content: |-
+          That orange blinking usually means the router is briefly losing its
+          connection to our network. Let's try something simple: can you find the
+          small cable plugged into the back of your router labeled "WAN" or
+          "Internet"? Just unplug it, wait 10 seconds, and plug it back in firmly.
+          Sometimes it just needs a better connection.
+      - role: user
+        content: |-
+          OK I did that. What if it happens again?
+
+    expected_output:
+      - role: assistant
+        content: |-
+          Great, let's see if that helps! If the WiFi drops again in the next
+          30 minutes, it might be an issue on our end. In that case, I'll schedule
+          a technician visit at a time that works for you — no charge. Just let me
+          know if it happens again.
+
+    assert:
+      - name: context_retention
+        type: llm-judge
+        prompt: ../judges/context-retention.md
+        required: true
+      - name: conversation_relevancy
+        type: llm-judge
+        prompt: ../judges/conversation-relevancy.md
+        weight: 2
+      - name: role_adherence
+        type: llm-judge
+        prompt: ../judges/role-adherence.md
diff --git a/examples/features/multi-turn-conversation/judges/context-retention.md b/examples/features/multi-turn-conversation/judges/context-retention.md
new file mode 100644
index 00000000..f802ca24
--- /dev/null
+++ b/examples/features/multi-turn-conversation/judges/context-retention.md
@@ -0,0 +1,38 @@
+You are evaluating whether an AI assistant retains context from earlier turns
+in a multi-turn conversation.
+
+Below you will see the full conversation with role annotations (system, user,
+assistant). The conversation history contains prior turns, and the agent
+response is the final assistant turn. Evaluate ALL assistant turns across the
+entire conversation. Number assistant turns sequentially starting from Turn 1.
+
+For each assistant turn, check whether the assistant correctly references or
+builds on information introduced in previous turns (e.g., names, numbers,
+constraints, preferences).
+
+Score each assistant turn:
+- 1.0 if the turn demonstrates awareness of relevant earlier context
+- 0.5 if the turn partially retains context (e.g., remembers some details but
+  forgets others)
+- 0.0 if the turn ignores or contradicts earlier context
+
+In your `hits`, reference specific turns where context was retained
+(e.g., "Turn 2: correctly recalled customer name").
+In your `misses`, reference specific turns where context was lost
+(e.g., "Turn 4: forgot delivery deadline from turn 1").
+
+In your `details`, return:
+- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
+- `relevant_turns`: count of turns that demonstrated context retention
+- `total_turns`: total number of assistant turns evaluated
+
+Your overall `score` should be the average of per-turn scores.
+
+[[ ## criteria ## ]]
+{{ criteria }}
+
+[[ ## conversation (all turns with roles) ## ]]
+{{ input }}
+
+[[ ## agent response (final turn) ## ]]
+{{ answer }}
diff --git a/examples/features/multi-turn-conversation/judges/conversation-relevancy.md b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md
new file mode 100644
index 00000000..eb527afe
--- /dev/null
+++ b/examples/features/multi-turn-conversation/judges/conversation-relevancy.md
@@ -0,0 +1,39 @@
+You are evaluating whether each assistant response in a multi-turn conversation
+is relevant to the user's current request AND the broader conversation context.
+
+Below you will see the full conversation with role annotations (system, user,
+assistant). The conversation history contains prior turns, and the agent
+response is the final assistant turn. Evaluate ALL assistant turns across the
+entire conversation. Number assistant turns sequentially starting from Turn 1.
+
+For each assistant turn, assess:
+- Does the response directly address what the user asked?
+- Is the response appropriate given the full conversation history?
+- Does the assistant avoid tangential or off-topic information?
+
+Score each assistant turn:
+- 1.0 if the response is fully relevant to the user's request and context
+- 0.5 if the response is partially relevant but includes unnecessary
+  information or misses part of the request
+- 0.0 if the response is off-topic or fails to address the user's request
+
+In your `hits`, note turns where responses were well-targeted
+(e.g., "Turn 3: directly addressed the user's shipping question").
+In your `misses`, note turns where relevance was lacking
+(e.g., "Turn 2: provided unnecessary technical details").
+
+In your `details`, return:
+- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
+- `on_topic_turns`: count of turns scored 1.0
+- `total_turns`: total number of assistant turns evaluated
+
+Your overall `score` should be the average of per-turn scores.
+
+[[ ## criteria ## ]]
+{{ criteria }}
+
+[[ ## conversation (all turns with roles) ## ]]
+{{ input }}
+
+[[ ## agent response (final turn) ## ]]
+{{ answer }}
diff --git a/examples/features/multi-turn-conversation/judges/role-adherence.md b/examples/features/multi-turn-conversation/judges/role-adherence.md
new file mode 100644
index 00000000..fa7dc838
--- /dev/null
+++ b/examples/features/multi-turn-conversation/judges/role-adherence.md
@@ -0,0 +1,41 @@
+You are evaluating whether an AI assistant maintains its assigned persona and
+role consistently across a multi-turn conversation.
+
+Below you will see the full conversation with role annotations (system, user,
+assistant). The conversation history contains prior turns, and the agent
+response is the final assistant turn. Evaluate ALL assistant turns across the
+entire conversation. Number assistant turns sequentially starting from Turn 1.
+
+Consider the system prompt (if present) as the role definition. For each
+assistant turn, assess:
+- Does the assistant stay in character?
+- Is the tone consistent with the assigned role?
+- Does the assistant avoid breaking character or introducing behavior
+  inconsistent with its role?
+
+Score each assistant turn:
+- 1.0 if the turn fully adheres to the assigned role and tone
+- 0.5 if the turn partially adheres but shows inconsistency (e.g., shifts
+  from professional to casual)
+- 0.0 if the turn breaks character or contradicts the assigned role
+
+In your `hits`, note turns where role was well-maintained
+(e.g., "Turn 1: professional and empathetic tone matching support role").
+In your `misses`, note turns where role slipped
+(e.g., "Turn 3: used overly casual language inconsistent with role").
+
+In your `details`, return:
+- `scores_per_turn`: array of scores (0.0-1.0) for each assistant turn
+- `consistent_turns`: count of turns scored 1.0
+- `total_turns`: total number of assistant turns evaluated
+
+Your overall `score` should be the average of per-turn scores.
+
+[[ ## criteria ## ]]
+{{ criteria }}
+
+[[ ## conversation (all turns with roles) ## ]]
+{{ input }}
+
+[[ ## agent response (final turn) ## ]]
+{{ answer }}
diff --git a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts
index 984fb8cf..14af7f25 100644
--- a/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts
+++ b/packages/core/src/evaluation/evaluators/llm-judge-prompt.ts
@@ -68,7 +68,7 @@ function assembleFreeform(
       : evalCase.question;
 
   const variables = {
-    [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
+    [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
     [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
     [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
     [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts
index 46125f3e..4bd97b6a 100644
--- a/packages/core/src/evaluation/evaluators/llm-judge.ts
+++ b/packages/core/src/evaluation/evaluators/llm-judge.ts
@@ -6,6 +6,7 @@ import { extractLastAssistantContent } from '../providers/types.js';
 import { TEMPLATE_VARIABLES } from '../template-variables.js';
 import type { TokenUsage } from '../trace.js';
 import type { JsonObject, RubricItem } from '../types.js';
+import { isJsonObject } from '../types.js';
 import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js';
 import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
 
@@ -45,6 +46,10 @@ const freeformEvaluationSchema = z.object({
   hits: z.array(z.string()).describe('Brief specific achievements').optional(),
   misses: z.array(z.string()).describe('Brief failures or omissions').optional(),
   reasoning: z.string().describe('Concise explanation (1-2 sentences)').optional(),
+  details: z
+    .record(z.unknown())
+    .describe('Optional structured metadata for domain-specific metrics')
+    .optional(),
 });
 
 const rubricCheckResultSchema = z.object({
@@ -115,7 +120,11 @@ export class LlmJudgeEvaluator implements Evaluator {
 
     // Prepare template variables for substitution
     const variables = {
-      [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input_segments, null, 2),
+      [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(
+        resolveInputWithFileContent(context.evalCase.input, context.evalCase.input_segments),
+        null,
+        2,
+      ),
       [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
         context.evalCase.expected_output,
         null,
@@ -174,6 +183,7 @@ export class LlmJudgeEvaluator implements Evaluator {
         expectedAspectCount,
         reasoning,
         evaluatorRawRequest,
+        details: data.details as JsonObject | undefined,
         tokenUsage,
       };
     } catch (e: unknown) {
@@ -486,7 +496,8 @@ export function buildOutputSchema(): string {
     '  "score": <number between 0.0 and 1.0>,',
     '  "hits": [<array of strings, max 4 items, brief specific achievements>],',
     '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
-    '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
+    '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>",',
+    '  "details": {<optional object with domain-specific structured metrics>}',
     '}',
   ].join('\n');
 }
@@ -656,3 +667,55 @@ function calculateScoreRangeResult(
     },
   };
 }
+
+/**
+ * Resolve file references in input messages using resolved content from input_segments.
+ *
+ * `input` (TestMessage[]) contains raw file references with paths but no content.
+ * `input_segments` (JsonObject[]) contains resolved file segments with actual file text.
+ * This function produces a copy of `input` where file segments include the resolved `text`.
+ */
+function resolveInputWithFileContent(
+  input: readonly JsonObject[],
+  inputSegments: readonly JsonObject[],
+): readonly JsonObject[] {
+  // Build a map of file path -> resolved text from input_segments
+  const fileContentsByPath = new Map<string, string>();
+  for (const segment of inputSegments) {
+    if (
+      segment.type === 'file' &&
+      typeof segment.path === 'string' &&
+      typeof segment.text === 'string'
+    ) {
+      fileContentsByPath.set(segment.path, segment.text);
+    }
+  }
+
+  if (fileContentsByPath.size === 0) {
+    return input;
+  }
+
+  return input.map((message) => {
+    const content = message.content;
+    if (!Array.isArray(content)) {
+      return message;
+    }
+
+    let modified = false;
+    const resolvedContent = (content as readonly JsonObject[]).map((segment) => {
+      if (!isJsonObject(segment)) {
+        return segment;
+      }
+      if (segment.type === 'file' && typeof segment.value === 'string') {
+        const fileText = fileContentsByPath.get(segment.value);
+        if (fileText !== undefined) {
+          modified = true;
+          return { ...segment, text: fileText };
+        }
+      }
+      return segment;
+    });
+
+    return modified ? { ...message, content: resolvedContent } : message;
+  });
+}
diff --git a/packages/core/test/evaluation/evaluators_variables.test.ts b/packages/core/test/evaluation/evaluators_variables.test.ts
index c35643e1..fb21b0f2 100644
--- a/packages/core/test/evaluation/evaluators_variables.test.ts
+++ b/packages/core/test/evaluation/evaluators_variables.test.ts
@@ -94,9 +94,10 @@ File Changes: {{file_changes}}
     expect(request?.question).toContain('Reference: Reference Answer Text');
     expect(request?.question).toContain('Candidate: Candidate Answer Text');
 
-    // Verify input JSON stringification
+    // Verify input JSON stringification (includes role annotations)
     expect(request?.question).toContain('Input Messages: [');
-    expect(request?.question).toContain('"value": "Input Message"');
+    expect(request?.question).toContain('"role": "user"');
+    expect(request?.question).toContain('"content": "User Input Message"');
 
     // Verify expected_output JSON stringification
     expect(request?.question).toContain('Expected Messages: [');
@@ -190,13 +191,70 @@ Expected Messages: {{ expected_output }}
     expect(request?.question).toContain('Reference: Reference Answer Text');
     expect(request?.question).toContain('Candidate: Candidate Answer Text');
 
-    // Verify JSON stringified variables were also substituted
+    // Verify JSON stringified variables were also substituted (includes role annotations)
     expect(request?.question).toContain('Input Messages: [');
-    expect(request?.question).toContain('"value": "Input Message"');
+    expect(request?.question).toContain('"content": "User Input Message"');
     expect(request?.question).toContain('Expected Messages: [');
     expect(request?.question).toContain('"value": "Expected Output Message"');
 
     // Verify no unreplaced template markers remain
     expect(request?.question).not.toMatch(/\{\{\s*\w+\s*\}\}/);
   });
+
+  it('resolves file references in {{ input }} using input_segments content', async () => {
+    const testCaseWithFiles: EvalTest = {
+      ...baseTestCase,
+      input: [
+        {
+          role: 'user',
+          content: [
+            { type: 'file', value: 'src/app.ts' },
+            { type: 'text', value: 'Review this code' },
+          ],
+        },
+      ],
+      input_segments: [
+        { type: 'file', path: 'src/app.ts', text: 'console.log("hello world");' },
+        { type: 'text', value: 'Review this code' },
+      ],
+    };
+
+    const customPrompt = 'Input: {{ input }}';
+
+    const judgeProvider = new CapturingProvider({
+      text: JSON.stringify({
+        score: 0.9,
+        hits: ['Good'],
+        misses: [],
+        reasoning: 'OK',
+      }),
+    });
+
+    const evaluator = new LlmJudgeEvaluator({
+      resolveJudgeProvider: async () => judgeProvider,
+      evaluatorTemplate: customPrompt,
+    });
+
+    await evaluator.evaluate({
+      evalCase: { ...testCaseWithFiles, evaluator: 'llm-judge' },
+      candidate: 'Looks good',
+      target: baseTarget,
+      provider: judgeProvider,
+      attempt: 0,
+      promptInputs: { question: 'Review this code', guidelines: '' },
+      now: new Date(),
+    });
+
+    const request = judgeProvider.lastRequest;
+    expect(request).toBeDefined();
+
+    // File content from input_segments should be resolved into the {{ input }} variable
+    // Content is JSON-stringified so quotes are escaped
+    expect(request?.question).toContain('console.log');
+    expect(request?.question).toContain('hello world');
+    // The resolved segment should have a "text" field with the file content
+    expect(request?.question).toContain('"text"');
+    // Original file path should still be present
+    expect(request?.question).toContain('src/app.ts');
+  });
 });