From 25f6b70792eeb6c23dee473fb5050df2f946a61a Mon Sep 17 00:00:00 2001
From: David Gageot <david.gageot@docker.com>
Date: Fri, 13 Mar 2026 18:33:35 +0100
Subject: [PATCH 1/5] Validate llm judge

Signed-off-by: David Gageot <david.gageot@docker.com>
---
 pkg/evaluation/eval.go       | 30 ++++++++++++++++---
 pkg/evaluation/eval_test.go  | 57 ++++++++++++++++++++++++++++++++++++
 pkg/evaluation/judge.go      | 46 ++++++++++++++++++++++++-----
 pkg/evaluation/judge_test.go | 13 ++++----
 4 files changed, 128 insertions(+), 18 deletions(-)

diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
index aaf73ef38..ad1b60a48 100644
--- a/pkg/evaluation/eval.go
+++ b/pkg/evaluation/eval.go
@@ -117,6 +117,20 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
 		return nil, fmt.Errorf("loading evaluations: %w", err)
 	}
 
+	// Check whether any evaluations require relevance checking.
+	// If so, the judge must be configured and working; validate eagerly
+	// to fail fast on configuration issues (bad API key, wrong model, etc.)
+	// instead of silently producing zero-relevance results.
+	if needsJudge(evals) {
+		if r.judge == nil {
+			return nil, errors.New("some evaluations have relevance criteria but no judge model is configured (use --judge-model)")
+		}
+		fmt.Fprintln(out, "Validating judge model...")
+		if err := r.judge.Validate(ctx); err != nil {
+			return nil, fmt.Errorf("%w", err)
+		}
+	}
+
 	// Pre-build all unique Docker images in parallel before running evaluations.
 	// This avoids serialized builds when multiple workers need the same image.
 	if err := r.preBuildImages(ctx, out, evals); err != nil {
@@ -341,12 +355,12 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 	if r.judge != nil && len(evals.Relevance) > 0 {
 		// Use transcript for relevance checking to preserve temporal ordering
 		transcript := buildTranscript(events)
-		passed, failed, errs := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
+		passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
+		if err != nil {
+			return result, fmt.Errorf("relevance check failed: %w", err)
+		}
 		result.RelevancePassed = float64(passed)
 		result.FailedRelevance = failed
-		for _, e := range errs {
-			slog.Warn("Relevance check error", "title", evalSess.Title, "error", e)
-		}
 	}
 
 	slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))
@@ -590,6 +604,14 @@ func matchesAnyPattern(name string, patterns []string) bool {
 	})
 }
 
+// needsJudge returns true if any evaluation session has relevance criteria,
+// meaning a judge model is required to evaluate them.
+func needsJudge(evals []InputSession) bool {
+	return slices.ContainsFunc(evals, func(s InputSession) bool {
+		return s.Evals != nil && len(s.Evals.Relevance) > 0
+	})
+}
+
 // createJudgeModel creates a provider.Provider from a model string (format: provider/model).
 // Returns nil if judgeModel is empty.
 func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.RuntimeConfig) (provider.Provider, error) {
diff --git a/pkg/evaluation/eval_test.go b/pkg/evaluation/eval_test.go
index 5fbf86607..6cb593508 100644
--- a/pkg/evaluation/eval_test.go
+++ b/pkg/evaluation/eval_test.go
@@ -12,6 +12,8 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+
+	"github.com/docker/docker-agent/pkg/session"
 )
 
 func TestToolCallF1Score(t *testing.T) {
@@ -1009,3 +1011,58 @@ func TestMatchesAnyPattern(t *testing.T) {
 		})
 	}
 }
+
+func TestNeedsJudge(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name  string
+		evals []InputSession
+		want  bool
+	}{
+		{
+			name:  "no evals",
+			evals: nil,
+			want:  false,
+		},
+		{
+			name: "evals without relevance criteria",
+			evals: []InputSession{
+				{Session: &session.Session{Evals: &session.EvalCriteria{Size: "M"}}},
+				{Session: &session.Session{Evals: &session.EvalCriteria{}}},
+			},
+			want: false,
+		},
+		{
+			name: "evals with nil Evals field",
+			evals: []InputSession{
+				{Session: &session.Session{}},
+			},
+			want: false,
+		},
+		{
+			name: "some evals with relevance criteria",
+			evals: []InputSession{
+				{Session: &session.Session{Evals: &session.EvalCriteria{}}},
+				{Session: &session.Session{Evals: &session.EvalCriteria{Relevance: []string{"criterion1"}}}},
+			},
+			want: true,
+		},
+		{
+			name: "all evals with relevance criteria",
+			evals: []InputSession{
+				{Session: &session.Session{Evals: &session.EvalCriteria{Relevance: []string{"a", "b"}}}},
+				{Session: &session.Session{Evals: &session.EvalCriteria{Relevance: []string{"c"}}}},
+			},
+			want: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			got := needsJudge(tt.evals)
+			assert.Equal(t, tt.want, got)
+		})
+	}
+}
diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go
index c530022c2..9aae65006 100644
--- a/pkg/evaluation/judge.go
+++ b/pkg/evaluation/judge.go
@@ -82,6 +82,29 @@ func NewJudge(model provider.Provider, runConfig *config.RuntimeConfig, concurre
 	}
 }
 
+// Validate performs an end-to-end check of the judge model by sending a
+// trivial relevance prompt and verifying the response is valid structured
+// JSON. This catches configuration errors (bad API key, unsupported model,
+// missing structured-output support, etc.) before running any evaluations,
+// allowing the framework to fail fast.
+func (j *Judge) Validate(ctx context.Context) error {
+	const (
+		testResponse  = "The sky is blue."
+		testCriterion = "The response mentions a color."
+	)
+
+	passed, _, err := j.checkSingle(ctx, testResponse, testCriterion)
+	if err != nil {
+		return fmt.Errorf("judge model validation failed: %w", err)
+	}
+
+	if !passed {
+		return errors.New("judge model validation failed: expected the test criterion to pass but the judge returned 'fail'")
+	}
+
+	return nil
+}
+
 // RelevanceResult contains the result of a single relevance check.
 type RelevanceResult struct {
 	Criterion string `json:"criterion"`
@@ -89,8 +112,11 @@ type RelevanceResult struct {
 }
 
 // CheckRelevance runs all relevance checks concurrently with the configured concurrency.
-// It returns the number of passed checks, a slice of failed results with reasons, and any errors encountered.
-func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, errs []string) {
+// It returns the number of passed checks, a slice of failed results with reasons, and an error
+// if any check encountered an error (e.g. judge model misconfiguration). Errors cause a hard
+// failure so that configuration issues are surfaced immediately rather than silently producing
+// zero-relevance results.
+func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, err error) {
 	if len(criteria) == 0 {
 		return 0, nil, nil
 	}
@@ -122,17 +148,19 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
 					results[item.index] = result{err: fmt.Errorf("context cancelled: %w", ctx.Err())}
 					continue
 				}
-				pass, reason, err := j.checkSingle(ctx, response, item.criterion)
-				results[item.index] = result{passed: pass, reason: reason, err: err}
+				pass, reason, checkErr := j.checkSingle(ctx, response, item.criterion)
+				results[item.index] = result{passed: pass, reason: reason, err: checkErr}
 			}
 		})
 	}
 	wg.Wait()
 
-	// Aggregate results
+	// Aggregate results. Any error is fatal — return it immediately so the
+	// caller can fail fast on judge misconfiguration.
+	var errs []error
 	for i, r := range results {
 		if r.err != nil {
-			errs = append(errs, fmt.Sprintf("error checking %q: %v", criteria[i], r.err))
+			errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err))
 			continue
 		}
 		if r.passed {
@@ -145,7 +173,11 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
 		}
 	}
 
-	return passed, failed, errs
+	if len(errs) > 0 {
+		return passed, failed, errors.Join(errs...)
+	}
+
+	return passed, failed, nil
 }
 
 // getOrCreateJudgeWithSchema returns a provider pre-configured with structured output.
diff --git a/pkg/evaluation/judge_test.go b/pkg/evaluation/judge_test.go
index fe9594a67..aba53be18 100644
--- a/pkg/evaluation/judge_test.go
+++ b/pkg/evaluation/judge_test.go
@@ -5,6 +5,7 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )
 
 func TestNewJudge(t *testing.T) {
@@ -46,11 +47,11 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
 	t.Parallel()
 
 	judge := NewJudge(nil, nil, 1)
-	passed, failed, errs := judge.CheckRelevance(t.Context(), "some response", nil)
+	passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil)
 
 	assert.Equal(t, 0, passed)
 	assert.Empty(t, failed)
-	assert.Empty(t, errs)
+	assert.NoError(t, err)
 }
 
 func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
@@ -62,13 +63,11 @@ func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
 	cancel() // Cancel immediately
 
 	criteria := []string{"criterion1", "criterion2", "criterion3"}
-	passed, failed, errs := judge.CheckRelevance(ctx, "some response", criteria)
+	passed, failed, err := judge.CheckRelevance(ctx, "some response", criteria)
 
 	// All should have errors due to context cancellation
 	assert.Equal(t, 0, passed)
 	assert.Empty(t, failed)
-	assert.Len(t, errs, 3)
-	for _, err := range errs {
-		assert.Contains(t, err, "context cancelled")
-	}
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "context cancelled")
 }

From 37ec4d169d3221613615daa9314112355421c1ae Mon Sep 17 00:00:00 2001
From: David Gageot <david.gageot@docker.com>
Date: Fri, 13 Mar 2026 20:18:51 +0100
Subject: [PATCH 2/5] No thinking for llm as a judge

Signed-off-by: David Gageot <david.gageot@docker.com>
---
 pkg/evaluation/eval.go  | 4 +++-
 pkg/evaluation/judge.go | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
index ad1b60a48..4a174bdf2 100644
--- a/pkg/evaluation/eval.go
+++ b/pkg/evaluation/eval.go
@@ -624,7 +624,9 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.
 		return nil, fmt.Errorf("invalid judge model format %q: expected 'provider/model'", judgeModel)
 	}
 
-	var opts []options.Opt
+	opts := []options.Opt{
+		options.WithThinking(false),
+	}
 	if runConfig.ModelsGateway != "" {
 		opts = append(opts, options.WithGateway(runConfig.ModelsGateway))
 	}
diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go
index 9aae65006..7fb41953f 100644
--- a/pkg/evaluation/judge.go
+++ b/pkg/evaluation/judge.go
@@ -194,6 +194,7 @@ func (j *Judge) getOrCreateJudgeWithSchema(ctx context.Context) (provider.Provid
 
 	opts := []options.Opt{
 		options.WithStructuredOutput(judgeResponseSchema),
+		options.WithThinking(false),
 	}
 	if j.runConfig.ModelsGateway != "" {
 		opts = append(opts, options.WithGateway(j.runConfig.ModelsGateway))

From f11de8a3221132fbf1b3a33d9a93131cd11f2ea4 Mon Sep 17 00:00:00 2001
From: David Gageot <david.gageot@docker.com>
Date: Fri, 13 Mar 2026 20:23:30 +0100
Subject: [PATCH 3/5] Remove duplication around llm as a judge

Signed-off-by: David Gageot <david.gageot@docker.com>
---
 pkg/evaluation/eval.go       |  3 ++-
 pkg/evaluation/judge.go      | 50 ++----------------------------------
 pkg/evaluation/judge_test.go |  6 ++---
 3 files changed, 7 insertions(+), 52 deletions(-)

diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
index 4a174bdf2..d1202cf37 100644
--- a/pkg/evaluation/eval.go
+++ b/pkg/evaluation/eval.go
@@ -49,7 +49,7 @@ type Runner struct {
 func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judgeModel provider.Provider, cfg Config) *Runner {
 	var judge *Judge
 	if judgeModel != nil {
-		judge = NewJudge(judgeModel, runConfig, cfg.Concurrency)
+		judge = NewJudge(judgeModel, cfg.Concurrency)
 	}
 	return &Runner{
 		Config:      cfg,
@@ -626,6 +626,7 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.
 
 	opts := []options.Opt{
 		options.WithThinking(false),
+		options.WithStructuredOutput(judgeResponseSchema),
 	}
 	if runConfig.ModelsGateway != "" {
 		opts = append(opts, options.WithGateway(runConfig.ModelsGateway))
diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go
index 7fb41953f..6062132ae 100644
--- a/pkg/evaluation/judge.go
+++ b/pkg/evaluation/judge.go
@@ -11,10 +11,8 @@ import (
 	"sync"
 
 	"github.com/docker/docker-agent/pkg/chat"
-	"github.com/docker/docker-agent/pkg/config"
 	"github.com/docker/docker-agent/pkg/config/latest"
 	"github.com/docker/docker-agent/pkg/model/provider"
-	"github.com/docker/docker-agent/pkg/model/provider/options"
 )
 
 // relevancePrompt is the prompt template for the judge model to evaluate responses.
@@ -58,26 +56,17 @@ var judgeResponseSchema = &latest.StructuredOutput{
 // Judge runs LLM-as-a-judge relevance checks concurrently.
 type Judge struct {
 	model       provider.Provider
-	runConfig   *config.RuntimeConfig
 	concurrency int
-
-	// judgeWithSchema is a provider pre-configured with structured output.
-	// Created lazily on first use and reused across all relevance checks.
-	// Protected by judgeWithSchemaMu; only cached on success so that
-	// transient errors (e.g. context cancellation) can be retried.
-	judgeWithSchema   provider.Provider
-	judgeWithSchemaMu sync.Mutex
 }
 
 // NewJudge creates a new Judge that runs relevance checks with the given concurrency.
 // Concurrency defaults to 1 if n < 1.
-func NewJudge(model provider.Provider, runConfig *config.RuntimeConfig, concurrency int) *Judge {
+func NewJudge(model provider.Provider, concurrency int) *Judge {
 	if concurrency < 1 {
 		concurrency = 1
 	}
 	return &Judge{
 		model:       model,
-		runConfig:   runConfig,
 		concurrency: concurrency,
 	}
 }
@@ -180,48 +169,13 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
 	return passed, failed, nil
 }
 
-// getOrCreateJudgeWithSchema returns a provider pre-configured with structured output.
-// The provider is created once and reused across all relevance checks.
-// Unlike sync.Once, transient failures (e.g. context cancellation) are not
-// cached, allowing subsequent calls to retry.
-func (j *Judge) getOrCreateJudgeWithSchema(ctx context.Context) (provider.Provider, error) {
-	j.judgeWithSchemaMu.Lock()
-	defer j.judgeWithSchemaMu.Unlock()
-
-	if j.judgeWithSchema != nil {
-		return j.judgeWithSchema, nil
-	}
-
-	opts := []options.Opt{
-		options.WithStructuredOutput(judgeResponseSchema),
-		options.WithThinking(false),
-	}
-	if j.runConfig.ModelsGateway != "" {
-		opts = append(opts, options.WithGateway(j.runConfig.ModelsGateway))
-	}
-
-	modelCfg := j.model.BaseConfig().ModelConfig
-	p, err := provider.New(ctx, &modelCfg, j.runConfig.EnvProvider(), opts...)
-	if err != nil {
-		return nil, err
-	}
-
-	j.judgeWithSchema = p
-	return j.judgeWithSchema, nil
-}
-
 // checkSingle checks a single relevance criterion against the response.
 // It returns whether the check passed, the reason provided by the judge, and any error.
 func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (passed bool, reason string, err error) {
-	judgeWithSchema, err := j.getOrCreateJudgeWithSchema(ctx)
-	if err != nil {
-		return false, "", fmt.Errorf("creating judge provider with structured output: %w", err)
-	}
-
 	prompt := fmt.Sprintf(relevancePrompt, response, criterion)
 	messages := []chat.Message{{Role: chat.MessageRoleUser, Content: prompt}}
 
-	stream, err := judgeWithSchema.CreateChatCompletionStream(ctx, messages, nil)
+	stream, err := j.model.CreateChatCompletionStream(ctx, messages, nil)
 	if err != nil {
 		return false, "", fmt.Errorf("creating chat completion: %w", err)
 	}
diff --git a/pkg/evaluation/judge_test.go b/pkg/evaluation/judge_test.go
index aba53be18..4b54ca64e 100644
--- a/pkg/evaluation/judge_test.go
+++ b/pkg/evaluation/judge_test.go
@@ -37,7 +37,7 @@ func TestNewJudge(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			t.Parallel()
 
-			judge := NewJudge(nil, nil, tt.concurrency)
+			judge := NewJudge(nil, tt.concurrency)
 			assert.Equal(t, tt.expectedConcurrency, judge.concurrency)
 		})
 	}
@@ -46,7 +46,7 @@ func TestNewJudge(t *testing.T) {
 func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
 	t.Parallel()
 
-	judge := NewJudge(nil, nil, 1)
+	judge := NewJudge(nil, 1)
 	passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil)
 
 	assert.Equal(t, 0, passed)
@@ -57,7 +57,7 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
 func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
 	t.Parallel()
 
-	judge := NewJudge(nil, nil, 2)
+	judge := NewJudge(nil, 2)
 
 	ctx, cancel := context.WithCancel(t.Context())
 	cancel() // Cancel immediately

From 388ae8b9fe90bb996fa2a5faa9e980c8f60fcb95 Mon Sep 17 00:00:00 2001
From: David Gageot <david.gageot@docker.com>
Date: Fri, 13 Mar 2026 20:46:03 +0100
Subject: [PATCH 4/5] Remove handoffs scoring from the evals

Signed-off-by: David Gageot <david.gageot@docker.com>
---
 docs/features/evaluation/index.md |   2 -
 pkg/evaluation/eval.go            |   2 -
 pkg/evaluation/eval_test.go       | 140 +++++++++---------------------
 pkg/evaluation/scoring.go         |  16 ----
 pkg/evaluation/types.go           |  10 ---
 5 files changed, 41 insertions(+), 129 deletions(-)

diff --git a/docs/features/evaluation/index.md b/docs/features/evaluation/index.md
index 48a07d6e5..68acd4ecb 100644
--- a/docs/features/evaluation/index.md
+++ b/docs/features/evaluation/index.md
@@ -130,7 +130,6 @@ docker-agent evaluates agents across four dimensions:
 | **Tool Calls (F1)** | F1 score between the expected tool call sequence (from the recorded session) and the actual tool calls made by the agent. |
 | **Relevance**       | An LLM judge (configurable via `--judge-model`) evaluates whether each relevance statement is satisfied by the response.  |
 | **Size**            | Whether the response length matches the expected size category (S/M/L/XL).                                                |
-| **Handoffs**        | For multi-agent configs, whether task delegation matched the expected agent handoff pattern.                              |
 
 ## Creating Eval Sessions
 
@@ -192,7 +191,6 @@ $ docker agent eval demo.yaml ./evals
 Summary: 2/2 passed
   Sizes:      0/0
   Tool Calls: avg F1 1.00 (2 evals)
-  Handoffs:   2/2
   Relevance:  3/3
 
 Sessions DB: ./evals/results/happy-panda-1234.db
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
index d1202cf37..37d912891 100644
--- a/pkg/evaluation/eval.go
+++ b/pkg/evaluation/eval.go
@@ -350,8 +350,6 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 		result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
 	}
 
-	result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)
-
 	if r.judge != nil && len(evals.Relevance) > 0 {
 		// Use transcript for relevance checking to preserve temporal ordering
 		transcript := buildTranscript(events)
diff --git a/pkg/evaluation/eval_test.go b/pkg/evaluation/eval_test.go
index 6cb593508..ed020d708 100644
--- a/pkg/evaluation/eval_test.go
+++ b/pkg/evaluation/eval_test.go
@@ -126,30 +126,6 @@ func TestGetResponseSize(t *testing.T) {
 	}
 }
 
-func TestCountHandoffs(t *testing.T) {
-	t.Parallel()
-
-	tests := []struct {
-		name      string
-		toolCalls []string
-		want      int
-	}{
-		{"no tool calls", []string{}, 0},
-		{"no handoffs", []string{"search", "read_file"}, 0},
-		{"one handoff", []string{"handoff", "read_file"}, 1},
-		{"one transfer_task", []string{"transfer_task", "read_file"}, 0},
-		{"multiple handoffs", []string{"handoff", "transfer_task", "handoff"}, 2},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			t.Parallel()
-			got := countHandoffs(tt.toolCalls)
-			assert.Equal(t, tt.want, got)
-		})
-	}
-}
-
 func TestParseJudgeResponse(t *testing.T) {
 	t.Parallel()
 
@@ -202,32 +178,26 @@ func TestResultCheckResults(t *testing.T) {
 		},
 		{
 			name:         "all checks pass",
-			result:       Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2},
-			wantSuccess:  []string{"size M", "tool calls", "handoffs", "relevance 2/2"},
+			result:       Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, RelevanceExpected: 2, RelevancePassed: 2},
+			wantSuccess:  []string{"size M", "tool calls", "relevance 2/2"},
 			wantFailures: nil,
 		},
 		{
 			name:         "size mismatch",
-			result:       Result{SizeExpected: "M", Size: "S", HandoffsMatch: true},
-			wantSuccess:  []string{"handoffs"},
+			result:       Result{SizeExpected: "M", Size: "S"},
+			wantSuccess:  nil,
 			wantFailures: []string{"size expected M, got S"},
 		},
 		{
 			name:         "tool calls failed",
-			result:       Result{ToolCallsExpected: 1, ToolCallsScore: 0.5, HandoffsMatch: true},
-			wantSuccess:  []string{"handoffs"},
-			wantFailures: []string{"tool calls score 0.50"},
-		},
-		{
-			name:         "handoffs mismatch",
-			result:       Result{HandoffsMatch: false},
+			result:       Result{ToolCallsExpected: 1, ToolCallsScore: 0.5},
 			wantSuccess:  nil,
-			wantFailures: []string{"handoffs mismatch"},
+			wantFailures: []string{"tool calls score 0.50"},
 		},
 		{
 			name:         "relevance failures listed",
-			result:       Result{HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
-			wantSuccess:  []string{"handoffs"},
+			result:       Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
+			wantSuccess:  nil,
 			wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"},
 		},
 	}
@@ -252,73 +222,61 @@ func TestComputeSummary(t *testing.T) {
 		wantTotalEvals     int
 		wantSizesPassed    int
 		wantSizesTotal     int
-		wantHandoffs       int
-		wantHandoffsTotal  int
 		wantRelevance      float64
 		wantRelevanceTotal float64
 	}{
 		{
-			name:              "no results",
-			results:           []Result{},
-			wantTotalCost:     0,
-			wantTotalEvals:    0,
-			wantSizesPassed:   0,
-			wantSizesTotal:    0,
-			wantHandoffs:      0,
-			wantHandoffsTotal: 0,
+			name:            "no results",
+			results:         []Result{},
+			wantTotalCost:   0,
+			wantTotalEvals:  0,
+			wantSizesPassed: 0,
+			wantSizesTotal:  0,
 		},
 		{
 			name: "all passed",
 			results: []Result{
 				{
-					Title:         "session1",
-					Cost:          0.01,
-					SizeExpected:  "M",
-					Size:          "M",
-					HandoffsMatch: true,
+					Title:        "session1",
+					Cost:         0.01,
+					SizeExpected: "M",
+					Size:         "M",
 				},
 			},
-			wantTotalCost:     0.01,
-			wantTotalEvals:    1,
-			wantSizesPassed:   1,
-			wantSizesTotal:    1,
-			wantHandoffs:      1,
-			wantHandoffsTotal: 1,
+			wantTotalCost:   0.01,
+			wantTotalEvals:  1,
+			wantSizesPassed: 1,
+			wantSizesTotal:  1,
 		},
 		{
 			name: "size mismatch",
 			results: []Result{
 				{
-					Title:         "session1",
-					SizeExpected:  "M",
-					Size:          "S",
-					HandoffsMatch: true,
+					Title:        "session1",
+					SizeExpected: "M",
+					Size:         "S",
 				},
 			},
-			wantTotalEvals:    1,
-			wantSizesPassed:   0,
-			wantSizesTotal:    1,
-			wantHandoffs:      1,
-			wantHandoffsTotal: 1,
+			wantTotalEvals:  1,
+			wantSizesPassed: 0,
+			wantSizesTotal:  1,
 		},
 		{
 			name: "multiple sessions",
 			results: []Result{
-				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true},
-				{Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S", HandoffsMatch: false},
-				{Title: "session3", Cost: 0.03, HandoffsMatch: true},
+				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M"},
+				{Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S"},
+				{Title: "session3", Cost: 0.03},
 			},
-			wantTotalCost:     0.06,
-			wantTotalEvals:    3,
-			wantSizesPassed:   1,
-			wantSizesTotal:    2,
-			wantHandoffs:      2,
-			wantHandoffsTotal: 3,
+			wantTotalCost:   0.06,
+			wantTotalEvals:  3,
+			wantSizesPassed: 1,
+			wantSizesTotal:  2,
 		},
 		{
 			name: "errored results excluded from totals",
 			results: []Result{
-				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2},
+				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", RelevanceExpected: 2, RelevancePassed: 2},
 				{Title: "session2", Cost: 0.02, Error: "docker build failed", SizeExpected: "L", RelevanceExpected: 2},
 				{Title: "session3", Cost: 0.00, Error: "timeout", RelevanceExpected: 3},
 			},
@@ -326,8 +284,6 @@ func TestComputeSummary(t *testing.T) {
 			wantTotalEvals:     3,
 			wantSizesPassed:    1,
 			wantSizesTotal:     1, // only non-errored results count
-			wantHandoffs:       1,
-			wantHandoffsTotal:  1, // only non-errored results count
 			wantRelevance:      2,
 			wantRelevanceTotal: 2, // only non-errored results count
 		},
@@ -341,8 +297,6 @@ func TestComputeSummary(t *testing.T) {
 			assert.InDelta(t, tt.wantTotalCost, summary.TotalCost, 0.0001)
 			assert.Equal(t, tt.wantSizesPassed, summary.SizesPassed)
 			assert.Equal(t, tt.wantSizesTotal, summary.SizesTotal)
-			assert.Equal(t, tt.wantHandoffs, summary.HandoffsPassed)
-			assert.Equal(t, tt.wantHandoffsTotal, summary.HandoffsTotal)
 			assert.InDelta(t, tt.wantRelevance, summary.RelevancePassed, 0.0001)
 			assert.InDelta(t, tt.wantRelevanceTotal, summary.RelevanceTotal, 0.0001)
 		})
@@ -377,14 +331,12 @@ func TestSaveRunJSON(t *testing.T) {
 		Timestamp: time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC),
 		Duration:  5 * time.Minute,
 		Results: []Result{
-			{Title: "test1", Cost: 0.01, HandoffsMatch: true},
+			{Title: "test1", Cost: 0.01},
 			{Title: "test2", Cost: 0.02, Error: "failed"},
 		},
 		Summary: Summary{
-			TotalEvals:     2,
-			TotalCost:      0.03,
-			HandoffsPassed: 1,
-			HandoffsTotal:  1,
+			TotalEvals: 2,
+			TotalCost:  0.03,
 		},
 	}
 
@@ -581,15 +533,12 @@ func TestPrintSummary(t *testing.T) {
 				TotalEvals:      10,
 				FailedEvals:     5,
 				TotalCost:       0.05,
-				HandoffsPassed:  3,
-				HandoffsTotal:   5,
 				RelevancePassed: 8,
 				RelevanceTotal:  10,
 			},
 			duration: 2 * time.Minute,
 			wantContains: []string{
 				"Errors: 5/10 evaluations failed",
-				"Handoffs: 3/5 passed",
 				"Relevance: 8/10 passed",
 				"Total Cost: $0.050000",
 				"Total Time: 2m0s",
@@ -602,15 +551,12 @@ func TestPrintSummary(t *testing.T) {
 				TotalCost:       0.1,
 				SizesPassed:     4,
 				SizesTotal:      5,
-				HandoffsPassed:  5,
-				HandoffsTotal:   5,
 				RelevancePassed: 10,
 				RelevanceTotal:  10,
 			},
 			duration: 1 * time.Minute,
 			wantContains: []string{
 				"Sizes: 4/5 passed",
-				"Handoffs: 5/5 passed",
 				"Relevance: 10/10 passed",
 				"Total Cost: $0.100000",
 			},
@@ -683,14 +629,12 @@ func TestProgressBarPrintResult(t *testing.T) {
 		{
 			name: "successful result",
 			result: Result{
-				Title:         "test-session",
-				Cost:          0.005,
-				HandoffsMatch: true,
+				Title: "test-session",
+				Cost:  0.005,
 			},
 			wantContains: []string{
 				"✓ test-session",
 				"$0.005000",
-				"✓ handoffs",
 			},
 		},
 		{
@@ -712,14 +656,12 @@ func TestProgressBarPrintResult(t *testing.T) {
 				Cost:              0.01,
 				SizeExpected:      "M",
 				Size:              "S",
-				HandoffsMatch:     true,
 				RelevanceExpected: 2,
 				RelevancePassed:   1,
 				FailedRelevance:   []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}},
 			},
 			wantContains: []string{
 				"✗ mixed-session", // overall failed
-				"✓ handoffs",
 				"✗ size expected M, got S",
 				"✗ relevance: check failed (reason: did not meet criteria)",
 			},
diff --git a/pkg/evaluation/scoring.go b/pkg/evaluation/scoring.go
index 5d533d83c..4d686533f 100644
--- a/pkg/evaluation/scoring.go
+++ b/pkg/evaluation/scoring.go
@@ -62,16 +62,6 @@ func countStrings(strs []string) map[string]int {
 	return counts
 }
 
-func countHandoffs(toolCalls []string) int {
-	count := 0
-	for _, name := range toolCalls {
-		if name == "handoff" {
-			count++
-		}
-	}
-	return count
-}
-
 func computeSummary(results []Result) Summary {
 	summary := Summary{
 		TotalEvals: len(results),
@@ -96,11 +86,6 @@ func computeSummary(results []Result) Summary {
 			summary.ToolsCount++
 		}
 
-		summary.HandoffsTotal++
-		if r.HandoffsMatch {
-			summary.HandoffsPassed++
-		}
-
 		summary.RelevanceTotal += r.RelevanceExpected
 		summary.RelevancePassed += r.RelevancePassed
 	}
@@ -118,7 +103,6 @@ func printSummary(out io.Writer, summary Summary, duration time.Duration) {
 
 	printMetric(out, "Sizes", summary.SizesPassed, summary.SizesTotal)
 	printF1Score(out, "Tool Calls", summary.ToolsF1Sum, summary.ToolsCount)
-	printMetric(out, "Handoffs", summary.HandoffsPassed, summary.HandoffsTotal)
 	printMetric(out, "Relevance", int(summary.RelevancePassed), int(summary.RelevanceTotal))
 
 	fmt.Fprintf(out, "\nTotal Cost: $%.6f\n", summary.TotalCost)
diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go
index 8e5ac5395..fbe0c9bdb 100644
--- a/pkg/evaluation/types.go
+++ b/pkg/evaluation/types.go
@@ -25,7 +25,6 @@ type Result struct {
 	SizeExpected      string            `json:"size_expected"`
 	ToolCallsScore    float64           `json:"tool_calls_score"`
 	ToolCallsExpected float64           `json:"tool_calls_score_expected"`
-	HandoffsMatch     bool              `json:"handoffs"`
 	RelevancePassed   float64           `json:"relevance"`
 	RelevanceExpected float64           `json:"relevance_expected"`
 	FailedRelevance   []RelevanceResult `json:"failed_relevance,omitempty"`
@@ -58,13 +57,6 @@ func (r *Result) checkResults() (successes, failures []string) {
 		}
 	}
 
-	// Check handoffs
-	if r.HandoffsMatch {
-		successes = append(successes, "handoffs")
-	} else {
-		failures = append(failures, "handoffs mismatch")
-	}
-
 	// Check relevance
 	if r.RelevanceExpected > 0 {
 		if r.RelevancePassed >= r.RelevanceExpected {
@@ -92,8 +84,6 @@ type Summary struct {
 	SizesTotal      int     `json:"sizes_total"`
 	ToolsF1Sum      float64 `json:"tools_f1_sum"`
 	ToolsCount      int     `json:"tools_count"`
-	HandoffsPassed  int     `json:"handoffs_passed"`
-	HandoffsTotal   int     `json:"handoffs_total"`
 	RelevancePassed float64 `json:"relevance_passed"`
 	RelevanceTotal  float64 `json:"relevance_total"`
 }

From e8e5ba6d82289068a2a01a375e4a9f9b8fd05091 Mon Sep 17 00:00:00 2001
From: David Gageot <david.gageot@docker.com>
Date: Fri, 13 Mar 2026 21:58:40 +0100
Subject: [PATCH 5/5] Disallow unknown fields

Signed-off-by: David Gageot <david.gageot@docker.com>
---
 pkg/session/session.go      | 17 ++++++++++++++
 pkg/session/session_test.go | 47 +++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/pkg/session/session.go b/pkg/session/session.go
index ef871b8c8..eac2a5462 100644
--- a/pkg/session/session.go
+++ b/pkg/session/session.go
@@ -1,6 +1,8 @@
 package session
 
 import (
+	"bytes"
+	"encoding/json"
 	"log/slog"
 	"os"
 	"slices"
@@ -217,6 +219,21 @@ type EvalCriteria struct {
 	Setup      string   `json:"setup,omitempty"`       // Optional sh script to run in the container before docker agent run --exec
 }
 
+// UnmarshalJSON implements custom JSON unmarshaling for EvalCriteria that
+// rejects unknown fields. This ensures eval JSON files don't contain typos
+// or unsupported fields that would be silently ignored.
+func (e *EvalCriteria) UnmarshalJSON(data []byte) error {
+	type evalCriteria EvalCriteria // alias to avoid infinite recursion
+	var v evalCriteria
+	dec := json.NewDecoder(bytes.NewReader(data))
+	dec.DisallowUnknownFields()
+	if err := dec.Decode(&v); err != nil {
+		return err
+	}
+	*e = EvalCriteria(v)
+	return nil
+}
+
 // deepCopyMessage returns a deep copy of a session Message.
 // It copies the inner chat.Message's slice and pointer fields so that the
 // returned value shares no mutable state with the original.
diff --git a/pkg/session/session_test.go b/pkg/session/session_test.go
index 724294c5c..e326c4a3e 100644
--- a/pkg/session/session_test.go
+++ b/pkg/session/session_test.go
@@ -1,6 +1,7 @@
 package session
 
 import (
+	"encoding/json"
 	"strings"
 	"testing"
 
@@ -292,6 +293,52 @@ func TestGetLastUserMessages(t *testing.T) {
 	})
 }
 
+func TestEvalCriteriaUnmarshalJSON(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name    string
+		input   string
+		want    EvalCriteria
+		wantErr bool
+	}{
+		{
+			name:  "valid fields",
+			input: `{"relevance":["is correct"],"size":"M","setup":"echo hello","working_dir":"mydir"}`,
+			want: EvalCriteria{
+				Relevance:  []string{"is correct"},
+				Size:       "M",
+				Setup:      "echo hello",
+				WorkingDir: "mydir",
+			},
+		},
+		{
+			name:  "empty object",
+			input: `{}`,
+			want:  EvalCriteria{},
+		},
+		{
+			name:    "unknown field rejected",
+			input:   `{"relevance":[],"unknown_field":"value"}`,
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			var got EvalCriteria
+			err := json.Unmarshal([]byte(tt.input), &got)
+			if tt.wantErr {
+				require.Error(t, err)
+				return
+			}
+			require.NoError(t, err)
+			assert.Equal(t, tt.want, got)
+		})
+	}
+}
+
 func TestTransferTaskPromptExcludesParents(t *testing.T) {
 	t.Parallel()