From 25f6b70792eeb6c23dee473fb5050df2f946a61a Mon Sep 17 00:00:00 2001 From: David Gageot Date: Fri, 13 Mar 2026 18:33:35 +0100 Subject: [PATCH 1/5] Validate llm judge Signed-off-by: David Gageot --- pkg/evaluation/eval.go | 30 ++++++++++++++++--- pkg/evaluation/eval_test.go | 57 ++++++++++++++++++++++++++++++++++++ pkg/evaluation/judge.go | 46 ++++++++++++++++++++++++----- pkg/evaluation/judge_test.go | 13 ++++---- 4 files changed, 128 insertions(+), 18 deletions(-) diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index aaf73ef38..ad1b60a48 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -117,6 +117,20 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([] return nil, fmt.Errorf("loading evaluations: %w", err) } + // Check whether any evaluations require relevance checking. + // If so, the judge must be configured and working; validate eagerly + // to fail fast on configuration issues (bad API key, wrong model, etc.) + // instead of silently producing zero-relevance results. + if needsJudge(evals) { + if r.judge == nil { + return nil, errors.New("some evaluations have relevance criteria but no judge model is configured (use --judge-model)") + } + fmt.Fprintln(out, "Validating judge model...") + if err := r.judge.Validate(ctx); err != nil { + return nil, fmt.Errorf("%w", err) + } + } + // Pre-build all unique Docker images in parallel before running evaluations. // This avoids serialized builds when multiple workers need the same image. if err := r.preBuildImages(ctx, out, evals); err != nil { @@ -341,12 +355,12 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res if r.judge != nil && len(evals.Relevance) > 0 { // Use transcript for relevance checking to preserve temporal ordering transcript := buildTranscript(events) - passed, failed, errs := r.judge.CheckRelevance(ctx, transcript, evals.Relevance) + passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance) + if err != nil { + return result, fmt.Errorf("relevance check failed: %w", err) + } result.RelevancePassed = float64(passed) result.FailedRelevance = failed - for _, e := range errs { - slog.Warn("Relevance check error", "title", evalSess.Title, "error", e) - } } slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime)) @@ -590,6 +604,14 @@ func matchesAnyPattern(name string, patterns []string) bool { }) } +// needsJudge returns true if any evaluation session has relevance criteria, +// meaning a judge model is required to evaluate them. +func needsJudge(evals []InputSession) bool { + return slices.ContainsFunc(evals, func(s InputSession) bool { + return s.Evals != nil && len(s.Evals.Relevance) > 0 + }) +} + // createJudgeModel creates a provider.Provider from a model string (format: provider/model). // Returns nil if judgeModel is empty. func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.RuntimeConfig) (provider.Provider, error) { diff --git a/pkg/evaluation/eval_test.go b/pkg/evaluation/eval_test.go index 5fbf86607..6cb593508 100644 --- a/pkg/evaluation/eval_test.go +++ b/pkg/evaluation/eval_test.go @@ -12,6 +12,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/docker/docker-agent/pkg/session" ) func TestToolCallF1Score(t *testing.T) { @@ -1009,3 +1011,58 @@ func TestMatchesAnyPattern(t *testing.T) { }) } } + +func TestNeedsJudge(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + evals []InputSession + want bool + }{ + { + name: "no evals", + evals: nil, + want: false, + }, + { + name: "evals without relevance criteria", + evals: []InputSession{ + {Session: &session.Session{Evals: &session.EvalCriteria{Size: "M"}}}, + {Session: &session.Session{Evals: &session.EvalCriteria{}}}, + }, + want: false, + }, + { + name: "evals with nil Evals field", + evals: []InputSession{ + {Session: &session.Session{}}, + }, + want: false, + }, + { + name: "some evals with relevance criteria", + evals: []InputSession{ + {Session: &session.Session{Evals: &session.EvalCriteria{}}}, + {Session: &session.Session{Evals: &session.EvalCriteria{Relevance: []string{"criterion1"}}}}, + }, + want: true, + }, + { + name: "all evals with relevance criteria", + evals: []InputSession{ + {Session: &session.Session{Evals: &session.EvalCriteria{Relevance: []string{"a", "b"}}}}, + {Session: &session.Session{Evals: &session.EvalCriteria{Relevance: []string{"c"}}}}, + }, + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := needsJudge(tt.evals) + assert.Equal(t, tt.want, got) + }) + } +} diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go index c530022c2..9aae65006 100644 --- a/pkg/evaluation/judge.go +++ b/pkg/evaluation/judge.go @@ -82,6 +82,29 @@ func NewJudge(model provider.Provider, runConfig *config.RuntimeConfig, concurre } } +// Validate performs an end-to-end check of the judge model by sending a +// trivial relevance prompt and verifying the response is valid structured +// JSON. This catches configuration errors (bad API key, unsupported model, +// missing structured-output support, etc.) before running any evaluations, +// allowing the framework to fail fast. +func (j *Judge) Validate(ctx context.Context) error { + const ( + testResponse = "The sky is blue." + testCriterion = "The response mentions a color." + ) + + passed, _, err := j.checkSingle(ctx, testResponse, testCriterion) + if err != nil { + return fmt.Errorf("judge model validation failed: %w", err) + } + + if !passed { + return errors.New("judge model validation failed: expected the test criterion to pass but the judge returned 'fail'") + } + + return nil +} + // RelevanceResult contains the result of a single relevance check. type RelevanceResult struct { Criterion string `json:"criterion"` @@ -89,8 +112,11 @@ type RelevanceResult struct { } // CheckRelevance runs all relevance checks concurrently with the configured concurrency. -// It returns the number of passed checks, a slice of failed results with reasons, and any errors encountered. -func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, errs []string) { +// It returns the number of passed checks, a slice of failed results with reasons, and an error +// if any check encountered an error (e.g. judge model misconfiguration). Errors cause a hard +// failure so that configuration issues are surfaced immediately rather than silently producing +// zero-relevance results. +func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []string) (passed int, failed []RelevanceResult, err error) { if len(criteria) == 0 { return 0, nil, nil } @@ -122,17 +148,19 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] results[item.index] = result{err: fmt.Errorf("context cancelled: %w", ctx.Err())} continue } - pass, reason, err := j.checkSingle(ctx, response, item.criterion) - results[item.index] = result{passed: pass, reason: reason, err: err} + pass, reason, checkErr := j.checkSingle(ctx, response, item.criterion) + results[item.index] = result{passed: pass, reason: reason, err: checkErr} } }) } wg.Wait() - // Aggregate results + // Aggregate results. Any error is fatal — return it immediately so the + // caller can fail fast on judge misconfiguration. + var errs []error for i, r := range results { if r.err != nil { - errs = append(errs, fmt.Sprintf("error checking %q: %v", criteria[i], r.err)) + errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err)) continue } if r.passed { @@ -145,7 +173,11 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] } } - return passed, failed, errs + if len(errs) > 0 { + return passed, failed, errors.Join(errs...) + } + + return passed, failed, nil } // getOrCreateJudgeWithSchema returns a provider pre-configured with structured output. diff --git a/pkg/evaluation/judge_test.go b/pkg/evaluation/judge_test.go index fe9594a67..aba53be18 100644 --- a/pkg/evaluation/judge_test.go +++ b/pkg/evaluation/judge_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestNewJudge(t *testing.T) { @@ -46,11 +47,11 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) { t.Parallel() judge := NewJudge(nil, nil, 1) - passed, failed, errs := judge.CheckRelevance(t.Context(), "some response", nil) + passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil) assert.Equal(t, 0, passed) assert.Empty(t, failed) - assert.Empty(t, errs) + assert.NoError(t, err) } func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) { @@ -62,13 +63,11 @@ func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) { cancel() // Cancel immediately criteria := []string{"criterion1", "criterion2", "criterion3"} - passed, failed, errs := judge.CheckRelevance(ctx, "some response", criteria) + passed, failed, err := judge.CheckRelevance(ctx, "some response", criteria) // All should have errors due to context cancellation assert.Equal(t, 0, passed) assert.Empty(t, failed) - assert.Len(t, errs, 3) - for _, err := range errs { - assert.Contains(t, err, "context cancelled") - } + require.Error(t, err) + assert.Contains(t, err.Error(), "context cancelled") } From 37ec4d169d3221613615daa9314112355421c1ae Mon Sep 17 00:00:00 2001 From: David Gageot Date: Fri, 13 Mar 2026 20:18:51 +0100 Subject: [PATCH 2/5] No thinking for llm as a judge Signed-off-by: David Gageot --- pkg/evaluation/eval.go | 4 +++- pkg/evaluation/judge.go | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index ad1b60a48..4a174bdf2 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -624,7 +624,9 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config. return nil, fmt.Errorf("invalid judge model format %q: expected 'provider/model'", judgeModel) } - var opts []options.Opt + opts := []options.Opt{ + options.WithThinking(false), + } if runConfig.ModelsGateway != "" { opts = append(opts, options.WithGateway(runConfig.ModelsGateway)) } diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go index 9aae65006..7fb41953f 100644 --- a/pkg/evaluation/judge.go +++ b/pkg/evaluation/judge.go @@ -194,6 +194,7 @@ func (j *Judge) getOrCreateJudgeWithSchema(ctx context.Context) (provider.Provid opts := []options.Opt{ options.WithStructuredOutput(judgeResponseSchema), + options.WithThinking(false), } if j.runConfig.ModelsGateway != "" { opts = append(opts, options.WithGateway(j.runConfig.ModelsGateway)) From f11de8a3221132fbf1b3a33d9a93131cd11f2ea4 Mon Sep 17 00:00:00 2001 From: David Gageot Date: Fri, 13 Mar 2026 20:23:30 +0100 Subject: [PATCH 3/5] Remove duplication around llm as a judge Signed-off-by: David Gageot --- pkg/evaluation/eval.go | 3 ++- pkg/evaluation/judge.go | 50 ++---------------------------------- pkg/evaluation/judge_test.go | 6 ++--- 3 files changed, 7 insertions(+), 52 deletions(-) diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index 4a174bdf2..d1202cf37 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -49,7 +49,7 @@ type Runner struct { func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judgeModel provider.Provider, cfg Config) *Runner { var judge *Judge if judgeModel != nil { - judge = NewJudge(judgeModel, runConfig, cfg.Concurrency) + judge = NewJudge(judgeModel, cfg.Concurrency) } return &Runner{ Config: cfg, @@ -626,6 +626,7 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config. opts := []options.Opt{ options.WithThinking(false), + options.WithStructuredOutput(judgeResponseSchema), } if runConfig.ModelsGateway != "" { opts = append(opts, options.WithGateway(runConfig.ModelsGateway)) diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go index 7fb41953f..6062132ae 100644 --- a/pkg/evaluation/judge.go +++ b/pkg/evaluation/judge.go @@ -11,10 +11,8 @@ import ( "sync" "github.com/docker/docker-agent/pkg/chat" - "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/model/provider" - "github.com/docker/docker-agent/pkg/model/provider/options" ) // relevancePrompt is the prompt template for the judge model to evaluate responses. @@ -58,26 +56,17 @@ var judgeResponseSchema = &latest.StructuredOutput{ // Judge runs LLM-as-a-judge relevance checks concurrently. type Judge struct { model provider.Provider - runConfig *config.RuntimeConfig concurrency int - - // judgeWithSchema is a provider pre-configured with structured output. - // Created lazily on first use and reused across all relevance checks. - // Protected by judgeWithSchemaMu; only cached on success so that - // transient errors (e.g. context cancellation) can be retried. - judgeWithSchema provider.Provider - judgeWithSchemaMu sync.Mutex } // NewJudge creates a new Judge that runs relevance checks with the given concurrency. // Concurrency defaults to 1 if n < 1. -func NewJudge(model provider.Provider, runConfig *config.RuntimeConfig, concurrency int) *Judge { +func NewJudge(model provider.Provider, concurrency int) *Judge { if concurrency < 1 { concurrency = 1 } return &Judge{ model: model, - runConfig: runConfig, concurrency: concurrency, } } @@ -180,48 +169,13 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] return passed, failed, nil } -// getOrCreateJudgeWithSchema returns a provider pre-configured with structured output. -// The provider is created once and reused across all relevance checks. -// Unlike sync.Once, transient failures (e.g. context cancellation) are not -// cached, allowing subsequent calls to retry. -func (j *Judge) getOrCreateJudgeWithSchema(ctx context.Context) (provider.Provider, error) { - j.judgeWithSchemaMu.Lock() - defer j.judgeWithSchemaMu.Unlock() - - if j.judgeWithSchema != nil { - return j.judgeWithSchema, nil - } - - opts := []options.Opt{ - options.WithStructuredOutput(judgeResponseSchema), - options.WithThinking(false), - } - if j.runConfig.ModelsGateway != "" { - opts = append(opts, options.WithGateway(j.runConfig.ModelsGateway)) - } - - modelCfg := j.model.BaseConfig().ModelConfig - p, err := provider.New(ctx, &modelCfg, j.runConfig.EnvProvider(), opts...) - if err != nil { - return nil, err - } - - j.judgeWithSchema = p - return j.judgeWithSchema, nil -} - // checkSingle checks a single relevance criterion against the response. // It returns whether the check passed, the reason provided by the judge, and any error. func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (passed bool, reason string, err error) { - judgeWithSchema, err := j.getOrCreateJudgeWithSchema(ctx) - if err != nil { - return false, "", fmt.Errorf("creating judge provider with structured output: %w", err) - } - prompt := fmt.Sprintf(relevancePrompt, response, criterion) messages := []chat.Message{{Role: chat.MessageRoleUser, Content: prompt}} - stream, err := judgeWithSchema.CreateChatCompletionStream(ctx, messages, nil) + stream, err := j.model.CreateChatCompletionStream(ctx, messages, nil) if err != nil { return false, "", fmt.Errorf("creating chat completion: %w", err) } diff --git a/pkg/evaluation/judge_test.go b/pkg/evaluation/judge_test.go index aba53be18..4b54ca64e 100644 --- a/pkg/evaluation/judge_test.go +++ b/pkg/evaluation/judge_test.go @@ -37,7 +37,7 @@ func TestNewJudge(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - judge := NewJudge(nil, nil, tt.concurrency) + judge := NewJudge(nil, tt.concurrency) assert.Equal(t, tt.expectedConcurrency, judge.concurrency) }) } @@ -46,7 +46,7 @@ func TestNewJudge(t *testing.T) { func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) { t.Parallel() - judge := NewJudge(nil, nil, 1) + judge := NewJudge(nil, 1) passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil) assert.Equal(t, 0, passed) @@ -57,7 +57,7 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) { func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) { t.Parallel() - judge := NewJudge(nil, nil, 2) + judge := NewJudge(nil, 2) ctx, cancel := context.WithCancel(t.Context()) cancel() // Cancel immediately From 388ae8b9fe90bb996fa2a5faa9e980c8f60fcb95 Mon Sep 17 00:00:00 2001 From: David Gageot Date: Fri, 13 Mar 2026 20:46:03 +0100 Subject: [PATCH 4/5] Remove handoffs scoring from the evals Signed-off-by: David Gageot --- docs/features/evaluation/index.md | 2 - pkg/evaluation/eval.go | 2 - pkg/evaluation/eval_test.go | 140 +++++++++--------------------- pkg/evaluation/scoring.go | 16 ---- pkg/evaluation/types.go | 10 --- 5 files changed, 41 insertions(+), 129 deletions(-) diff --git a/docs/features/evaluation/index.md b/docs/features/evaluation/index.md index 48a07d6e5..68acd4ecb 100644 --- a/docs/features/evaluation/index.md +++ b/docs/features/evaluation/index.md @@ -130,7 +130,6 @@ docker-agent evaluates agents across four dimensions: | **Tool Calls (F1)** | F1 score between the expected tool call sequence (from the recorded session) and the actual tool calls made by the agent. | | **Relevance** | An LLM judge (configurable via `--judge-model`) evaluates whether each relevance statement is satisfied by the response. | | **Size** | Whether the response length matches the expected size category (S/M/L/XL). | -| **Handoffs** | For multi-agent configs, whether task delegation matched the expected agent handoff pattern. | ## Creating Eval Sessions @@ -192,7 +191,6 @@ $ docker agent eval demo.yaml ./evals Summary: 2/2 passed Sizes: 0/0 Tool Calls: avg F1 1.00 (2 evals) - Handoffs: 2/2 Relevance: 3/3 Sessions DB: ./evals/results/happy-panda-1234.db diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go index d1202cf37..37d912891 100644 --- a/pkg/evaluation/eval.go +++ b/pkg/evaluation/eval.go @@ -350,8 +350,6 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls) } - result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls) - if r.judge != nil && len(evals.Relevance) > 0 { // Use transcript for relevance checking to preserve temporal ordering transcript := buildTranscript(events) diff --git a/pkg/evaluation/eval_test.go b/pkg/evaluation/eval_test.go index 6cb593508..ed020d708 100644 --- a/pkg/evaluation/eval_test.go +++ b/pkg/evaluation/eval_test.go @@ -126,30 +126,6 @@ func TestGetResponseSize(t *testing.T) { } } -func TestCountHandoffs(t *testing.T) { - t.Parallel() - - tests := []struct { - name string - toolCalls []string - want int - }{ - {"no tool calls", []string{}, 0}, - {"no handoffs", []string{"search", "read_file"}, 0}, - {"one handoff", []string{"handoff", "read_file"}, 1}, - {"one transfer_task", []string{"transfer_task", "read_file"}, 0}, - {"multiple handoffs", []string{"handoff", "transfer_task", "handoff"}, 2}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - got := countHandoffs(tt.toolCalls) - assert.Equal(t, tt.want, got) - }) - } -} - func TestParseJudgeResponse(t *testing.T) { t.Parallel() @@ -202,32 +178,26 @@ func TestResultCheckResults(t *testing.T) { }, { name: "all checks pass", - result: Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2}, - wantSuccess: []string{"size M", "tool calls", "handoffs", "relevance 2/2"}, + result: Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, RelevanceExpected: 2, RelevancePassed: 2}, + wantSuccess: []string{"size M", "tool calls", "relevance 2/2"}, wantFailures: nil, }, { name: "size mismatch", - result: Result{SizeExpected: "M", Size: "S", HandoffsMatch: true}, - wantSuccess: []string{"handoffs"}, + result: Result{SizeExpected: "M", Size: "S"}, + wantSuccess: nil, wantFailures: []string{"size expected M, got S"}, }, { name: "tool calls failed", - result: Result{ToolCallsExpected: 1, ToolCallsScore: 0.5, HandoffsMatch: true}, - wantSuccess: []string{"handoffs"}, - wantFailures: []string{"tool calls score 0.50"}, - }, - { - name: "handoffs mismatch", - result: Result{HandoffsMatch: false}, + result: Result{ToolCallsExpected: 1, ToolCallsScore: 0.5}, wantSuccess: nil, - wantFailures: []string{"handoffs mismatch"}, + wantFailures: []string{"tool calls score 0.50"}, }, { name: "relevance failures listed", - result: Result{HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}}, - wantSuccess: []string{"handoffs"}, + result: Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}}, + wantSuccess: nil, wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"}, }, } @@ -252,73 +222,61 @@ func TestComputeSummary(t *testing.T) { wantTotalEvals int wantSizesPassed int wantSizesTotal int - wantHandoffs int - wantHandoffsTotal int wantRelevance float64 wantRelevanceTotal float64 }{ { - name: "no results", - results: []Result{}, - wantTotalCost: 0, - wantTotalEvals: 0, - wantSizesPassed: 0, - wantSizesTotal: 0, - wantHandoffs: 0, - wantHandoffsTotal: 0, + name: "no results", + results: []Result{}, + wantTotalCost: 0, + wantTotalEvals: 0, + wantSizesPassed: 0, + wantSizesTotal: 0, }, { name: "all passed", results: []Result{ { - Title: "session1", - Cost: 0.01, - SizeExpected: "M", - Size: "M", - HandoffsMatch: true, + Title: "session1", + Cost: 0.01, + SizeExpected: "M", + Size: "M", }, }, - wantTotalCost: 0.01, - wantTotalEvals: 1, - wantSizesPassed: 1, - wantSizesTotal: 1, - wantHandoffs: 1, - wantHandoffsTotal: 1, + wantTotalCost: 0.01, + wantTotalEvals: 1, + wantSizesPassed: 1, + wantSizesTotal: 1, }, { name: "size mismatch", results: []Result{ { - Title: "session1", - SizeExpected: "M", - Size: "S", - HandoffsMatch: true, + Title: "session1", + SizeExpected: "M", + Size: "S", }, }, - wantTotalEvals: 1, - wantSizesPassed: 0, - wantSizesTotal: 1, - wantHandoffs: 1, - wantHandoffsTotal: 1, + wantTotalEvals: 1, + wantSizesPassed: 0, + wantSizesTotal: 1, }, { name: "multiple sessions", results: []Result{ - {Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true}, - {Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S", HandoffsMatch: false}, - {Title: "session3", Cost: 0.03, HandoffsMatch: true}, + {Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M"}, + {Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S"}, + {Title: "session3", Cost: 0.03}, }, - wantTotalCost: 0.06, - wantTotalEvals: 3, - wantSizesPassed: 1, - wantSizesTotal: 2, - wantHandoffs: 2, - wantHandoffsTotal: 3, + wantTotalCost: 0.06, + wantTotalEvals: 3, + wantSizesPassed: 1, + wantSizesTotal: 2, }, { name: "errored results excluded from totals", results: []Result{ - {Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2}, + {Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", RelevanceExpected: 2, RelevancePassed: 2}, {Title: "session2", Cost: 0.02, Error: "docker build failed", SizeExpected: "L", RelevanceExpected: 2}, {Title: "session3", Cost: 0.00, Error: "timeout", RelevanceExpected: 3}, }, @@ -326,8 +284,6 @@ func TestComputeSummary(t *testing.T) { wantTotalEvals: 3, wantSizesPassed: 1, wantSizesTotal: 1, // only non-errored results count - wantHandoffs: 1, - wantHandoffsTotal: 1, // only non-errored results count wantRelevance: 2, wantRelevanceTotal: 2, // only non-errored results count }, @@ -341,8 +297,6 @@ func TestComputeSummary(t *testing.T) { assert.InDelta(t, tt.wantTotalCost, summary.TotalCost, 0.0001) assert.Equal(t, tt.wantSizesPassed, summary.SizesPassed) assert.Equal(t, tt.wantSizesTotal, summary.SizesTotal) - assert.Equal(t, tt.wantHandoffs, summary.HandoffsPassed) - assert.Equal(t, tt.wantHandoffsTotal, summary.HandoffsTotal) assert.InDelta(t, tt.wantRelevance, summary.RelevancePassed, 0.0001) assert.InDelta(t, tt.wantRelevanceTotal, summary.RelevanceTotal, 0.0001) }) @@ -377,14 +331,12 @@ func TestSaveRunJSON(t *testing.T) { Timestamp: time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC), Duration: 5 * time.Minute, Results: []Result{ - {Title: "test1", Cost: 0.01, HandoffsMatch: true}, + {Title: "test1", Cost: 0.01}, {Title: "test2", Cost: 0.02, Error: "failed"}, }, Summary: Summary{ - TotalEvals: 2, - TotalCost: 0.03, - HandoffsPassed: 1, - HandoffsTotal: 1, + TotalEvals: 2, + TotalCost: 0.03, }, } @@ -581,15 +533,12 @@ func TestPrintSummary(t *testing.T) { TotalEvals: 10, FailedEvals: 5, TotalCost: 0.05, - HandoffsPassed: 3, - HandoffsTotal: 5, RelevancePassed: 8, RelevanceTotal: 10, }, duration: 2 * time.Minute, wantContains: []string{ "Errors: 5/10 evaluations failed", - "Handoffs: 3/5 passed", "Relevance: 8/10 passed", "Total Cost: $0.050000", "Total Time: 2m0s", @@ -602,15 +551,12 @@ func TestPrintSummary(t *testing.T) { TotalCost: 0.1, SizesPassed: 4, SizesTotal: 5, - HandoffsPassed: 5, - HandoffsTotal: 5, RelevancePassed: 10, RelevanceTotal: 10, }, duration: 1 * time.Minute, wantContains: []string{ "Sizes: 4/5 passed", - "Handoffs: 5/5 passed", "Relevance: 10/10 passed", "Total Cost: $0.100000", }, @@ -683,14 +629,12 @@ func TestProgressBarPrintResult(t *testing.T) { { name: "successful result", result: Result{ - Title: "test-session", - Cost: 0.005, - HandoffsMatch: true, + Title: "test-session", + Cost: 0.005, }, wantContains: []string{ "✓ test-session", "$0.005000", - "✓ handoffs", }, }, { @@ -712,14 +656,12 @@ func TestProgressBarPrintResult(t *testing.T) { Cost: 0.01, SizeExpected: "M", Size: "S", - HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 1, FailedRelevance: []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}}, }, wantContains: []string{ "✗ mixed-session", // overall failed - "✓ handoffs", "✗ size expected M, got S", "✗ relevance: check failed (reason: did not meet criteria)", }, diff --git a/pkg/evaluation/scoring.go b/pkg/evaluation/scoring.go index 5d533d83c..4d686533f 100644 --- a/pkg/evaluation/scoring.go +++ b/pkg/evaluation/scoring.go @@ -62,16 +62,6 @@ func countStrings(strs []string) map[string]int { return counts } -func countHandoffs(toolCalls []string) int { - count := 0 - for _, name := range toolCalls { - if name == "handoff" { - count++ - } - } - return count -} - func computeSummary(results []Result) Summary { summary := Summary{ TotalEvals: len(results), @@ -96,11 +86,6 @@ func computeSummary(results []Result) Summary { summary.ToolsCount++ } - summary.HandoffsTotal++ - if r.HandoffsMatch { - summary.HandoffsPassed++ - } - summary.RelevanceTotal += r.RelevanceExpected summary.RelevancePassed += r.RelevancePassed } @@ -118,7 +103,6 @@ func printSummary(out io.Writer, summary Summary, duration time.Duration) { printMetric(out, "Sizes", summary.SizesPassed, summary.SizesTotal) printF1Score(out, "Tool Calls", summary.ToolsF1Sum, summary.ToolsCount) - printMetric(out, "Handoffs", summary.HandoffsPassed, summary.HandoffsTotal) printMetric(out, "Relevance", int(summary.RelevancePassed), int(summary.RelevanceTotal)) fmt.Fprintf(out, "\nTotal Cost: $%.6f\n", summary.TotalCost) diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go index 8e5ac5395..fbe0c9bdb 100644 --- a/pkg/evaluation/types.go +++ b/pkg/evaluation/types.go @@ -25,7 +25,6 @@ type Result struct { SizeExpected string `json:"size_expected"` ToolCallsScore float64 `json:"tool_calls_score"` ToolCallsExpected float64 `json:"tool_calls_score_expected"` - HandoffsMatch bool `json:"handoffs"` RelevancePassed float64 `json:"relevance"` RelevanceExpected float64 `json:"relevance_expected"` FailedRelevance []RelevanceResult `json:"failed_relevance,omitempty"` @@ -58,13 +57,6 @@ func (r *Result) checkResults() (successes, failures []string) { } } - // Check handoffs - if r.HandoffsMatch { - successes = append(successes, "handoffs") - } else { - failures = append(failures, "handoffs mismatch") - } - // Check relevance if r.RelevanceExpected > 0 { if r.RelevancePassed >= r.RelevanceExpected { @@ -92,8 +84,6 @@ type Summary struct { SizesTotal int `json:"sizes_total"` ToolsF1Sum float64 `json:"tools_f1_sum"` ToolsCount int `json:"tools_count"` - HandoffsPassed int `json:"handoffs_passed"` - HandoffsTotal int `json:"handoffs_total"` RelevancePassed float64 `json:"relevance_passed"` RelevanceTotal float64 `json:"relevance_total"` } From e8e5ba6d82289068a2a01a375e4a9f9b8fd05091 Mon Sep 17 00:00:00 2001 From: David Gageot Date: Fri, 13 Mar 2026 21:58:40 +0100 Subject: [PATCH 5/5] Disallow unknown fields Signed-off-by: David Gageot --- pkg/session/session.go | 17 ++++++++++++++ pkg/session/session_test.go | 47 +++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/pkg/session/session.go b/pkg/session/session.go index ef871b8c8..eac2a5462 100644 --- a/pkg/session/session.go +++ b/pkg/session/session.go @@ -1,6 +1,8 @@ package session import ( + "bytes" + "encoding/json" "log/slog" "os" "slices" @@ -217,6 +219,21 @@ type EvalCriteria struct { Setup string `json:"setup,omitempty"` // Optional sh script to run in the container before docker agent run --exec } +// UnmarshalJSON implements custom JSON unmarshaling for EvalCriteria that +// rejects unknown fields. This ensures eval JSON files don't contain typos +// or unsupported fields that would be silently ignored. +func (e *EvalCriteria) UnmarshalJSON(data []byte) error { + type evalCriteria EvalCriteria // alias to avoid infinite recursion + var v evalCriteria + dec := json.NewDecoder(bytes.NewReader(data)) + dec.DisallowUnknownFields() + if err := dec.Decode(&v); err != nil { + return err + } + *e = EvalCriteria(v) + return nil +} + // deepCopyMessage returns a deep copy of a session Message. // It copies the inner chat.Message's slice and pointer fields so that the // returned value shares no mutable state with the original. diff --git a/pkg/session/session_test.go b/pkg/session/session_test.go index 724294c5c..e326c4a3e 100644 --- a/pkg/session/session_test.go +++ b/pkg/session/session_test.go @@ -1,6 +1,7 @@ package session import ( + "encoding/json" "strings" "testing" @@ -292,6 +293,52 @@ func TestGetLastUserMessages(t *testing.T) { }) } +func TestEvalCriteriaUnmarshalJSON(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input string + want EvalCriteria + wantErr bool + }{ + { + name: "valid fields", + input: `{"relevance":["is correct"],"size":"M","setup":"echo hello","working_dir":"mydir"}`, + want: EvalCriteria{ + Relevance: []string{"is correct"}, + Size: "M", + Setup: "echo hello", + WorkingDir: "mydir", + }, + }, + { + name: "empty object", + input: `{}`, + want: EvalCriteria{}, + }, + { + name: "unknown field rejected", + input: `{"relevance":[],"unknown_field":"value"}`, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + var got EvalCriteria + err := json.Unmarshal([]byte(tt.input), &got) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + func TestTransferTaskPromptExcludesParents(t *testing.T) { t.Parallel()