Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/features/evaluation/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ docker-agent evaluates agents across four dimensions:
| **Tool Calls (F1)** | F1 score between the expected tool call sequence (from the recorded session) and the actual tool calls made by the agent. |
| **Relevance** | An LLM judge (configurable via `--judge-model`) evaluates whether each relevance statement is satisfied by the response. |
| **Size** | Whether the response length matches the expected size category (S/M/L/XL). |
| **Handoffs** | For multi-agent configs, whether task delegation matched the expected agent handoff pattern. |

## Creating Eval Sessions

Expand Down Expand Up @@ -192,7 +191,6 @@ $ docker agent eval demo.yaml ./evals
Summary: 2/2 passed
Sizes: 0/0
Tool Calls: avg F1 1.00 (2 evals)
Handoffs: 2/2
Relevance: 3/3

Sessions DB: ./evals/results/happy-panda-1234.db
Expand Down
39 changes: 31 additions & 8 deletions pkg/evaluation/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ type Runner struct {
func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judgeModel provider.Provider, cfg Config) *Runner {
var judge *Judge
if judgeModel != nil {
judge = NewJudge(judgeModel, runConfig, cfg.Concurrency)
judge = NewJudge(judgeModel, cfg.Concurrency)
}
return &Runner{
Config: cfg,
Expand Down Expand Up @@ -117,6 +117,20 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
return nil, fmt.Errorf("loading evaluations: %w", err)
}

// Check whether any evaluations require relevance checking.
// If so, the judge must be configured and working; validate eagerly
// to fail fast on configuration issues (bad API key, wrong model, etc.)
// instead of silently producing zero-relevance results.
if needsJudge(evals) {
if r.judge == nil {
return nil, errors.New("some evaluations have relevance criteria but no judge model is configured (use --judge-model)")
}
fmt.Fprintln(out, "Validating judge model...")
if err := r.judge.Validate(ctx); err != nil {
return nil, fmt.Errorf("%w", err)
}
}

// Pre-build all unique Docker images in parallel before running evaluations.
// This avoids serialized builds when multiple workers need the same image.
if err := r.preBuildImages(ctx, out, evals); err != nil {
Expand Down Expand Up @@ -336,17 +350,15 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
}

result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)

if r.judge != nil && len(evals.Relevance) > 0 {
// Use transcript for relevance checking to preserve temporal ordering
transcript := buildTranscript(events)
passed, failed, errs := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
if err != nil {
return result, fmt.Errorf("relevance check failed: %w", err)
}
result.RelevancePassed = float64(passed)
result.FailedRelevance = failed
for _, e := range errs {
slog.Warn("Relevance check error", "title", evalSess.Title, "error", e)
}
}

slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))
Expand Down Expand Up @@ -590,6 +602,14 @@ func matchesAnyPattern(name string, patterns []string) bool {
})
}

// needsJudge reports whether at least one of the given evaluation sessions
// declares relevance criteria. Sessions without an Evals section, or with an
// empty Relevance list, do not count.
func needsJudge(evals []InputSession) bool {
	for i := range evals {
		criteria := evals[i].Evals
		if criteria == nil {
			// No evaluation criteria at all for this session.
			continue
		}
		if len(criteria.Relevance) > 0 {
			return true
		}
	}
	return false
}

// createJudgeModel creates a provider.Provider from a model string (format: provider/model).
// Returns nil if judgeModel is empty.
func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.RuntimeConfig) (provider.Provider, error) {
Expand All @@ -602,7 +622,10 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.
return nil, fmt.Errorf("invalid judge model format %q: expected 'provider/model'", judgeModel)
}

var opts []options.Opt
opts := []options.Opt{
options.WithThinking(false),
options.WithStructuredOutput(judgeResponseSchema),
}
if runConfig.ModelsGateway != "" {
opts = append(opts, options.WithGateway(runConfig.ModelsGateway))
}
Expand Down
Loading
Loading