Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 112 additions & 2 deletions eng/skill-validator/src/Commands/ValidateCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ public static RootCommand Create()
var reporterOpt = new Option<string[]>("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true };
var noOverfittingCheckOpt = new Option<bool>("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" };
var overfittingFixOpt = new Option<bool>("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" };
var selectivityTestOpt = new Option<bool>("--selectivity-test") { Description = "Run selectivity test using should_activate / should_not_activate prompts from eval.yaml" };
var selectivityMinRecallOpt = new Option<double>("--selectivity-min-recall") { Description = "Minimum recall (activation on should_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 };
var selectivityMinPrecisionOpt = new Option<double>("--selectivity-min-precision") { Description = "Minimum precision (non-activation on should_not_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 };
Comment on lines +34 to +35
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The --selectivity-min-recall and --selectivity-min-precision CLI options accept any double value but there is no validation that they fall in the range [0, 1]. Values outside this range (e.g., --selectivity-min-recall 1.5) would cause the test to always fail, or negative values would cause it to always pass. The existing --min-improvement option also lacks this validation, but for a new feature this is a good opportunity to add a clear guard or error message for out-of-range inputs.

Copilot uses AI. Check for mistakes.

var command = new RootCommand("Validate that agent skills meaningfully improve agent performance")
{
Expand All @@ -53,6 +56,9 @@ public static RootCommand Create()
reporterOpt,
noOverfittingCheckOpt,
overfittingFixOpt,
selectivityTestOpt,
selectivityMinRecallOpt,
selectivityMinPrecisionOpt,
};

command.SetAction(async (parseResult, _) =>
Expand Down Expand Up @@ -98,6 +104,9 @@ public static RootCommand Create()
TestsDir = parseResult.GetValue(testsDirOpt),
OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt),
OverfittingFix = parseResult.GetValue(overfittingFixOpt),
SelectivityTest = parseResult.GetValue(selectivityTestOpt),
SelectivityMinRecall = parseResult.GetValue(selectivityMinRecallOpt),
SelectivityMinPrecision = parseResult.GetValue(selectivityMinPrecisionOpt),
};

return await Run(config);
Expand Down Expand Up @@ -333,6 +342,36 @@ internal static List<string> CheckAggregateDescriptionLimits(IReadOnlyList<Skill
};
}

// Selectivity-only mode: skip full evaluation, just probe skill activation
if (config.SelectivityTest)
{
if (skill.EvalConfig is not null
&& (skill.EvalConfig.ShouldActivatePrompts is { Count: > 0 } || skill.EvalConfig.ShouldNotActivatePrompts is { Count: > 0 }))
{
log("🎯 Running selectivity test (standalone)...");
var selectivityResult = await ExecuteSelectivityTest(skill, config, spinner);
log($"🎯 Selectivity: recall={selectivityResult.Recall:P0}, precision={selectivityResult.Precision:P0} β€” {(selectivityResult.Passed ? "PASSED" : "FAILED")}");

return new SkillVerdict
{
SkillName = skill.Name,
SkillPath = skill.Path,
Passed = selectivityResult.Passed,
Scenarios = [],
OverallImprovementScore = 0,
Reason = selectivityResult.Passed
? "Selectivity test passed"
: $"Selectivity test failed: {selectivityResult.Reason}",
FailureKind = selectivityResult.Passed ? null : "selectivity_failure",
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The selectivity_failure FailureKind is not mentioned in the --verdict-warn-only blocking logic (ValidateCommand.cs lines 234-236). Currently, only missing_eval and spec_conformance_failure are treated as always-blocking failures in warn-only mode. A selectivity test failure will be silently suppressed when --verdict-warn-only is used, even if the developer intends it to be fatal.

Whether this is intentional should be documented or, if selectivity failures should block, "selectivity_failure" should be added alongside the other two blocking failure kinds.

Copilot uses AI. Check for mistakes.
ProfileWarnings = profile.Warnings,
SelectivityResult = selectivityResult,
};
}

log("⏭ Skipping (no selectivity prompts in eval.yaml)");
return null;
}

// Launch overfitting check in parallel with scenario execution
var workDir = Path.GetTempPath();
Task<OverfittingResult?> overfittingTask = Task.FromResult<OverfittingResult?>(null);
Expand Down Expand Up @@ -496,8 +535,8 @@ private static async Task<RunExecutionResult> ExecuteRun(
runLog("running agents...");

var agentTasks = await Task.WhenAll(
AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)),
AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog)));
AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, Log: runLog)),
AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, Log: runLog)));
var baselineMetrics = agentTasks[0];
var withSkillMetrics = agentTasks[1];

Expand Down Expand Up @@ -642,4 +681,75 @@ private static string SanitizeErrorMessage(string? message)
var singleLine = raw.ReplaceLineEndings(" ");
return singleLine.Length > 150 ? singleLine[..150] + "…" : singleLine;
}

private static async Task<SelectivityResult> ExecuteSelectivityTest(SkillInfo skill, ValidatorConfig config, Spinner spinner)
{
var prefix = $"[{skill.Name}/selectivity]";
var log = (string msg) => spinner.Log($"{prefix} {msg}");

// Launch all probes in parallel
var tasks = new List<Task<SelectivityPromptResult>>();

if (skill.EvalConfig!.ShouldActivatePrompts is { } activatePrompts)
{
foreach (var prompt in activatePrompts)
{
log($"Testing should_activate: \"{Truncate(prompt, 60)}\"");
tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: true, config, log));
}
}

if (skill.EvalConfig.ShouldNotActivatePrompts is { } deactivatePrompts)
{
foreach (var prompt in deactivatePrompts)
{
log($"Testing should_not_activate: \"{Truncate(prompt, 60)}\"");
tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: false, config, log));
}
}

var promptResults = (await Task.WhenAll(tasks)).ToList();

// Calculate recall: fraction of should_activate prompts that actually activated
var shouldActivateResults = promptResults.Where(r => r.ExpectedActivation).ToList();
double recall = shouldActivateResults.Count > 0
? (double)shouldActivateResults.Count(r => r.SkillActivated) / shouldActivateResults.Count
: 1.0;

// Calculate precision: fraction of should_not_activate prompts that correctly did NOT activate
var shouldNotActivateResults = promptResults.Where(r => !r.ExpectedActivation).ToList();
double precision = shouldNotActivateResults.Count > 0
? (double)shouldNotActivateResults.Count(r => !r.SkillActivated) / shouldNotActivateResults.Count
: 1.0;

bool passed = recall >= config.SelectivityMinRecall && precision >= config.SelectivityMinPrecision;
var reasons = new List<string>();
if (recall < config.SelectivityMinRecall)
reasons.Add($"Recall {recall:P0} below threshold {config.SelectivityMinRecall:P0}");
if (precision < config.SelectivityMinPrecision)
reasons.Add($"Precision {precision:P0} below threshold {config.SelectivityMinPrecision:P0}");
string reason = passed ? "Selectivity test passed" : string.Join("; ", reasons);

return new SelectivityResult(promptResults, recall, precision, passed, reason);
}

private static async Task<SelectivityPromptResult> ProbeAndLog(
SkillInfo skill, string prompt, bool expectedActivation, ValidatorConfig config, Action<string> log)
{
var activated = await TestSkillActivation(skill, prompt, config);
if (expectedActivation)
log($" β†’ {(activated ? "βœ… activated" : "❌ NOT activated")}: \"{Truncate(prompt, 50)}\"");
else
log($" β†’ {(activated ? "❌ activated (unexpected)" : "βœ… correctly NOT activated")}: \"{Truncate(prompt, 50)}\"");
return new SelectivityPromptResult(prompt, ExpectedActivation: expectedActivation, SkillActivated: activated);
}

private static async Task<bool> TestSkillActivation(SkillInfo skill, string prompt, ValidatorConfig config)
{
var scenario = new EvalScenario(Name: "selectivity-probe", Prompt: prompt, Rubric: [], Timeout: 15);
return await AgentRunner.ProbeSkillActivation(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose));
}

private static string Truncate(string value, int maxLength) =>
value.Length <= maxLength ? value : value[..(maxLength - 1)] + "…";
}
23 changes: 22 additions & 1 deletion eng/skill-validator/src/Models/Models.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ public sealed record EvalScenario(
int? MaxTokens = null,
bool ExpectActivation = true);

public sealed record EvalConfig(IReadOnlyList<EvalScenario> Scenarios);
public sealed record EvalConfig(
IReadOnlyList<EvalScenario> Scenarios,
IReadOnlyList<string>? ShouldActivatePrompts = null,
IReadOnlyList<string>? ShouldNotActivatePrompts = null);

// --- Skill info ---

Expand Down Expand Up @@ -227,6 +230,7 @@ public sealed class SkillVerdict
public IReadOnlyList<string>? ProfileWarnings { get; set; }
public bool SkillNotActivated { get; set; }
public OverfittingResult? OverfittingResult { get; set; }
public SelectivityResult? SelectivityResult { get; set; }
}

// --- Overfitting assessment ---
Expand Down Expand Up @@ -274,6 +278,20 @@ public sealed record OverfittingJudgeOptions(
int Timeout,
string WorkDir);

// --- Selectivity test ---

public sealed record SelectivityPromptResult(
string Prompt,
bool ExpectedActivation,
bool SkillActivated);

public sealed record SelectivityResult(
IReadOnlyList<SelectivityPromptResult> PromptResults,
double Recall,
double Precision,
bool Passed,
string Reason);

// --- Config ---

public sealed record ReporterSpec(ReporterType Type);
Expand Down Expand Up @@ -308,6 +326,9 @@ public sealed record ValidatorConfig
public string? TestsDir { get; init; }
public bool OverfittingCheck { get; init; } = true;
public bool OverfittingFix { get; init; }
public bool SelectivityTest { get; init; }
public double SelectivityMinRecall { get; init; } = 0.8;
public double SelectivityMinPrecision { get; init; } = 0.8;
}

public static class DefaultWeights
Expand Down
71 changes: 71 additions & 0 deletions eng/skill-validator/src/Services/AgentRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,77 @@ public static async Task<RunMetrics> RunAgent(RunOptions options)
return metrics;
}

/// <summary>
/// Lightweight probe that sends a prompt and checks whether the skill is activated.
/// Exits immediately when a SkillInvokedEvent is seen, or waits for the session to
/// complete/timeout. Designed to run many probes in parallel via Task.WhenAll.
/// </summary>
public static async Task<bool> ProbeSkillActivation(RunOptions options)
{
var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
Directory.CreateDirectory(workDir);
_workDirs.Add(workDir);

if (options.Verbose)
{
var write = options.Log ?? (msg => Console.Error.WriteLine(msg));
write($" πŸ“‚ {workDir} (skilled)");
}

bool skillActivated = false;
var done = new TaskCompletionSource<bool>();

try
{
var client = await GetSharedClient(options.Verbose);
await using var session = await client.CreateSessionAsync(
BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers));

// 30s timeout β€” enough for the agent to reach the skill-loading decision
using var cts = new CancellationTokenSource(30_000);
Comment on lines +316 to +317
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In TestSkillActivation, the EvalScenario is created with Timeout: 15 (seconds), but ProbeSkillActivation ignores options.Scenario.Timeout and always uses a hardcoded 30-second CancellationTokenSource (line 317). The Scenario.Timeout value is therefore dead code for probe runs β€” callers have no way to configure the actual timeout.

Either ProbeSkillActivation should respect options.Scenario.Timeout * 1000 to give callers control over the timeout, or the Timeout parameter in the EvalScenario constructor call in TestSkillActivation should be removed/documented as unused for probe runs.

Copilot uses AI. Check for mistakes.
cts.Token.Register(() => done.TrySetResult(skillActivated));

session.On(evt =>
{
switch (evt)
{
// Skill loaded β†’ we have our answer, bail immediately
case SkillInvokedEvent:
skillActivated = true;
done.TrySetResult(true);
Comment on lines +317 to +327
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In ProbeSkillActivation, the bool skillActivated local variable is read by the CancellationToken registration callback (line 318: () => done.TrySetResult(skillActivated)) and written by the session.On event handler callback (line 326: skillActivated = true). These callbacks may run on different threads concurrently, making this a data race β€” C# does not guarantee atomic reads/writes of bool across threads.

If the CancellationToken fires at the same instant as a SkillInvokedEvent arrives, the timeout callback could read a stale false for skillActivated while the event handler writes true, silently returning a false negative.

The fix is to use Volatile.Read/Volatile.Write or an int field with Interlocked.Exchange(ref skillActivated, 1) to make the access atomic and visible across threads.

Copilot uses AI. Check for mistakes.
break;

// Session finished without loading the skill β†’ not activated
case SessionIdleEvent:
done.TrySetResult(skillActivated);
break;

case SessionErrorEvent err:
done.TrySetException(new InvalidOperationException(err.Data.Message ?? "Session error"));
break;
}

if (options.Verbose && evt is SkillInvokedEvent si)
{
var write = options.Log ?? (m => Console.Error.WriteLine(m));
write($" πŸ“˜ Skill invoked: {si.Data.Name}");
}
if (options.Verbose && evt is ToolExecutionStartEvent ts)
{
var write = options.Log ?? (m => Console.Error.WriteLine(m));
write($" πŸ”§ {ts.Data.ToolName}");
}
});

await session.SendAsync(new MessageOptions { Prompt = options.Scenario.Prompt });
return await done.Task;
Comment on lines +310 to +353
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In ProbeSkillActivation, when the done.Task completes early (e.g., a SkillInvokedEvent fires and done.TrySetResult(true) is called before the session finishes), the method returns immediately from return await done.Task. The await using var session block then runs its DisposeAsync while the agent session may still be streaming events, which can lead to unexpected session teardown mid-stream.

This pattern differs from the existing RunAgent method, which always awaits done.Task for a proper session idle/error signal before the session is disposed. For ProbeSkillActivation, since early exit is intentional and desirable, the disposal at that point is intentional β€” however, this should be noted as a potential source of noise (the session teardown may log errors or emit events that appear in verbose output even after the desired result is obtained). Consider documenting or adding a try/catch around the DisposeAsync path if session teardown errors are expected here.

Copilot uses AI. Check for mistakes.
}
catch
{
return skillActivated;
}
}

private static async Task<string> SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath)
{
var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
Expand Down
11 changes: 10 additions & 1 deletion eng/skill-validator/src/Services/EvalSchema.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public static EvalConfig ParseEvalConfig(string yamlContent)
if (scenarios is not { Count: > 0 })
throw new InvalidOperationException("Eval config must have at least one scenario");

return new EvalConfig(scenarios);
return new EvalConfig(scenarios, raw.Selectivity?.ShouldActivate, raw.Selectivity?.ShouldNotActivate);
}

public static (bool Success, EvalConfig? Data, IReadOnlyList<string>? Errors) ValidateEvalConfig(string yamlContent)
Expand Down Expand Up @@ -122,6 +122,15 @@ internal sealed class RawFrontmatter
internal sealed class RawEvalConfig
{
public List<RawScenario>? Scenarios { get; set; }
public RawSelectivity? Selectivity { get; set; }
}

internal sealed class RawSelectivity
{
[YamlMember(Alias = "should_activate")]
public List<string>? ShouldActivate { get; set; }
[YamlMember(Alias = "should_not_activate")]
public List<string>? ShouldNotActivate { get; set; }
}
Comment on lines +128 to 134
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The EvalSchema.ParseEvalConfig is tested in EvalSchemaTests.cs, but there are no tests covering the new selectivity block parsing: parsing valid should_activate/should_not_activate lists, parsing a config that only has one of the two lists, or handling an empty selectivity block. Since EvalSchemaTests.cs already has comprehensive coverage of other schema features, this new schema element should also be tested.

Similarly, ExecuteSelectivityTest and the precision/recall calculation logic in ValidateCommand.cs are not tested. Given the codebase tests similar logic paths (e.g. in ComparatorTests.cs), test coverage of the metric calculation edge cases (e.g., empty should_activate or should_not_activate lists defaulting to 1.0) would be valuable.

Copilot uses AI. Check for mistakes.

internal sealed class RawScenario
Expand Down
51 changes: 39 additions & 12 deletions eng/skill-validator/src/Services/Reporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,22 +62,34 @@ private static void ReportConsole(IReadOnlyList<SkillVerdict> verdicts, bool ver
{
var icon = verdict.Passed ? "\x1b[32mβœ“\x1b[0m" : "\x1b[31mβœ—\x1b[0m";
var name = $"\x1b[1m{verdict.SkillName}\x1b[0m";
var score = FormatScore(verdict.OverallImprovementScore);

var scoreLine = $"{icon} {name} {score}";
if (verdict.ConfidenceInterval is { } ci)
// Selectivity-only verdict: no scenarios or score to display
bool isSelectivityOnly = verdict.Scenarios.Count == 0 && verdict.SelectivityResult is not null;

if (isSelectivityOnly)
{
var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]";
var sigStr = verdict.IsSignificant == true
? "\x1b[32msignificant\x1b[0m"
: "\x1b[33mnot significant\x1b[0m";
scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}";
Console.WriteLine($"{icon} {name} \x1b[2m(selectivity only)\x1b[0m");
Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m");
}
if (verdict.NormalizedGain is { } ng)
scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m";
else
{
var score = FormatScore(verdict.OverallImprovementScore);

var scoreLine = $"{icon} {name} {score}";
if (verdict.ConfidenceInterval is { } ci)
{
var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]";
var sigStr = verdict.IsSignificant == true
? "\x1b[32msignificant\x1b[0m"
: "\x1b[33mnot significant\x1b[0m";
scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}";
}
if (verdict.NormalizedGain is { } ng)
scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m";

Console.WriteLine(scoreLine);
Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m");
Console.WriteLine(scoreLine);
Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m");
}

if (!verdict.Passed && verdict.ProfileWarnings is { Count: > 0 })
{
Expand Down Expand Up @@ -132,6 +144,21 @@ private static void ReportConsole(IReadOnlyList<SkillVerdict> verdicts, bool ver
Console.WriteLine($" \x1b[2mβ€’\x1b[0m [{item.Classification}] \x1b[2m{item.AssertionSummary}\x1b[0m\n \x1b[2mβ€” {item.Reasoning}\x1b[0m");
}
}
if (verdict.SelectivityResult is { } selResult)
{
Console.WriteLine();
var selIcon = selResult.Passed ? "βœ…" : "πŸ”΄";
Console.WriteLine($" 🎯 Selectivity: recall={selResult.Recall:P0}, precision={selResult.Precision:P0} {selIcon}");
foreach (var pr in selResult.PromptResults)
{
var expected = pr.ExpectedActivation ? "should activate" : "should NOT activate";
var correct = (pr.ExpectedActivation == pr.SkillActivated);
var prIcon = correct ? "\x1b[32mβœ“\x1b[0m" : "\x1b[31mβœ—\x1b[0m";
var activatedStr = pr.SkillActivated ? "activated" : "not activated";
var prompt = pr.Prompt.Length > 60 ? pr.Prompt[..59] + "…" : pr.Prompt;
Console.WriteLine($" {prIcon} \x1b[2m\"{prompt}\" β€” {expected} β†’ {activatedStr}\x1b[0m");
}
}
if (verdict.Scenarios.Count > 0)
{
Console.WriteLine();
Expand Down
Loading
Loading