From 1bfb3cddb10610d979dd5d25a4fdd9d944c149db Mon Sep 17 00:00:00 2001 From: Steve Pfister Date: Mon, 30 Mar 2026 08:37:38 -0400 Subject: [PATCH] Don't zero quality scores when pairwise judge is position-swap-inconsistent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the pairwise judge disagrees with itself across position swaps, MergeInconsistentResults defaults everything to 'tie', which PairwiseToQualityScore converts to 0. This zeroes out 70% of the weighted score (Quality 40% + Overall 30%), leaving only the efficiency penalties to determine the result. For action-oriented skills that do real work (download symbols, run tools), this means quality improvements measured by rubric scoring get discarded, and the overhead of doing valuable work pushes the score negative. Fix: only override rubric-based scores when pairwise is consistent. When inconsistent, keep the rubric-based quality signal — it's noisier but non-zero. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- eng/skill-validator/InvestigatingResults.md | 3 ++- .../src/Evaluate/Comparator.cs | 7 ++++-- .../tests/Evaluate/ComparatorTests.cs | 25 +++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/eng/skill-validator/InvestigatingResults.md b/eng/skill-validator/InvestigatingResults.md index 26e2c0f497..451c8f9489 100644 --- a/eng/skill-validator/InvestigatingResults.md +++ b/eng/skill-validator/InvestigatingResults.md @@ -221,11 +221,12 @@ Several scenario-level options in `eval.yaml` are relevant when diagnosing failu - `overallJudgmentImprovement` is -0.4 even though quality scores are similar - Pairwise judge is inconsistent between position-swapped runs -**Cause:** When outputs are nearly equal, the judge's position bias can dominate. The position-swap mitigation defaults to "tie" on inconsistency, but the weighted scoring still penalizes. 
+**Cause:** When outputs are nearly equal, the judge's position bias can dominate. The position-swap mitigation defaults to "tie" on inconsistency. When this happens, the Comparator falls back to the rubric-based quality and overall scores instead of using the (zeroed-out) pairwise scores. **Fixes:** - This is usually noise — re-run the eval to see if it persists - If it consistently happens, improve the skill to produce clearly differentiated output +- Check `pairwiseResult.positionSwapConsistent` — when `false`, the quality/overall scores come from rubric scoring, not pairwise ### 8. Baseline already good (no headroom) diff --git a/eng/skill-validator/src/Evaluate/Comparator.cs b/eng/skill-validator/src/Evaluate/Comparator.cs index 1707ae6257..cd52f2a7b4 100644 --- a/eng/skill-validator/src/Evaluate/Comparator.cs +++ b/eng/skill-validator/src/Evaluate/Comparator.cs @@ -21,8 +21,11 @@ public static ScenarioComparison CompareScenario( OverallJudgmentImprovement: NormalizeScoreImprovement(baseline.JudgeResult.OverallScore, withSkill.JudgeResult.OverallScore), ErrorReduction: ComputeReduction(baseline.Metrics.ErrorCount, withSkill.Metrics.ErrorCount)); - // Override quality scores with pairwise results when available - if (pairwiseResult is not null) + // Override quality scores with pairwise results when consistent. + // When position-swap-inconsistent, keep the rubric-based scores — + // zeroing out 70% of the weighted score on a noisy comparison is worse than + // using the (noisier but non-zero) rubric-based quality signal. 
+ if (pairwiseResult is not null && pairwiseResult.PositionSwapConsistent) { var pairwiseScores = PairwiseJudge.PairwiseToQualityScore(pairwiseResult); breakdown = breakdown with diff --git a/eng/skill-validator/tests/Evaluate/ComparatorTests.cs b/eng/skill-validator/tests/Evaluate/ComparatorTests.cs index e44d0c2257..f4878af19e 100644 --- a/eng/skill-validator/tests/Evaluate/ComparatorTests.cs +++ b/eng/skill-validator/tests/Evaluate/ComparatorTests.cs @@ -279,4 +279,29 @@ [new PairwiseRubricResult("Q", "skill", PairwiseMagnitude.MuchBetter, "")], Assert.Equal(1.0, withPairwise.Breakdown.OverallJudgmentImprovement); Assert.Equal(pairwise, withPairwise.PairwiseResult); } + + [Fact] + public void KeepsRubricScoresWhenPairwiseIsInconsistent() + { + var baseline = MakeRunResult(overallScore: 2, rubricScores: [new RubricScore("Q", 2, "")]); + var withSkill = MakeRunResult(overallScore: 4, rubricScores: [new RubricScore("Q", 4, "")]); + + // Without pairwise, rubric-based quality should show improvement + var noPairwise = Comparator.CompareScenario("test", baseline, withSkill); + Assert.True(noPairwise.Breakdown.QualityImprovement > 0); + Assert.True(noPairwise.Breakdown.OverallJudgmentImprovement > 0); + double rubricQuality = noPairwise.Breakdown.QualityImprovement; + double rubricOverall = noPairwise.Breakdown.OverallJudgmentImprovement; + + // With position-swap-inconsistent pairwise, should keep rubric scores (not zero them) + var inconsistentPairwise = new PairwiseJudgeResult( + [new PairwiseRubricResult("Q", "tie", PairwiseMagnitude.Equal, "inconsistent")], + "tie", + PairwiseMagnitude.Equal, + "Position-swap inconsistent", + PositionSwapConsistent: false); + var withInconsistent = Comparator.CompareScenario("test", baseline, withSkill, inconsistentPairwise); + Assert.Equal(rubricQuality, withInconsistent.Breakdown.QualityImprovement); + Assert.Equal(rubricOverall, withInconsistent.Breakdown.OverallJudgmentImprovement); + } }