diff --git a/eng/skill-validator/InvestigatingResults.md b/eng/skill-validator/InvestigatingResults.md index 26e2c0f497..451c8f9489 100644 --- a/eng/skill-validator/InvestigatingResults.md +++ b/eng/skill-validator/InvestigatingResults.md @@ -221,11 +221,12 @@ Several scenario-level options in `eval.yaml` are relevant when diagnosing failu - `overallJudgmentImprovement` is -0.4 even though quality scores are similar - Pairwise judge is inconsistent between position-swapped runs -**Cause:** When outputs are nearly equal, the judge's position bias can dominate. The position-swap mitigation defaults to "tie" on inconsistency, but the weighted scoring still penalizes. +**Cause:** When outputs are nearly equal, the judge's position bias can dominate. The position-swap mitigation defaults to "tie" on inconsistency. When this happens, the Comparator falls back to rubric-based quality scores instead of using the (zeroed-out) pairwise scores. **Fixes:** - This is usually noise — re-run the eval to see if it persists - If it consistently happens, improve the skill to produce clearly differentiated output +- Check `pairwiseResult.positionSwapConsistent` — when `false`, the quality/overall scores come from rubric scoring, not pairwise ### 8. Baseline already good (no headroom) diff --git a/eng/skill-validator/src/Evaluate/Comparator.cs b/eng/skill-validator/src/Evaluate/Comparator.cs index 1707ae6257..cd52f2a7b4 100644 --- a/eng/skill-validator/src/Evaluate/Comparator.cs +++ b/eng/skill-validator/src/Evaluate/Comparator.cs @@ -21,8 +21,11 @@ public static ScenarioComparison CompareScenario( OverallJudgmentImprovement: NormalizeScoreImprovement(baseline.JudgeResult.OverallScore, withSkill.JudgeResult.OverallScore), ErrorReduction: ComputeReduction(baseline.Metrics.ErrorCount, withSkill.Metrics.ErrorCount)); - // Override quality scores with pairwise results when available - if (pairwiseResult is not null) + // Override quality scores with pairwise results when consistent. + // When position-swap-inconsistent, keep the rubric-based scores — + // zeroing out 70% of the score on a noisy comparison is worse than + // using the (noisier but non-zero) rubric-based quality signal. + if (pairwiseResult is not null && pairwiseResult.PositionSwapConsistent) { var pairwiseScores = PairwiseJudge.PairwiseToQualityScore(pairwiseResult); breakdown = breakdown with diff --git a/eng/skill-validator/tests/Evaluate/ComparatorTests.cs b/eng/skill-validator/tests/Evaluate/ComparatorTests.cs index e44d0c2257..f4878af19e 100644 --- a/eng/skill-validator/tests/Evaluate/ComparatorTests.cs +++ b/eng/skill-validator/tests/Evaluate/ComparatorTests.cs @@ -279,4 +279,29 @@ [new PairwiseRubricResult("Q", "skill", PairwiseMagnitude.MuchBetter, "")], Assert.Equal(1.0, withPairwise.Breakdown.OverallJudgmentImprovement); Assert.Equal(pairwise, withPairwise.PairwiseResult); } + + [Fact] + public void KeepsRubricScoresWhenPairwiseIsInconsistent() + { + var baseline = MakeRunResult(overallScore: 2, rubricScores: [new RubricScore("Q", 2, "")]); + var withSkill = MakeRunResult(overallScore: 4, rubricScores: [new RubricScore("Q", 4, "")]); + + // Without pairwise, rubric-based quality should show improvement + var noPairwise = Comparator.CompareScenario("test", baseline, withSkill); + Assert.True(noPairwise.Breakdown.QualityImprovement > 0); + Assert.True(noPairwise.Breakdown.OverallJudgmentImprovement > 0); + double rubricQuality = noPairwise.Breakdown.QualityImprovement; + double rubricOverall = noPairwise.Breakdown.OverallJudgmentImprovement; + + // With position-swap-inconsistent pairwise, should keep rubric scores (not zero them) + var inconsistentPairwise = new PairwiseJudgeResult( + [new PairwiseRubricResult("Q", "tie", PairwiseMagnitude.Equal, "inconsistent")], + "tie", + PairwiseMagnitude.Equal, + "Position-swap inconsistent", + PositionSwapConsistent: false); + var withInconsistent = Comparator.CompareScenario("test", baseline, withSkill, inconsistentPairwise); + Assert.Equal(rubricQuality, withInconsistent.Breakdown.QualityImprovement); + Assert.Equal(rubricOverall, withInconsistent.Breakdown.OverallJudgmentImprovement); + } }