dotnet · steveisok · Mar 30, 2026 · Copilot · Mar 30, 2026
@@ -221,11 +221,12 @@ Several scenario-level options in `eval.yaml` are relevant when diagnosing failu
 - `overallJudgmentImprovement` is -0.4 even though quality scores are similar
 - Pairwise judge is inconsistent between position-swapped runs
 
-**Cause:** When outputs are nearly equal, the judge's position bias can dominate. The position-swap mitigation defaults to "tie" on inconsistency, but the weighted scoring still penalizes.
+**Cause:** When outputs are nearly equal, the judge's position bias can dominate. The position-swap mitigation reports a "tie" whenever the two position-swapped runs disagree. When this happens, the Comparator falls back to rubric-based quality scores instead of using the (zeroed-out) pairwise scores.
 
 **Fixes:**
 - This is usually noise — re-run the eval to see if it persists
 - If it consistently happens, improve the skill to produce clearly differentiated output
+- Check `pairwiseResult.positionSwapConsistent` — when `false`, the quality/overall scores come from rubric scoring, not from the pairwise comparison
 
 ### 8. Baseline already good (no headroom)
 

@@ -21,8 +21,11 @@ public static ScenarioComparison CompareScenario(
             OverallJudgmentImprovement: NormalizeScoreImprovement(baseline.JudgeResult.OverallScore, withSkill.JudgeResult.OverallScore),
             ErrorReduction: ComputeReduction(baseline.Metrics.ErrorCount, withSkill.Metrics.ErrorCount));
 
-        // Override quality scores with pairwise results when available
-        if (pairwiseResult is not null)
+        // Override quality scores with pairwise results when consistent.
+        // When position-swap-inconsistent, keep the rubric-based scores —
+        // zeroing out most of the weighted score on a noisy comparison is worse than
+        // using the (noisier but non-zero) rubric-based quality signal.
+        if (pairwiseResult is not null && pairwiseResult.PositionSwapConsistent)
         {
             var pairwiseScores = PairwiseJudge.PairwiseToQualityScore(pairwiseResult);
             breakdown = breakdown with

@@ -279,4 +279,29 @@ [new PairwiseRubricResult("Q", "skill", PairwiseMagnitude.MuchBetter, "")],
         Assert.Equal(1.0, withPairwise.Breakdown.OverallJudgmentImprovement);
         Assert.Equal(pairwise, withPairwise.PairwiseResult);
     }
+
+    [Fact]
+    public void KeepsRubricScoresWhenPairwiseIsInconsistent()
+    {
+        var baseline = MakeRunResult(overallScore: 2, rubricScores: [new RubricScore("Q", 2, "")]);
+        var withSkill = MakeRunResult(overallScore: 4, rubricScores: [new RubricScore("Q", 4, "")]);
+
+        // Reference run: no pairwise result is passed; both improvements are expected to be positive (scores go 2 -> 4) and are captured for the comparison below
+        var noPairwise = Comparator.CompareScenario("test", baseline, withSkill);
+        Assert.True(noPairwise.Breakdown.QualityImprovement > 0);
+        Assert.True(noPairwise.Breakdown.OverallJudgmentImprovement > 0);
+        double rubricQuality = noPairwise.Breakdown.QualityImprovement;
+        double rubricOverall = noPairwise.Breakdown.OverallJudgmentImprovement;
+
+        // Same inputs plus a pairwise "tie" flagged PositionSwapConsistent: false; the breakdown must equal the reference values rather than being zeroed out
+        var inconsistentPairwise = new PairwiseJudgeResult(
+            [new PairwiseRubricResult("Q", "tie", PairwiseMagnitude.Equal, "inconsistent")],
+            "tie",
+            PairwiseMagnitude.Equal,
+            "Position-swap inconsistent",
+            PositionSwapConsistent: false);
+        var withInconsistent = Comparator.CompareScenario("test", baseline, withSkill, inconsistentPairwise);
+        Assert.Equal(rubricQuality, withInconsistent.Breakdown.QualityImprovement);
+        Assert.Equal(rubricOverall, withInconsistent.Breakdown.OverallJudgmentImprovement);
+    }
 }