import math

import torch
import torch.nn.functional as F


def _mse_per_sample(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Reproduce the per-sample MSE used in calculate_dpo_loss."""
    return (pred - target).pow(2).mean(dim=list(range(1, pred.ndim)))

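
# Equivalence sketch (hypothetical helper, not used by the tests below): the same
# per-sample reduction can be written by flattening every non-batch dim first and
# averaging once; both forms return a [B]-shaped tensor.
def _mse_per_sample_flat(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    return (pred - target).pow(2).flatten(start_dim=1).mean(dim=1)
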

def _dpo_loss(
    policy_chosen_logp: torch.Tensor,
    policy_rejected_logp: torch.Tensor,
    ref_chosen_logp: torch.Tensor,
    ref_rejected_logp: torch.Tensor,
    beta: float,
    label_smoothing: float = 0.0,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Reproduce the DPO loss computation from BaseModelSetup.calculate_dpo_loss."""
    chosen_ratio = policy_chosen_logp - ref_chosen_logp
    rejected_ratio = policy_rejected_logp - ref_rejected_logp
    logits = beta * (chosen_ratio - rejected_ratio)
    dpo_loss = -F.logsigmoid(logits).mean()

    if label_smoothing > 0:
        smooth_loss = -F.logsigmoid(-logits).mean()
        loss = (1 - label_smoothing) * dpo_loss + label_smoothing * smooth_loss
    else:
        loss = dpo_loss

    chosen_reward = chosen_ratio.detach().mean()
    rejected_reward = rejected_ratio.detach().mean()
    accuracy = (chosen_ratio > rejected_ratio).float().mean()

    return loss, dpo_loss, chosen_reward, rejected_reward, accuracy

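
# Closed-form sanity sketch (assumption: not part of the original suite). Because
# -logsigmoid(x) == softplus(-x), a batch in which every sample shares the same
# margin m = chosen_ratio - rejected_ratio has loss == softplus(-beta * m); this
# hypothetical helper encodes that identity for spot-checking the tests by hand.
def _dpo_loss_closed_form(margin: float, beta: float) -> float:
    return F.softplus(torch.tensor(-beta * margin)).item()
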

class TestDPOLossMath:
    def test_beta_zero_gives_log2(self):
        """At beta=0, logits collapse to 0 and -log(sigmoid(0)) = log(2)."""
        B = 4
        policy_chosen_logp = torch.randn(B)
        policy_rejected_logp = torch.randn(B)
        ref_chosen_logp = torch.randn(B)
        ref_rejected_logp = torch.randn(B)

        loss, _, _, _, _ = _dpo_loss(
            policy_chosen_logp, policy_rejected_logp,
            ref_chosen_logp, ref_rejected_logp,
            beta=0.0,
        )
        assert abs(loss.item() - math.log(2)) < 1e-6

    def test_perfect_preference_gives_low_loss(self):
        """When the policy strongly prefers chosen over rejected, loss should be low."""
        B = 4
        policy_chosen_logp = torch.tensor([0.0] * B)
        policy_rejected_logp = torch.tensor([-10.0] * B)
        ref_chosen_logp = torch.tensor([-5.0] * B)
        ref_rejected_logp = torch.tensor([-5.0] * B)

        loss, _, chosen_reward, rejected_reward, accuracy = _dpo_loss(
            policy_chosen_logp, policy_rejected_logp,
            ref_chosen_logp, ref_rejected_logp,
            beta=5000.0,
        )
        assert accuracy.item() == 1.0
        assert chosen_reward.item() > rejected_reward.item()
        assert loss.item() < 0.01

    def test_inverted_preference_gives_high_loss(self):
        """When the policy prefers rejected over chosen, loss should be high."""
        B = 4
        policy_chosen_logp = torch.tensor([-10.0] * B)
        policy_rejected_logp = torch.tensor([0.0] * B)
        ref_chosen_logp = torch.tensor([-5.0] * B)
        ref_rejected_logp = torch.tensor([-5.0] * B)

        loss, _, _, _, accuracy = _dpo_loss(
            policy_chosen_logp, policy_rejected_logp,
            ref_chosen_logp, ref_rejected_logp,
            beta=5000.0,
        )
        assert accuracy.item() == 0.0
        assert loss.item() > 10.0

    def test_label_smoothing_reduces_extreme_loss(self):
        """Label smoothing mixes in the opposite-direction term, so a confidently
        correct batch no longer achieves near-zero loss."""
        B = 4
        policy_chosen_logp = torch.tensor([0.0] * B)
        policy_rejected_logp = torch.tensor([-10.0] * B)
        ref_chosen_logp = torch.tensor([-5.0] * B)
        ref_rejected_logp = torch.tensor([-5.0] * B)

        loss_no_smooth, _, _, _, _ = _dpo_loss(
            policy_chosen_logp, policy_rejected_logp,
            ref_chosen_logp, ref_rejected_logp,
            beta=5000.0, label_smoothing=0.0,
        )
        loss_smooth, _, _, _, _ = _dpo_loss(
            policy_chosen_logp, policy_rejected_logp,
            ref_chosen_logp, ref_rejected_logp,
            beta=5000.0, label_smoothing=0.1,
        )
        assert loss_smooth.item() > loss_no_smooth.item()

    def test_mse_per_sample_reduces_correctly(self):
        """MSE reduction should produce [B] shape from [B, C, H, W]."""
        B, C, H, W = 2, 4, 8, 8
        pred = torch.randn(B, C, H, W)
        target = torch.randn(B, C, H, W)
        result = _mse_per_sample(pred, target)
        assert result.shape == (B,)
        expected_0 = (pred[0] - target[0]).pow(2).mean()
        assert abs(result[0].item() - expected_0.item()) < 1e-6