Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions modules/modelLoader/mixin/LoRALoaderMixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,48 @@ def __init__(self):
def _get_convert_key_sets(self, model: BaseModel) -> list[LoraConversionKeySet] | None:
pass

@staticmethod
def scale_lora_state_dict(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the purpose of applying a scale to the loaded LoRA?

  • I can't think of many situations this could be useful
  • if it is useful, you could achieve the same by converting a LoRA to lower weights (for example through a simple Comfy workflow). Should this be in OneTrainer?
  • I don't see the relation to distillation. If I'm wrong about my earlier two points, I still think it should be a separate PR

state_dict: dict,
te_scale: float = 1.0,
unet_scale: float = 1.0,
) -> dict:
"""
Scales LoRA weights for Text Encoder and main component (UNet/Transformer) separately.

Args:
state_dict: The LoRA state dict to scale
te_scale: Scale factor for Text Encoder LoRA weights (default 1.0, applies to lora_te*)
unet_scale: Scale factor for main component LoRA weights (default 1.0, applies to everything else)

Returns:
The scaled state dict
"""
scaled_dict = {}

weight_suffixes = (
".weight",
"hada_w1_a",
"hada_w1_b",
"hada_w2_a",
"hada_w2_b",
"lokr_w1",
"lokr_w2",
"lokr_t1",
"lokr_t2",
)

for key, value in state_dict.items():
is_weight = isinstance(value, torch.Tensor) and key.endswith(weight_suffixes)
if key.startswith("lora_te"):
# Text Encoder LoRA (matches lora_te, lora_te1, lora_te2, etc.)
scaled_dict[key] = value * te_scale if is_weight else value
else:
# Other components: unet, transformer, prior, decoder, etc.
scaled_dict[key] = value * unet_scale if is_weight else value

return scaled_dict

def __load_safetensors(
self,
model: BaseModel,
Expand Down
61 changes: 60 additions & 1 deletion modules/modelSetup/BaseModelSetup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
from contextlib import contextmanager

from modules.model.BaseModel import BaseModel
from modules.module.ParentModelWrapper import ParentModelWrapper
from modules.util.config.TrainConfig import TrainConfig, TrainEmbeddingConfig, TrainModelPartConfig
from modules.util.enum.TrainingMethod import TrainingMethod
from modules.util.ModuleFilter import ModuleFilter
from modules.util.NamedParameterGroup import NamedParameterGroup, NamedParameterGroupCollection
from modules.util.TimedActionMixin import TimedActionMixin
from modules.util.torch_util import torch_gc
from modules.util.TrainProgress import TrainProgress

import torch
Expand Down Expand Up @@ -73,6 +75,7 @@ def predict(
train_progress: TrainProgress,
*,
deterministic: bool = False,
generate_distillation_empty: bool = False,
) -> dict:
pass

Expand Down Expand Up @@ -177,6 +180,62 @@ def prior_model(self, model: BaseModel, config: TrainConfig):
for adapter in model.adapters():
adapter.hook_to_module()

@contextmanager
def distillation_parent_model(
    self,
    model: BaseModel,
    config: TrainConfig,
    parent_wrapper: ParentModelWrapper | None,
):
    """
    Context manager that prepares a parent (teacher) model for distillation.

    When parent_wrapper is given and distillation is enabled, the parent
    model is loaded on demand and moved to the train device for the
    duration of the context. If config.distillation.keep_parent_on_cpu is
    set, the student is parked on the temp device first so the two models
    are never resident on the GPU simultaneously, and both are swapped back
    on exit; otherwise the parent remains on the train device afterwards.
    Without a usable parent this degrades to prior_model(), which unhooks
    the LoRA adapters and yields the student itself.

    Args:
        model: Student model being trained.
        config: Training configuration.
        parent_wrapper: Optional wrapper containing the parent model.

    Yields:
        The parent model when available, otherwise the student model with
        its adapters unhooked.
    """
    if parent_wrapper is None or not config.distillation.enabled:
        # No external parent: behave exactly like prior_model().
        with self.prior_model(model, config):
            yield model
        return

    if not parent_wrapper.is_loaded():
        parent_wrapper.load_parent_model()

    # Peak-VRAM optimization: move the student off the train device before
    # the parent is brought on, and remember that we did so.
    student_swapped = False
    if config.distillation.keep_parent_on_cpu:
        model.to(self.temp_device)
        student_swapped = True
        torch_gc()

    parent_wrapper.to_device(self.train_device)

    try:
        yield parent_wrapper.parent_model
    finally:
        # Return the parent to the temp device (CPU) when it should not
        # stay resident on the GPU between steps.
        if config.distillation.keep_parent_on_cpu:
            parent_wrapper.to_device(self.temp_device)
            torch_gc()

        # Bring the student back onto the train device if we moved it.
        if student_swapped:
            model.to(self.train_device)
            torch_gc()

def _create_model_part_parameters(
self,
parameter_group_collection: NamedParameterGroupCollection,
Expand All @@ -186,7 +245,7 @@ def _create_model_part_parameters(
freeze: list[ModuleFilter] | None = None,
debug: bool = False,
):
if not config.train:
if not config.train or model is None:
return

if freeze is not None and len(freeze) > 0:
Expand Down
44 changes: 44 additions & 0 deletions modules/modelSetup/BaseStableDiffusionXLSetup.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def predict(
train_progress: TrainProgress,
*,
deterministic: bool = False,
generate_distillation_empty: bool = False,
) -> dict:
with model.autocast_context:
batch_seed = 0 if deterministic else train_progress.global_step * multi.world_size() + multi.rank()
Expand Down Expand Up @@ -370,6 +371,49 @@ def predict(
)

model_output_data['prediction_type'] = model.noise_scheduler.config.prediction_type

# For CFG_DISTILL: Generate empty prompt prediction
if generate_distillation_empty and config.distillation.enabled \
and config.distillation.target_mode.value == 'CFG_DISTILL':
with torch.no_grad():
# Create empty text embeddings (as unconditional guidance)
empty_text_encoder_output, empty_pooled_text_encoder_2_output = model.combine_text_encoder_output(
*model.encode_text(
train_device=self.train_device,
batch_size=batch['latent_image'].shape[0],
rand=rand,
text="",
tokens_1=None,
tokens_2=None,
text_encoder_1_layer_skip=config.text_encoder_layer_skip,
text_encoder_2_layer_skip=config.text_encoder_2_layer_skip,
text_encoder_1_output=None,
text_encoder_2_output=None,
pooled_text_encoder_2_output=None,
text_encoder_1_dropout_probability=0.0,
text_encoder_2_dropout_probability=0.0,
)
)

# Create latent input (same structure, but with empty conditioning)
if config.model_type.has_mask_input() and config.model_type.has_conditioning_image_input():
empty_latent_input = torch.concat(
[scaled_noisy_latent_image, batch['latent_mask'], scaled_latent_conditioning_image], 1
)
else:
empty_latent_input = scaled_noisy_latent_image

# Run UNet with empty conditioning
empty_added_cond_kwargs = {"text_embeds": empty_pooled_text_encoder_2_output, "time_ids": add_time_ids}
predicted_latent_noise_empty = model.unet(
sample=empty_latent_input.to(dtype=model.train_dtype.torch_dtype()),
timestep=timestep,
encoder_hidden_states=empty_text_encoder_output.to(dtype=model.train_dtype.torch_dtype()),
added_cond_kwargs=empty_added_cond_kwargs,
).sample

model_output_data['predicted_empty'] = predicted_latent_noise_empty

return model_output_data

def calculate_loss(
Expand Down
10 changes: 9 additions & 1 deletion modules/modelSetup/StableDiffusionLoRASetup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from modules.model.StableDiffusionModel import StableDiffusionModel
from modules.modelSetup.BaseModelSetup import BaseModelSetup
from modules.modelSetup.BaseStableDiffusionSetup import BaseStableDiffusionSetup
from modules.modelLoader.mixin.LoRALoaderMixin import LoRALoaderMixin
from modules.module.LoRAModule import LoRAModuleWrapper
from modules.util import factory
Comment on lines +4 to 6
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After removing the state_dict_has_prefix(...) logic, the state_dict_has_prefix import in this file is now unused and will trigger Ruff F401. Remove that import (or reintroduce usage if still intended).

Copilot uses AI. Check for mistakes.
from modules.util.config.TrainConfig import TrainConfig
Expand Down Expand Up @@ -69,7 +70,7 @@ def setup_model(
if config.train_any_embedding():
model.text_encoder.get_input_embeddings().to(dtype=config.embedding_weight_dtype.torch_dtype())

create_te = config.text_encoder.train or state_dict_has_prefix(model.lora_state_dict, "lora_te")
create_te = config.text_encoder.train
model.text_encoder_lora = LoRAModuleWrapper(
model.text_encoder, "lora_te", config
) if create_te else None
Expand All @@ -79,6 +80,13 @@ def setup_model(
)

if model.lora_state_dict:
# Apply scaling factors to LoRA weights before loading
model.lora_state_dict = LoRALoaderMixin.scale_lora_state_dict(
model.lora_state_dict,
te_scale=config.lora_te_scale,
unet_scale=config.lora_unet_scale,
)

if create_te:
model.text_encoder_lora.load_state_dict(model.lora_state_dict)
model.unet_lora.load_state_dict(model.lora_state_dict)
Expand Down
12 changes: 10 additions & 2 deletions modules/modelSetup/StableDiffusionXLLoRASetup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from modules.model.StableDiffusionXLModel import StableDiffusionXLModel
from modules.modelSetup.BaseModelSetup import BaseModelSetup
from modules.modelSetup.BaseStableDiffusionXLSetup import BaseStableDiffusionXLSetup
from modules.modelLoader.mixin.LoRALoaderMixin import LoRALoaderMixin
from modules.module.LoRAModule import LoRAModuleWrapper
from modules.util import factory
Comment on lines 3 to 6
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After removing the state_dict_has_prefix(...) logic, the state_dict_has_prefix import in this file is now unused and will trigger Ruff F401. Remove that import (or reintroduce usage if still intended).

Copilot uses AI. Check for mistakes.
from modules.util.config.TrainConfig import TrainConfig
Expand Down Expand Up @@ -76,8 +77,8 @@ def setup_model(
model: StableDiffusionXLModel,
config: TrainConfig,
):
create_te1 = config.text_encoder.train or state_dict_has_prefix(model.lora_state_dict, "lora_te1")
create_te2 = config.text_encoder_2.train or state_dict_has_prefix(model.lora_state_dict, "lora_te2")
create_te1 = config.text_encoder.train
create_te2 = config.text_encoder_2.train

model.text_encoder_1_lora = LoRAModuleWrapper(
model.text_encoder_1, "lora_te1", config
Expand All @@ -92,6 +93,13 @@ def setup_model(
)

if model.lora_state_dict:
# Apply scaling factors to LoRA weights before loading
model.lora_state_dict = LoRALoaderMixin.scale_lora_state_dict(
model.lora_state_dict,
te_scale=config.lora_te_scale,
unet_scale=config.lora_unet_scale,
)

if create_te1:
model.text_encoder_1_lora.load_state_dict(model.lora_state_dict)
if create_te2:
Expand Down
17 changes: 16 additions & 1 deletion modules/modelSetup/mixin/ModelSetupDiffusionLossMixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from modules.util.config.TrainConfig import TrainConfig
from modules.util.DiffusionScheduleCoefficients import DiffusionScheduleCoefficients
from modules.util.enum.LossWeight import LossWeight
from modules.util.loss.masked_loss import masked_losses, masked_losses_with_prior
from modules.util.loss.masked_loss import masked_losses, masked_losses_with_prior, distillation_loss
from modules.util.loss.vb_loss import vb_losses

import torch
Expand Down Expand Up @@ -134,6 +134,21 @@ def __masked_losses(
normalize_masked_area_loss=config.normalize_masked_area_loss,
).mean(mean_dim) * config.vb_loss_strength

# Distillation loss
if config.distillation.enabled and 'prior_target' in data and 'distillation_indices' in data:
distillation_indices = data['distillation_indices']
if len(distillation_indices) > 0:
# Calculate distillation loss only for samples marked as DISTILLATION
dist_loss = distillation_loss(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think distillation needs a new loss function (except maybe: KL loss - see my other comment below)
distillation is a modification of the training target, not of the loss function per se.

We already modify the training target without modifying the loss function for PRIOR_PREDICTION here:

model_output_data['target'][prior_pred_indices] = prior_model_prediction[prior_pred_indices]

or in my transfer training PR here:
https://github.com/dxqb/OneTrainer/blob/38708bdfb31c608c51d73faec4f761f6fb6b037b/modules/trainer/GenericTrainer.py#L772

it is not necessary for these, and I don't think for your PR, to modify the loss function. it might look like this is necessary because there is some prior prediction code in the loss function handling. However, this is for masked prior prediction which is an entirely different functionality.

student_prediction=data['predicted'][distillation_indices].to(dtype=torch.float32),
parent_prediction=data['prior_target'][distillation_indices].to(dtype=torch.float32),
loss_type=config.distillation.loss_type,
temperature=config.distillation.kl_temperature,
mask=batch['latent_mask'][distillation_indices].to(dtype=torch.float32) if config.masked_training else None,
reduction='mean',
)
losses += dist_loss * config.distillation.loss_weight

Comment on lines +137 to +151
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Distillation loss is only added inside __masked_losses(). When config.masked_training is false, the training loop uses __unmasked_losses() and distillation has no effect. Distillation should be applied in both masked and unmasked loss paths (with masking applied conditionally).

Copilot uses AI. Check for mistakes.
return losses

def __unmasked_losses(
Expand Down
Loading