
Commit 7dd10ba

Added replace_fused_blocks() (#23)
* Added replace_fused_blocks(), which replaces fused MLP blocks in the modeling module with unfused counterparts prior to model loading, and bypasses the transformers conversion_mapping.
* Added LinearQwen2MoeSparseMoeBlock.
* Added LinearMixtralSparseMoeBlock.
* Added LinearQwen3NextSparseMoeBlock.
* Added LinearQwen3OmniMoeThinkerTextSparseMoeBlock.
* Added apply_model_class_patches().
* Added LinearGlm4MoeMoE.
* Updated version to 0.0.11.

Signed-off-by: ZX-ModelCloud <zx@modelcloud.ai>
1 parent 6317736 commit 7dd10ba

13 files changed

Lines changed: 688 additions & 38 deletions
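For orientation, a minimal usage sketch of the new load-time path (the call site and checkpoint path are illustrative, not part of this commit; replace_fused_blocks() and convert_model() are the exported entrypoints):

    # Hypothetical usage: swap modeling classes before from_pretrained() so
    # checkpoint weights load directly into the unfused blocks.
    from defuser import replace_fused_blocks
    from transformers import AutoModelForCausalLM

    replace_fused_blocks("qwen3_moe")  # e.g. Qwen3MoeSparseMoeBlock -> LinearQwen3MoeSparseMoeBlock
    model = AutoModelForCausalLM.from_pretrained("path/to/qwen3-moe-checkpoint")

The package-level wrapper below imports defuser.defuser lazily at call time, which avoids import cycles between the package __init__ and the conversion module.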

defuser/__init__.py

Lines changed: 9 additions & 1 deletion
@@ -14,4 +14,12 @@ def convert_model(*args, **kwargs):
     return _convert_model(*args, **kwargs)


-__all__ = ["convert_model"]
+def replace_fused_blocks(*args, **kwargs):
+    """Lazily import conversion entrypoint to avoid import-time cycles."""
+    from .defuser import replace_fused_blocks as _replace_fused_blocks
+
+    return _replace_fused_blocks(*args, **kwargs)
+
+
+__all__ = ["convert_model", "replace_fused_blocks"]

defuser/defuser.py

Lines changed: 81 additions & 6 deletions
@@ -2,17 +2,82 @@
 # SPDX-FileCopyrightText: 2026 qubitium@modelcloud.ai
 # SPDX-License-Identifier: Apache-2.0
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
+import importlib
+
 from torch import nn

-from defuser.model_registry import MODEL_CONFIG
-from defuser.modeling.model_patches import apply_model_patches
+from defuser.model_registry import MODEL_CONFIG, PATCH
+from defuser.modeling.model_patches import apply_model_class_patches, apply_model_patches
 from defuser.modeling.update_module import update_module
 from packaging import version
 import transformers
 from logbar import LogBar

 logger = LogBar(__name__)

+def get_checkpoint_conversion_mapping(model_type):
+    from transformers import conversion_mapping
+
+    if not hasattr(conversion_mapping, "orig_get_checkpoint_conversion_mapping"):
+        conversion_mapping.orig_get_checkpoint_conversion_mapping = conversion_mapping.get_checkpoint_conversion_mapping
+
+    cfg = MODEL_CONFIG.get(model_type)
+    if cfg:
+        return cfg.get("checkpoint_mapping", [])
+
+    return conversion_mapping.orig_get_checkpoint_conversion_mapping(model_type)
+
+
+class PatchError(Exception):
+    pass
+
+
+def replace_fused_blocks(model_type: str) -> bool:
+    apply_model_class_patches(model_type)
+
+    cfg = MODEL_CONFIG[model_type]
+    for orig_path, custom_path in cfg.get(PATCH.REPLACE_MODULE, []):
+        orig_module_path, orig_class_name = orig_path.rsplit(".", 1)
+        custom_module_path, custom_class_name = custom_path.rsplit(".", 1)
+
+        try:
+            orig_module = importlib.import_module(orig_module_path)
+            custom_module = importlib.import_module(custom_module_path)
+
+            # Validate class existence before patching
+            if not hasattr(orig_module, orig_class_name):
+                raise PatchError(f"Original class [{orig_class_name}] not found: {orig_module}")
+
+            if not hasattr(custom_module, custom_class_name):
+                raise PatchError(f"Custom class [{custom_class_name}] not found: {custom_module}")
+
+            custom_class = getattr(custom_module, custom_class_name)
+            setattr(orig_module, orig_class_name, custom_class)
+
+            if version.parse(transformers.__version__) >= version.parse("5.0.0"):
+                from transformers import conversion_mapping
+
+                if not hasattr(conversion_mapping, "orig_get_checkpoint_conversion_mapping"):
+                    conversion_mapping.orig_get_checkpoint_conversion_mapping = (
+                        conversion_mapping.get_checkpoint_conversion_mapping
+                    )
+
+                conversion_mapping.get_checkpoint_conversion_mapping = get_checkpoint_conversion_mapping
+                transformers.modeling_utils.get_checkpoint_conversion_mapping = get_checkpoint_conversion_mapping
+            logger.info(f"Patched {orig_path} -> {custom_path}")
+            return True
+
+        except Exception as e:
+            if isinstance(e, PatchError):
+                raise
+
+            logger.warning(f"Failed to patch {orig_path}: {e}")
+            return False
+    return False
+
+
 def check_model_compatibility(model: nn.Module) -> bool:
     """Validate model type and transformers version compatibility."""
     config = getattr(model, "config", None)
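Assuming transformers v5 consults get_checkpoint_conversion_mapping(model_type) to rewrite checkpoint keys while loading, returning the registry's "checkpoint_mapping" (empty unless configured) for registered model types turns key conversion into a pass-through, so unfused checkpoint keys land directly on the swapped-in Linear* blocks. A sketch of the patched lookup's behavior, inferred from the hunk above:

    get_checkpoint_conversion_mapping("mixtral")  # -> [] (registered: bypass conversion)
    get_checkpoint_conversion_mapping("llama")    # -> transformers' saved original mapping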
@@ -36,7 +101,7 @@ def convert_model(
     model: nn.Module,
     cleanup_original: bool = False,
     max_layers: int | None = None,
-) -> nn.Module:
+) -> bool:
     if max_layers is not None and max_layers < 1:
         raise ValueError("max_layers must be >= 1 when provided")

@@ -113,14 +178,24 @@ def convert_model(
     # and the runtime model implementation that operates on defused weights.

     if not check_model_compatibility(model):
-        return model
+        return False

     apply_model_patches(model)

-    return update_module(
+    # If fused blocks were already structurally replaced at load time,
+    # there is no need to perform runtime defusing again.
+    if MODEL_CONFIG[model.config.model_type].get(PATCH.REPLACE_MODULE):
+        return False
+
+    # Perform runtime defusing of fused projections: split already-loaded
+    # fused modules (e.g., gate_up_proj/down_proj) into independent expert
+    # layers: gate_proj / up_proj / down_proj.
+    update_module(
         model,
         cleanup_original=cleanup_original,
         max_layers=max_layers,
     )

-__all__ = ["convert_model"]
+    return True
+
+__all__ = ["convert_model", "replace_fused_blocks"]

defuser/model_registry.py

Lines changed: 47 additions & 1 deletion
@@ -2,16 +2,41 @@
 # SPDX-FileCopyrightText: 2026 qubitium@modelcloud.ai
 # SPDX-License-Identifier: Apache-2.0
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
+from enum import Enum
+
+
+class PATCH(str, Enum):
+    REPLACE_MODULE = "replace_module"
+

 MODEL_CONFIG = {
     "mixtral": {
         "min_transformers_version": "5.0.0",
+        PATCH.REPLACE_MODULE: [
+            (
+                "transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock",
+                "defuser.modeling.unfused_moe.mixtral.LinearMixtralSparseMoeBlock",
+            )
+        ],
     },
     "qwen2_moe": {
         "min_transformers_version": "5.0.0",
+        PATCH.REPLACE_MODULE: [
+            (
+                "transformers.models.qwen2_moe.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock",
+                "defuser.modeling.unfused_moe.qwen2_moe.LinearQwen2MoeSparseMoeBlock",
+            )
+        ],
     },
     "qwen3_moe": {
         "min_transformers_version": "5.0.0",
+        # REPLACE_MODULE swaps only the modeling structure, not the weights.
+        PATCH.REPLACE_MODULE: [
+            (
+                "transformers.models.qwen3_moe.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock",
+                "defuser.modeling.unfused_moe.qwen3_moe.LinearQwen3MoeSparseMoeBlock",
+            )
+        ],
     },
     "qwen3_5_moe": {
         "min_transformers_version": "5.2.0",
@@ -21,8 +46,29 @@
     },
     "qwen3_next": {
         "min_transformers_version": "5.0.0",
+        PATCH.REPLACE_MODULE: [
+            (
+                "transformers.models.qwen3_next.modeling_qwen3_next.Qwen3NextSparseMoeBlock",
+                "defuser.modeling.unfused_moe.qwen3_next.LinearQwen3NextSparseMoeBlock",
+            )
+        ],
     },
     "qwen3_omni_moe": {
-        "min_transformers_version": "5.2.0",
+        "min_transformers_version": "5.0.0",
+        PATCH.REPLACE_MODULE: [
+            (
+                "transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe.Qwen3OmniMoeThinkerTextSparseMoeBlock",
+                "defuser.modeling.unfused_moe.qwen3_omni_moe.LinearQwen3OmniMoeThinkerTextSparseMoeBlock",
+            )
+        ],
+    },
+    "glm4_moe": {
+        "min_transformers_version": "5.0.0",
+        PATCH.REPLACE_MODULE: [
+            (
+                "transformers.models.glm4_moe.modeling_glm4_moe.Glm4MoeMoE",
+                "defuser.modeling.unfused_moe.glm4_moe.LinearGlm4MoeMoE",
+            )
+        ],
     },
 }
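Supporting another fused MoE model is then one registry entry plus an unfused block implementation; a hypothetical entry (model type and module paths invented for illustration):

    "my_moe": {
        "min_transformers_version": "5.0.0",
        PATCH.REPLACE_MODULE: [
            (
                "transformers.models.my_moe.modeling_my_moe.MyMoeSparseMoeBlock",
                "defuser.modeling.unfused_moe.my_moe.LinearMyMoeSparseMoeBlock",
            )
        ],
    },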

defuser/modeling/model_patches.py

Lines changed: 52 additions & 2 deletions
@@ -8,20 +8,61 @@
 from logbar import LogBar

 from defuser import DEBUG_ON
+import torch

 logger = LogBar(__name__)


+_MODEL_CLASS_PATCH_REGISTRY: dict[str, Callable] = {}
 _MODEL_PATCH_REGISTRY: dict[str, Callable] = {}


+def register_model_class_patch(model_type: str):
+    def decorator(func: Callable):
+        _MODEL_CLASS_PATCH_REGISTRY[model_type] = func
+        return func
+
+    return decorator
+
+
 def register_model_patch(model_type: str):
     def decorator(func: Callable):
         _MODEL_PATCH_REGISTRY[model_type] = func
         return func

     return decorator

+
+@register_model_class_patch("qwen3_omni_moe")
+def patch_qwen3_omni_text_class() -> list[str]:
+    from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import Qwen3OmniMoePreTrainedModel
+    from defuser.modeling.unfused_moe.qwen3_omni_moe import LinearQwen3OmniMoeThinkerTextSparseMoeBlock
+
+    orig_init_weights = Qwen3OmniMoePreTrainedModel._init_weights
+
+    def patched_init_weights(self, module):
+        try:
+            orig_init_weights(self, module)
+        except AttributeError:
+            # Fallback for unfused experts: init each split projection directly.
+            if isinstance(module, LinearQwen3OmniMoeThinkerTextSparseMoeBlock):
+                std = self.config.initializer_range
+                experts = module.experts
+
+                if hasattr(experts, "gate_proj"):
+                    torch.nn.init.normal_(experts.gate_proj.weight, 0.0, std)
+                if hasattr(experts, "up_proj"):
+                    torch.nn.init.normal_(experts.up_proj.weight, 0.0, std)
+                if hasattr(experts, "down_proj"):
+                    torch.nn.init.normal_(experts.down_proj.weight, 0.0, std)
+
+                if hasattr(module, "gate"):
+                    torch.nn.init.normal_(module.gate.weight, 0.0, std)
+            else:
+                raise
+
+    Qwen3OmniMoePreTrainedModel._init_weights = patched_init_weights
+
+    return []
+

 @register_model_patch("qwen3_omni_moe")
 def patch_qwen3_omni_text_runtime(model) -> list[str]:
@@ -43,7 +84,6 @@ def generate(self, *args, return_audio=None, **kwargs):
         applied.append("generate")

     if "forward" not in model_cls.__dict__:
-
         def forward(self, *args, **kwargs):
             return self.thinker(*args, **kwargs)
@@ -54,6 +94,17 @@ def forward(self, *args, **kwargs):
     return applied


+def apply_model_class_patches(model_type) -> list[str]:
+    patch_model_class = _MODEL_CLASS_PATCH_REGISTRY.get(model_type)
+    if patch_model_class is None:
+        return []
+
+    applied = patch_model_class()
+    if applied and DEBUG_ON:
+        logger.debug(f"Applied model class patches for model_type={model_type}: {', '.join(applied)}")
+    return applied
+
+
 def apply_model_patches(model) -> list[str]:
     config = getattr(model, "config", None)
     model_type = getattr(config, "model_type", None)
@@ -65,4 +116,3 @@ def apply_model_patches(model) -> list[str]:
     if applied and DEBUG_ON:
         logger.debug(f"Applied model patches for model_type={model_type}: {', '.join(applied)}")
     return applied
-
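New class-level patches follow the same decorator pattern as the existing runtime patches; a hypothetical registration (model type and patch body invented for illustration):

    @register_model_class_patch("my_moe")
    def patch_my_moe_class() -> list[str]:
        # Monkey-patch modeling classes here; runs before any weights load.
        # Returned names are used only for DEBUG_ON logging.
        return ["_init_weights"]

Class patches run per model_type before loading (via replace_fused_blocks), while register_model_patch hooks operate on an already-instantiated model.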

defuser/modeling/unfused_moe/__init__.py

Whitespace-only changes.
defuser/modeling/unfused_moe/glm4_moe.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-FileCopyrightText: 2026 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+import torch
+import torch.nn as nn
+
+
+class LinearGlm4MoeMoE(nn.Module):
+    """A mixed expert module containing shared experts."""
+
+    def __init__(self, config):
+        super().__init__()
+        from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeMLP, Glm4MoeTopkRouter
+
+        self.config = config
+        self.experts = nn.ModuleList(
+            [
+                Glm4MoeMLP(config, intermediate_size=config.moe_intermediate_size)
+                for _ in range(config.n_routed_experts)
+            ]
+        )
+        self.gate = Glm4MoeTopkRouter(config)
+        self.shared_experts = Glm4MoeMLP(
+            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
+        )
+
+    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
+        r"""
+        CALL FOR CONTRIBUTION! Expert weights need to be fused so we do not have
+        to loop over experts here (DeepSeek has 256 experts, so the loop is costly).
+        """
+        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
+        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
+        expert_mask = expert_mask.permute(2, 0, 1)
+
+        for expert_idx in range(len(self.experts)):
+            expert = self.experts[expert_idx]
+            mask = expert_mask[expert_idx]
+            token_indices, weight_indices = torch.where(mask)
+
+            if token_indices.numel() > 0:
+                expert_weights = topk_weights[token_indices, weight_indices]
+                expert_input = hidden_states[token_indices]
+                expert_output = expert(expert_input)
+                weighted_output = expert_output * expert_weights.unsqueeze(-1)
+                final_hidden_states.index_add_(0, token_indices, weighted_output)
+
+        # In the original DeepSeek, expert outputs are gathered once we leave this
+        # module, so the MoE module is itself an IsolatedParallel module and all
+        # experts are "local": we shard but do not gather.
+        return final_hidden_states.type(hidden_states.dtype)
+
+    def forward(self, hidden_states):
+        residuals = hidden_states
+        orig_shape = hidden_states.shape
+        topk_indices, topk_weights = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
+        hidden_states = hidden_states + self.shared_experts(residuals)
+        return hidden_states
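A smoke-test sketch for the unfused block (the Glm4MoeConfig import path and field names are assumptions based on the attributes used above, not part of this commit):

    import torch
    from transformers.models.glm4_moe.configuration_glm4_moe import Glm4MoeConfig
    from defuser.modeling.unfused_moe.glm4_moe import LinearGlm4MoeMoE

    # Tiny illustrative config; real GLM-4 MoE checkpoints are far larger.
    cfg = Glm4MoeConfig(hidden_size=64, moe_intermediate_size=32,
                        n_routed_experts=4, n_shared_experts=1, num_experts_per_tok=2)
    block = LinearGlm4MoeMoE(cfg)
    out = block(torch.randn(1, 8, cfg.hidden_size))  # shape preserved: (1, 8, 64)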
