From 3e4c8f8faf497bdf2e02d40381fbc1a673516992 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 19 Dec 2025 20:07:14 +0000 Subject: [PATCH 01/20] Add Gemma3nVisionModel - MobileNetV5 vision encoder convertor to convert_hf_to_gguf.py. Add gemma3n to vision projectors in gguf-py/gguf/constants.py. --- convert_hf_to_gguf.py | 241 +++++++++++++++++++++++++++++++++++++- gguf-py/gguf/constants.py | 2 + 2 files changed, 241 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 432be599469..36a7ed000af 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -520,7 +520,11 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: return () def prepare_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) + if self.tensor_map.mapping: + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + else: + max_name_len = len("vision_encoder.weight,") # Default reasonable length for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): # we don't need these @@ -5959,8 +5963,182 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") +class Gemma3nVisionModel(MmprojModel): + """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" + + # MobileNetV5 doesn't have transformer layers, so we don't need block count + # Set n_block_keys to empty list to skip the find_hparam check + n_block_keys = [] + + def find_hparam(self, keys: list[str], optional: bool = False) -> Any: + """Override to return 0 for block count since MobileNetV5 is CNN-based""" + if not keys: # If n_block_keys is empty (our case) + return 0 + # Otherwise use parent implementation + return super().find_hparam(keys, 
optional) + + def __init__(self, *args, **kwargs): + # Parent init will call find_hparam which now returns 0 for empty keys + super().__init__(*args, **kwargs) + + def find_vparam(self, keys: list[str], optional: bool = False) -> Any: + """Override to provide hardcoded MobileNetV5 parameters that aren't in config""" + # MobileNetV5 hardcodes these values in the architecture definition + # rather than storing them in config.json + + # Handle empty keys list (n_block_keys) - return 0 for CNN architecture + if not keys: + return 0 + + # Check if we're looking for image_size + if "image_size" in keys: + # MobileNetV5 300m_enc uses 768x768 input + return 768 + + # Check if we're looking for patch_size + if "patch_size" in keys: + # MobileNetV5 is CNN-based, doesn't use patches + # Set to 1 for compatibility + return 1 + + # Check if we're looking for intermediate_size + if "intermediate_size" in keys: + # MobileNetV5 uses expansion ratios in inverted residual blocks + # Typical expansion is 4x the embedding dimension + hidden_size = self.hparams_vision.get("hidden_size", 2048) + return hidden_size * 4 + + # Check if we're looking for num_attention_heads + if "num_attention_heads" in keys or "num_heads" in keys: + # MobileNetV5 uses Multi-Query Attention with 8 heads + return 8 + + # For other parameters, use parent implementation + return super().find_vparam(keys, optional) + + def set_gguf_parameters(self): + # MobileNetV5 requires ImageNet normalization values + # Override preprocessor_config to ensure correct values before calling super() + # IMAGENET_MEAN = [0.485, 0.456, 0.406] + # IMAGENET_STD = [0.229, 0.224, 0.225] + IMAGENET_MEAN = [0.5 , 0.5 , 0.5 ] + IMAGENET_STD = [0.5 , 0.5 , 0.5 ] + + print("test") + + # Check if preprocessor_config has incorrect normalization values + if "image_mean" in self.preprocessor_config: + current_mean = self.preprocessor_config["image_mean"] + if current_mean != IMAGENET_MEAN: + logger.warning(f"Overriding image_mean from 
{current_mean} to ImageNet standard {IMAGENET_MEAN}") + self.preprocessor_config["image_mean"] = IMAGENET_MEAN + print("test2") + else: + logger.info(f"Setting image_mean to ImageNet standard {IMAGENET_MEAN}") + self.preprocessor_config["image_mean"] = IMAGENET_MEAN + + if "image_std" in self.preprocessor_config: + current_std = self.preprocessor_config["image_std"] + if current_std != IMAGENET_STD: + logger.warning(f"Overriding image_std from {current_std} to ImageNet standard {IMAGENET_STD}") + self.preprocessor_config["image_std"] = IMAGENET_STD + else: + logger.info(f"Setting image_std to ImageNet standard {IMAGENET_STD}") + self.preprocessor_config["image_std"] = IMAGENET_STD + + # Now call parent which will use the corrected values + super().set_gguf_parameters() + hparams = self.hparams + + # Set projector type to GEMMA3N + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) + + # MobileNetV5 specific parameters + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) # MobileNetV5 uses approximate GELU + + # Image sequence length (256 tokens = 16x16 for Gemma3n) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + # Note: Additional metadata can be added as needed + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # Force quantization settings for specific tensor types + if "input_projection" in name or "input_proj" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." 
in name or "stem" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # Gemma3n uses different prefixes than other models: + # - model.embed_vision.* for projection layers + # - model.vision_tower.* for vision encoder + # Skip non-vision tensors + if not (name.startswith("model.embed_vision.") or + name.startswith("model.vision_tower.")): + return [] + + # Strip "model." prefix to match expected llama.cpp format + if name.startswith("model."): + name = name[6:] # Remove "model." prefix + + # Process MobileNetV5 and projection tensors + name = name.replace("_weight", ".weight") + + # Rename embed_vision to match our C++ implementation expectations + name = name.replace("embed_vision.", "") + + # Rename vision_tower.timm_model to vision_tower for cleaner naming + name = name.replace("vision_tower.timm_model.", "vision_tower.") + + # Handle normalization layer naming + name = name.replace("hard_embedding_norm", "hard_emb_norm") + name = name.replace("soft_embedding_norm", "soft_emb_norm") + # name = name.replace("embedding_post_projection_norm", "post_proj_norm") + + # Gemma3n uses Gemma3p5RMSNorm which has scale_shift=0, so no correction needed + # Unlike Gemma3 which uses Gemma3RMSNorm with scale_shift=1 + if "soft_emb_norm.weight" in name: + # No correction needed for Gemma3n + pass + + return [(self.map_tensor_name(name), data_torch)] + + def map_tensor_name(self, name: str) -> str: + """Map Gemma3n tensor names to GGUF format""" + # Projector tensors (from embed_vision) - use mm. 
prefix like Gemma3 + # IMPORTANT: Keep the .weight suffix to match C++ expectations + if name == "embedding.weight": + return "mm.embedding.weight" + if name == "embedding_projection.weight": + return "mm.input_projection.weight" # Main projection used by C++ + if name == "hard_emb_norm.weight": + return "mm.hard_emb_norm.weight" # Hard embedding normalization + if name == "soft_emb_norm.weight": + return "mm.soft_emb_norm.weight" # Soft embedding normalization (used by C++) + if name == "post_proj_norm.weight": + return "mm.post_proj_norm.weight" # Post projection normalization (CRITICAL for Gemma3n) + + # Vision tower tensors - add v.enc. prefix for MobileNetV5 encoder + if name.startswith("vision_tower."): + # Remove vision_tower prefix and add v.enc. prefix + tensor_suffix = name[13:] # Remove "vision_tower." + return f"v.enc.{tensor_suffix}" + + # If no match, try parent implementation + try: + return super().map_tensor_name(name) + except ValueError: + # If parent also can't map it, provide a sensible default + # This shouldn't happen, but provides a fallback + logger.warning(f"Using fallback mapping for tensor: {name}") + return f"v.{name}" + -@ModelBase.register("Gemma3nForConditionalGeneration") +@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") class Gemma3NModel(Gemma3Model): model_arch = gguf.MODEL_ARCH.GEMMA3N norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code @@ -5983,8 +6161,43 @@ def __init__(self, *args, **kwargs): ] def set_vocab(self): + # For Gemma3n multimodal models, we need the FULL vocab_size (262400) + # which includes special tokens from 262144-262399 for vision/audio. + # The vocab_size_per_layer_input (262144) is only the embedding size per layer. + # Temporarily override the hparams lookup order to prioritize vocab_size. 
+ + # Store original vocab_size_per_layer_input if it exists + vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input") + + # Temporarily remove vocab_size_per_layer_input to force using vocab_size + if vocab_size_per_layer_input is not None: + del self.hparams["vocab_size_per_layer_input"] + + # Call parent set_vocab which will now use vocab_size (262400) super().set_vocab() + # Restore vocab_size_per_layer_input for later use + if vocab_size_per_layer_input is not None: + self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input + + # Fix chat template for Gemma3n multimodal: replace special token placeholders with mtmd markers + # The mtmd library uses <__media__> as the default marker for images/audio + # but Gemma3n's chat template uses and + chat_template_key = "tokenizer.chat_template" + for kv_dict in self.gguf_writer.kv_data: + if chat_template_key in kv_dict: + template_value = kv_dict[chat_template_key].value + + # Replace soft token placeholders with mtmd markers + if '' in template_value or '' in template_value: + logger.info("Fixing Gemma3n chat template: replacing soft token placeholders with mtmd markers") + template_value = template_value.replace('', '<__media__>') + template_value = template_value.replace('', '<__media__>') + + # Update the value in place + kv_dict[chat_template_key].value = template_value + break + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) @@ -6020,8 +6233,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "language_model." 
not in name: return [] # skip non-language model tensors + # Pad token embeddings for vision/audio special tokens (262144-262399) + if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: + # Move to CPU to avoid meta device issues during padding + data_torch = data_torch.to(device="cpu") + + vocab_size = self.hparams.get("vocab_size", 262400) + current_size = data_torch.shape[0] # First dimension is vocab_size + + if current_size < vocab_size: + # Pad with zeros for vision/audio tokens (they get embeddings from vision tower) + padding_size = vocab_size - current_size + tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings" + logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)") + + # Create padding with zeros (vision tokens won't use these embeddings) + padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device) + data_torch = torch.cat([data_torch, padding], dim=0) + + # Continue with normal processing + name = name.replace("language_model.", "") + return [(self.map_tensor_name(name), data_torch)] + if "altup_unembed_projections" in name: data_torch = data_torch.to(device="cpu") + # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based + # They should NOT be padded if ".0." in name: self._altup_unembd[0] = data_torch elif ".1." 
in name: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index cab8f2901ae..41654b22b5d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -456,6 +456,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): RESAMPLER = auto() GLM_EDGE = auto() MERGER = auto() + GEMMA3N = auto() GEMMA3 = auto() QWEN3VL = auto() COGVLM = auto() @@ -3397,6 +3398,7 @@ def get_type(val: Any) -> GGUFValueType: class VisionProjectorType: GEMMA3 = "gemma3" + GEMMA3N = "gemma3n" IDEFICS3 = "idefics3" PIXTRAL = "pixtral" LLAMA4 = "llama4" From ad5ed98d7068f50447867f238c4cb1a9e1e29f3c Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 20 Dec 2025 20:20:54 +0000 Subject: [PATCH 02/20] Add mobilenetv5 impl --- src/models/gemma3n-iswa.cpp | 55 +++- tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-graph.h | 32 ++ tools/mtmd/clip-impl.h | 43 +++ tools/mtmd/clip-model.h | 56 ++++ tools/mtmd/clip.cpp | 521 ++++++++++++++++++++++++++++++ tools/mtmd/clip.h | 1 + tools/mtmd/models/mobilenetv5.cpp | 247 ++++++++++++++ tools/mtmd/models/models.h | 5 + tools/mtmd/mtmd.cpp | 5 +- 10 files changed, 963 insertions(+), 3 deletions(-) create mode 100644 tools/mtmd/models/mobilenetv5.cpp diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index a0bdd6a15a1..7a6a446eb20 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -259,7 +259,60 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); cb(inp_per_layer, "inp_per_layer_selected", -1); } else { - GGML_ABORT("TODO: support embd input"); + // For embedding inputs (e.g., from vision encoder) + // CRITICAL FIX: Vision tokens should use the padding token (ID=0) embedding + // from tok_embd_per_layer, NOT project the vision embeddings. + // The projection happens later in project_per_layer_inputs(). 
+ // This matches PyTorch behavior: + // per_layer_inputs_tokens = torch.where(mask, input_ids, torch.zeros_like(input_ids)) + // per_layer_inputs = EmbedPerLayer(per_layer_inputs_tokens) # Uses padding (0) for vision + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(inp->embd); + + // For vision, we need per_layer_inputs from padding token (ID=0) + // We CANNOT use inp->tokens because batch allows EITHER tokens OR embeddings + // + // The challenge: We need to broadcast padding token embedding from [embd_size, 1] to [embd_size, n_tokens] + // but ggml_repeat+ggml_dup doesn't work in no_alloc mode (creates views without backing memory). + // + // Solution: Use ggml_add to broadcast! GGML automatically broadcasts along compatible dimensions. + // We create zeros of shape [embd_size, n_tokens], then add padding_emb [embd_size, 1] which broadcasts. + + // tok_embd_per_layer shape: [embd_size, vocab_size] where embd_size = n_embd_altup * n_layer + const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer + + // Create zeros tensor [embd_size, n_tokens] by projecting vision embeddings and multiplying by 0 + // First, project inp->embd [n_embd, n_tokens] to per-layer space [embd_size, n_tokens] + ggml_tensor * zeros_per_layer = ggml_mul_mat(ctx0, model.per_layer_model_proj, inp->embd); + zeros_per_layer = ggml_scale(ctx0, zeros_per_layer, 0.0f); // Multiply by 0 to get zeros + ggml_set_name(zeros_per_layer, "zeros_per_layer"); + + // Extract column 0 (padding token's embedding) as a vector: [embd_size] + // Note: tok_embd_per_layer is quantized (q8_0), so the view is also q8_0 + ggml_tensor * padding_embd_vec_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, + embd_size, // number of elements + 0); // offset (column 0) + ggml_set_name(padding_embd_vec_q, "padding_token_emb_q8"); + + // Dequantize to f32 using ggml_cpy + ggml_tensor * padding_embd_vec_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 
embd_size); + ggml_tensor * padding_embd_vec = ggml_cpy(ctx0, padding_embd_vec_q, padding_embd_vec_f32); + ggml_set_name(padding_embd_vec, "padding_token_emb_f32"); + + // Reshape to [embd_size, 1] for broadcasting + ggml_tensor * padding_embd_col = ggml_reshape_2d(ctx0, padding_embd_vec, embd_size, 1); + + // Add: zeros [embd_size, n_tokens] + padding [embd_size, 1] = broadcasted padding [embd_size, n_tokens] + ggml_tensor * inp_per_layer_flat = ggml_add(ctx0, zeros_per_layer, padding_embd_col); + ggml_set_name(inp_per_layer_flat, "inp_per_layer_broadcasted"); + + // Reshape to [n_embd_altup, n_layer, n_tokens] for per-layer processing + inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer_flat, n_embd_altup, n_layer, n_tokens); + + // Apply same scaling as text tokens + // inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); + cb(inp_per_layer, "inp_per_layer_vision", -1); } res->add_input(std::move(inp)); return inp_per_layer; diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 317d5f19fd9..a74b4bc2154 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -27,6 +27,7 @@ add_library(mtmd models/qwen3vl.cpp models/siglip.cpp models/whisper-enc.cpp + models/mobilenetv5.cpp ) set_target_properties(mtmd PROPERTIES diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 2b1915779f2..5d8c46862bd 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -70,6 +70,38 @@ struct clip_graph { ggml_tensor * build_inp_raw(int channels = 3); + ggml_tensor * rms_norm_2d( + ggml_tensor * inp, + ggml_tensor * weight, + float eps = 1e-6f, + int block_idx=-1); + + ggml_tensor* pad_same_2d( + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, + int dilation_w = 1); + + ggml_tensor * build_edge_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride, + int block_idx = -1); + + ggml_tensor * build_inverted_residual( + 
ggml_tensor * inp, + const mobilenetv5_block & block, + int stride, + int block_idx = -1); + + ggml_tensor * build_mobilenet_attn( + ggml_tensor * inp, + const mobilenetv5_block & block, + int block_idx = -1); + ggml_tensor * build_norm( ggml_tensor * cur, ggml_tensor * mw, diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index a0939865e3f..24a1ef52d08 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -153,6 +153,47 @@ #define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s" #define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" +// mobilenetv5 (gemma3n) definitions +#define TN_MNV5_STEM_CONV "v.enc.conv_stem.conv.weight" +#define TN_MNV5_STEM_BIAS "v.enc.conv_stem.conv.bias" +#define TN_MNV5_STEM_BN "v.enc.conv_stem.bn.weight" + +// Stage 0 Block (Edge Residual) +#define TN_MNV5_BLK_S0_EXP_W "v.enc.blocks.%d.%d.conv_exp.weight" +#define TN_MNV5_BLK_S0_BN1_W "v.enc.blocks.%d.%d.bn1.weight" +#define TN_MNV5_BLK_S0_PWL_W "v.enc.blocks.%d.%d.conv_pwl.weight" +#define TN_MNV5_BLK_S0_BN2_W "v.enc.blocks.%d.%d.bn2.weight" + +// Stage 1+ Block (Universal Inverted Residual) +#define TN_MNV5_BLK_DW_START_W "v.enc.blocks.%d.%d.dw_start.conv.weight" +#define TN_MNV5_BLK_DW_START_BN "v.enc.blocks.%d.%d.dw_start.bn.weight" +#define TN_MNV5_BLK_DW_MID_W "v.enc.blocks.%d.%d.dw_mid.conv.weight" +#define TN_MNV5_BLK_DW_MID_BN "v.enc.blocks.%d.%d.dw_mid.bn.weight" +#define TN_MNV5_BLK_PW_EXP_W "v.enc.blocks.%d.%d.pw_exp.conv.weight" +#define TN_MNV5_BLK_PW_EXP_BN "v.enc.blocks.%d.%d.pw_exp.bn.weight" +#define TN_MNV5_BLK_PW_PROJ_W "v.enc.blocks.%d.%d.pw_proj.conv.weight" +#define TN_MNV5_BLK_PW_PROJ_BN "v.enc.blocks.%d.%d.pw_proj.bn.weight" +#define TN_MNV5_BLK_LAYER_SCALE "v.enc.blocks.%d.%d.layer_scale.gamma" + +// Attention Components +#define TN_MNV5_ATTN_Q_W "v.enc.blocks.%d.%d.attn.query.proj.weight" +#define TN_MNV5_ATTN_K_W "v.enc.blocks.%d.%d.attn.key.proj.weight" +#define TN_MNV5_ATTN_V_W "v.enc.blocks.%d.%d.attn.value.proj.weight" +#define TN_MNV5_ATTN_O_W 
"v.enc.blocks.%d.%d.attn.output.proj.weight" +#define TN_MNV5_ATTN_K_DW "v.enc.blocks.%d.%d.attn.key.down_conv.weight" +#define TN_MNV5_ATTN_K_NORM "v.enc.blocks.%d.%d.attn.key.norm.weight" +#define TN_MNV5_ATTN_V_DW "v.enc.blocks.%d.%d.attn.value.down_conv.weight" +#define TN_MNV5_ATTN_V_NORM "v.enc.blocks.%d.%d.attn.value.norm.weight" +#define TN_MNV5_ATTN_NORM "v.enc.blocks.%d.%d.norm.weight" // Block norm used in attn blocks + +// MSFA +#define TN_MNV5_MSFA_FFN_EXP_W "v.enc.msfa.ffn.pw_exp.conv.weight" +#define TN_MNV5_MSFA_FFN_EXP_BN "v.enc.msfa.ffn.pw_exp.bn.weight" +#define TN_MNV5_MSFA_FFN_PROJ_W "v.enc.msfa.ffn.pw_proj.conv.weight" +#define TN_MNV5_MSFA_FFN_PROJ_BN "v.enc.msfa.ffn.pw_proj.bn.weight" +#define TN_MNV5_MSFA_NORM "v.enc.msfa.norm.weight" + + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -170,6 +211,7 @@ enum projector_type { PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN3VL, PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_GEMMA3N, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_QWEN25VL, @@ -200,6 +242,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_GEMMA3N, "gemma3n"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index b4c31cdde6b..e03f455b1b5 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -172,6 +172,45 @@ struct clip_layer { } }; +// Expanded MobileNetV5 block structure for Gemma3n vision encoder +struct mobilenetv5_block { + // Stage 0 (Edge Residual) + ggml_tensor * s0_conv_exp_w = nullptr; + ggml_tensor * s0_bn1_w = nullptr; + ggml_tensor * s0_conv_pwl_w = nullptr; + ggml_tensor * s0_bn2_w = nullptr; + + // Stage 1+ (Universal Inverted Residual) + ggml_tensor * dw_start_w = nullptr; + 
ggml_tensor * dw_start_bn_w = nullptr; + + ggml_tensor * pw_exp_w = nullptr; + ggml_tensor * pw_exp_bn_w = nullptr; + + ggml_tensor * dw_mid_w = nullptr; + ggml_tensor * dw_mid_bn_w = nullptr; + + ggml_tensor * pw_proj_w = nullptr; + ggml_tensor * pw_proj_bn_w = nullptr; + + ggml_tensor * layer_scale_w = nullptr; + + // Attention (MQA) components + ggml_tensor * attn_q_w = nullptr; + ggml_tensor * attn_k_w = nullptr; + ggml_tensor * attn_v_w = nullptr; + ggml_tensor * attn_o_w = nullptr; + + // Optional downsampling/norm in attention + ggml_tensor * attn_k_dw_w = nullptr; + ggml_tensor * attn_k_norm_w = nullptr; + ggml_tensor * attn_v_dw_w = nullptr; + ggml_tensor * attn_v_norm_w = nullptr; + + // Block norm (often present in attention blocks) + ggml_tensor * attn_norm_w = nullptr; +}; + struct clip_model { clip_modality modality = CLIP_MODALITY_VISION; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -288,6 +327,23 @@ struct clip_model { ggml_tensor * mm_input_proj_w = nullptr; ggml_tensor * mm_soft_emb_norm_w = nullptr; + // mobilenetv5 for gemma3n + std::vector mobilenet_blocks; + std::vector mobilenet_stage_ends; // NEW: Track end indices of stages + ggml_tensor * mobilenet_stem_conv_w = nullptr; + ggml_tensor * mobilenet_stem_conv_b = nullptr; + ggml_tensor * mobilenet_stem_norm_w = nullptr; + ggml_tensor * mm_post_proj_norm_w = nullptr; + + // Multi-Scale Fusion Adapter (MSFA) components + ggml_tensor * msfa_concat_conv_w = nullptr; // Concatenated feature processing + ggml_tensor * msfa_concat_norm_w = nullptr; + ggml_tensor * msfa_ffn_expand_w = nullptr; // FFN expansion + ggml_tensor * msfa_ffn_project_w = nullptr; // FFN projection + ggml_tensor * msfa_ffn_expand_bn = nullptr; // NEW: FFN expansion batch norm + ggml_tensor * msfa_ffn_project_bn = nullptr; // NEW: FFN projection batch norm + + // pixtral, glm4v ggml_tensor * token_embd_img_break = nullptr; ggml_tensor * mm_patch_merger_w = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp 
index 3ba0823defb..4c357aab19e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -263,6 +263,378 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } } +// Helper: Normalize over the Channel dimension (dim 2 in [W, H, C, B]) +// RMS Norm 2D - normalizes over channels for each spatial position +// PyTorch: v = torch.mean(x.pow(2), dim=1) - mean over C for each (N,H,W) +// We need to normalize each spatial position across its C channels +ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps, int block_idx) { + // inp: [W, H, C, B] + const int64_t W = inp->ne[0]; + const int64_t H = inp->ne[1]; + const int64_t C = inp->ne[2]; + const int64_t B = inp->ne[3]; + + // Step 1: Permute [W, H, C, B] -> [C, W, H, B] + // Puts Channels in ne[0] (contiguous) + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + + // Step 2: Reshape [C, W, H, B] -> [C, W*H*B] + // We now have a 2D matrix where columns are Channels (ne[0]) + // and rows are Spatial/Batch (ne[1]). + // cur = ggml_reshape_2d(ctx0, cur, C, W * H * B); + + // REMOVED Step 3 (Transpose). + // We WANT ne[0] to be C so rms_norm reduces over it. + + // Step 4: Apply RMS Norm + // Normalizes ne[0] (C) for every element in ne[1] (Spatial/Batch). + cur = ggml_rms_norm(ctx0, cur, eps); + + // Step 5: Apply weight if present + if (weight) { + // weight is [C] + // cur is [C, W*H*B] + // ggml_mul broadcasts automatically along higher dims. + // It multiplies element i of weight with element i of cur's ne[0]. + cur = ggml_mul(ctx0, cur, weight); + } + + // REMOVED Step 6 (Transpose back). We never transposed. 
+ + // Step 7: Reshape back to [C, W, H, B] + // cur = ggml_reshape_4d(ctx0, cur, C, W, H, B); + + // Step 8: Permute back to [W, H, C, B] + // ne[0]=C, ne[1]=W, ne[2]=H, ne[3]=B + // We want new ne[0] to be old ne[1] (W) + // We want new ne[1] to be old ne[2] (H) + // We want new ne[2] to be old ne[0] (C) + // We want new ne[3] to be old ne[3] (B) + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); + + // cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // Note: The second permute in your original code was likely redundant/incorrect + // after the first one. A single permute is sufficient to restore order. + cur = ggml_cont(ctx0, cur); + + return cur; +} + + +// ------------------------------------------------------------------------ +// Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) +// ------------------------------------------------------------------------ +ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + const int64_t ih = inp->ne[1]; // height + const int64_t iw = inp->ne[0]; // width + + // Calculate output size (ceil division) + const int64_t oh = (ih + stride_h - 1) / stride_h; + const int64_t ow = (iw + stride_w - 1) / stride_w; + + // Calculate padding needed + const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); + const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); + + // Split padding asymmetrically + const int pad_h_top = pad_h / 2; + const int pad_h_bottom = pad_h - pad_h_top; + const int pad_w_left = pad_w / 2; + const int pad_w_right = pad_w - pad_w_left; + + // Apply padding if needed + // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch + if (pad_h > 0 || pad_w > 0) { + inp = ggml_pad_ext(ctx0, inp, + pad_w_left, pad_w_right, // width padding (dim 0) 
+ pad_h_top, pad_h_bottom, // height padding (dim 1) + 0, 0, // no channel padding (dim 2) + 0, 0); // no batch padding (dim 3) + } + + return inp; +} + +// ------------------------------------------------------------------------ +// Edge Residual Block (Stage 0) - CORRECTED +// ------------------------------------------------------------------------ +ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { + ggml_tensor * cur = inp; + + // 1. Expansion Conv (3x3) + // -------------------------------------------------------------------- + // LOGIC FIX: + // Block 0 (stride=2): Uses "Conv2dSame". We must manually pad, then conv with pad=0. + // Block 1,2 (stride=1): Uses standard "Conv2d" with padding=(1,1). + // -------------------------------------------------------------------- + + if (stride == 2) { + // Case: Downsampling (Block 0) + // Replicates Conv2dSame(kernel=3, stride=2) + // We calculate asymmetric padding dynamically + cur = pad_same_2d(cur, 3, 3, stride, stride); + + // Perform conv with 0 padding because we just applied it manually + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); + } else { + // Case: Normal 3x3 Block (Block 1, 2) + // Replicates Conv2d(kernel=3, stride=1, padding=1) + // Standard symmetric padding of 1 is sufficient for 3x3 s1 to keep dims same + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); + } + + // BN + Activation + if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); + cur = ggml_gelu(ctx0, cur); + + // 2. Pointwise Linear Conv (1x1) + // 1x1 Convs usually have padding=0 and stride=1 + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); + if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); + + // 3. 
Residual Connection + // Only apply residual if spatial dimensions and channels match (stride 1) + if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { + ggml_tensor * cur = inp; + + // 1. Depthwise Start (Optional) + // NOTE: dw_start always has stride=1 (no downsampling here) + if (block.dw_start_w) { + int k = block.dw_start_w->ne[0]; // 3 or 5 + int p = k / 2; + // cur = ggml_conv_2d_dw_direct(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); + } + + // 2. Pointwise Expansion (1x1) + if (block.pw_exp_w) { + // Standard 1x1 conv, pad=0, stride=1 + cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 3. Depthwise Mid (Optional) + // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) + if (block.dw_mid_w) { + int k = block.dw_mid_w->ne[0]; // 3 or 5 + + if (stride > 1) { + // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding + cur = pad_same_2d(cur, k, k, stride, stride); + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 + } else { + // Case: Stride 1 -> Use Standard Symmetric Padding + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); + } + + if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 4. 
Pointwise Projection (1x1) + if (block.pw_proj_w) { + cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); + } + + // Apply Layer Scaling if present + if (block.layer_scale_w) { + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + + // 5. Residual Connection + bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); + bool same_channel = (inp->ne[2] == cur->ne[2]); + if (same_spatial && same_channel) { + // --- FIXED LAYER SCALING --- + // --------------------------- + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// MobileNetV5 Builder (Gemma 3n) - Attention Block +ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block, int block_idx) { + + // ... [Debug Helpers kept same as original] ... + // auto DEBUG_SHAPE = [&](const char* label, ggml_tensor* t) { /* ... */ }; + // auto REGISTER_DEBUG = [&](const std::string& name, ggml_tensor* t) { /* ... */ }; + + // // Debug input + // if (block_idx == 33 || block_idx == 50 || block_idx == 52) { + // char debug_name[128]; + // snprintf(debug_name, sizeof(debug_name), "block%d_input", block_idx); + // REGISTER_DEBUG(debug_name, inp); + // } + + ggml_tensor * cur = inp; + + // --- Norm --- + if (block.attn_norm_w) { + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f, block_idx); + } + + // --- 1. Q Calculation --- + ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); + + // --- 2. 
K Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * k_inp = cur; + if (block.attn_k_dw_w) { + int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 + k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding + k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_k_norm_w) { + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f, block_idx); + } + } + ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); + + // --- 3. V Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * v_inp = cur; + if (block.attn_v_dw_w) { + int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 + v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding + v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_v_norm_w) { + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f, block_idx); + } + } + ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); + + // --- Reshape & Permute Logic --- + + const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; + const int D = k->ne[2]; // Head dimension + const int n_head = q->ne[2] / D; + const int N = W * H; + + // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] + q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); + q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); + q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] + q = ggml_cont(ctx0, q); + + const int Wk = k->ne[0]; const int Hk = k->ne[1]; + const int M = Wk * Hk; + + // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] + k = ggml_reshape_3d(ctx0, k, M, D, B); + k = ggml_reshape_4d(ctx0, k, M, D, 1, B); + k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] + k = ggml_cont(ctx0, k); + + // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] + // NOTE: We keep V 
as [M, D] because ggml_mul_mat expects src0^T * src1. + // To get output [D, N], we will need [M, D]^T * [M, N]. + v = ggml_reshape_3d(ctx0, v, M, D, B); + v = ggml_reshape_4d(ctx0, v, M, D, 1, B); + v = ggml_cont(ctx0, v); // [M, D, 1, B] + + // --- Multi-Query Attention --- + float scale = 1.0f / sqrtf((float)D); + + // Step 1: Compute Q @ K.T + // Q: [D, N, n_head, B] + // K: [D, M, 1, B] + // ggml_mul_mat computes K^T * Q -> [D, M]^T * [D, N] -> [M, D] * [D, N] -> [M, N] + // Implicit Broadcast: K has 1 head, Q has n_head. ggml handles this automatically. + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); // Result: [M, N, n_head, B] (in ggml layout) + + // // Debug scores + // if (block_idx == 33) { + // char debug_name[128]; + // snprintf(debug_name, sizeof(debug_name), "block%d_scores_raw", block_idx); + // REGISTER_DEBUG(debug_name, scores); + // } + + scores = ggml_scale(ctx0, scores, scale); + + // Step 2: Softmax + // scores is [M, N, n_head, B] (ne0=M, ne1=N) + // We need softmax over M (keys). + // ggml_soft_max applies to dim 0, which is M. Perfect - no permute needed! + scores = ggml_soft_max(ctx0, scores); + + // Step 3: Compute Attn @ V + // V: [M, D, 1, B] (ne0=M, ne1=D) + // Scores: [M, N, n_head, B] (ne0=M, ne1=N) + // + // ggml_mul_mat computes V^T * Scores -> [M, D]^T * [M, N] -> [D, M] * [M, N] -> [D, N] + // Implicit Broadcast: V has 1 head, Scores has n_head. ggml handles this automatically. + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); // Result: [N, D, n_head, B] + + // // Debug kqv + // if (block_idx == 33) { + // char debug_name[128]; + // snprintf(debug_name, sizeof(debug_name), "block%d_kqv_out", block_idx); + // REGISTER_DEBUG(debug_name, kqv); + // } + + // --- Reshape back to spatial layout --- + // kqv is [N, D, n_head, B]. We want [D, N, n_head, B] to merge heads. 
+ kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); // [D, N, n_head, B] + kqv = ggml_cont(ctx0, kqv); + + // Reshape to [N, D*n_head, B] then [W, H, C, B] + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); + kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); + kqv = ggml_cont(ctx0, kqv); + +// Output projection + cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); + + // --- Residual & Layer Scale (FIXED) --- + if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { + if (block.layer_scale_w) { + // FIX: Simplified Layer Scale. No permute needed. + // Tensor is [W, H, C, B]. Weight is [C]. + // We reshape Weight to [1, 1, C, 1]. + // GGML will broadcast W and H dimensions automatically. + + // Debug print shape of block.layer_scale_w + // fprintf(stderr, "DEBUG: block %d layer_scale_w shape: [%ld x %ld x %ld x %ld]\n", block_idx, block.layer_scale_w->ne[0], block.layer_scale_w->ne[1], block.layer_scale_w->ne[2], block.layer_scale_w->ne[3]); + + // Debug print shape of cur before scaling + // fprintf(stderr, "DEBUG: block %d cur shape before scaling: [%ld x %ld x %ld x %ld]\n", block_idx, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]); + + + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + + // Debug print shape of scale_w_reshaped + // fprintf(stderr, "DEBUG: block %d scale_w_reshaped shape: [%ld x %ld x %ld x %ld]\n", block_idx, scale_w_reshaped->ne[0], scale_w_reshaped->ne[1], scale_w_reshaped->ne[2], scale_w_reshaped->ne[3]); + + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + + // Residual Addition + // 'cur' is the pointer to the graph node of the attention output. + // 'inp' is the pointer to the graph node of the block input. 
+ cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + // siglip2 naflex ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; @@ -788,6 +1160,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { @@ -1141,6 +1517,14 @@ struct clip_model_loader { // test model (tinygemma3) has a different value, we optionally read it get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; + + case PROJECTOR_TYPE_GEMMA3N: + { + // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16) + // Similar configuration to Gemma3 + hparams.n_merge = 1; // MobileNetV5 handles resizing internally + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: @@ -1381,6 +1765,7 @@ struct clip_model_loader { } } + switch (model.proj_type) { case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: @@ -1512,6 +1897,106 @@ struct clip_model_loader { model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); + model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false); + model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false); + + model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false); + model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded + model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); + model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); + + // IMPORTANT: Your GGUF log shows 
'v.enc.msfa.norm.weight' -> shape {2048} + // Ensure TN_MNV5_MSFA_NORM matches this string + model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); + + // Dynamically load blocks stage by stage + for (int stage = 0; stage < 4; ++stage) { + int blocks_found_in_stage = 0; + + for (int blk_idx = 0; ; ++blk_idx) { + bool found_block = false; + mobilenetv5_block block; + + // 1. Check for Edge Residual (S0) + block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false); + if (block.s0_conv_exp_w) { + found_block = true; + block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false); + block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false); + block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false); + } + // 2. Check for UIR (Universal Inverted Residual) + else { + // Check for dw_start OR pw_exp (some UIR blocks skip dw_start) + block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false); + block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false); + + if (block.dw_start_w || block.pw_exp_w) { + found_block = true; + if (block.dw_start_w) { + block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false); + } + if (block.pw_exp_w) { + block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false); + } + block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false); + if (block.dw_mid_w) { + block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false); + } + block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false); + if (block.pw_proj_w) { + block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false); + } + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, 
blk_idx), false); + } + } + + // 3. Check for Attention (MQA) + // Even if UIR/Edge check failed, this might be a pure attention block + ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false); + if (attn_q_check) { + found_block = true; + block.attn_q_w = attn_q_check; + block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false); + block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false); + block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false); + block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false); + block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false); + block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false); + block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false); + block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false); + // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check + if (!block.layer_scale_w) { + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + if (found_block) { + model.mobilenet_blocks.push_back(block); + blocks_found_in_stage++; + } else { + // End of blocks for this stage + break; + } + } + + // Track where this stage ends in the flat vector + if (blocks_found_in_stage > 0) { + model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1); + LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1); + } + } + // Load projection weights (similar to Gemma3) + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + // model.mm_post_proj_norm_w = get_tensor(TN_MM_POST_PROJ_N); // CRITICAL: Post projection norm + // Load additional Gemma3n 
projection tensors + model.mm_0_w = get_tensor("mm.embedding.weight", false); // Input embedding + model.mm_1_w = get_tensor("mm.hard_emb_norm.weight", false); // Hard embedding norm + } break; case PROJECTOR_TYPE_IDEFICS3: { model.projection = get_tensor(TN_MM_PROJECTOR); @@ -2052,6 +2537,18 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } +// Rescale image from u8 to f32 without normalization (for models like GEMMA3N that use SiglipImageProcessorFast) +// This only converts from [0, 255] to [0.0, 1.0] range without applying mean/std normalization +static void rescale_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); + + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(src.buf[i]) / 255.0f; + } +} + // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { dst.nx = src.nx; @@ -2747,6 +3244,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + // GEMMA3N uses SiglipImageProcessorFast which only rescales to [0.0, 1.0] without normalization + // Resize to 768x768 using bilinear interpolation, then rescale to f32 + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + rescale_image_u8_to_f32(resized_image, *img_f32); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + case PROJECTOR_TYPE_JANUS_PRO: { // Janus Pro preprocessing: pad to square with gray(127), 
resize to 384x384 @@ -3006,6 +3515,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int scale_factor = ctx->model.hparams.n_merge; n_patches /= (scale_factor * scale_factor); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution + // regardless of input size (see architecture description) + n_patches = 16 * 16; // 256 tokens + } break; case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: { @@ -3396,6 +3911,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("patches", patches); } break; case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3N: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_QWEN2A: @@ -3521,6 +4037,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { // main path + deepstack paths return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3N: return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: return ctx->model.projection->ne[1]; @@ -3575,6 +4092,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; } +bool clip_is_gemma3n(const struct clip_ctx * ctx) { + return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3N; +} + bool clip_has_vision_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_VISION; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 68a0d6e857e..c244df2677f 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -107,6 +107,7 @@ bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx); +bool clip_is_gemma3n(const struct clip_ctx * ctx); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); diff 
--git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp new file mode 100644 index 00000000000..9946ca6afa8 --- /dev/null +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -0,0 +1,247 @@ +#include "models.h" + +ggml_cgraph * clip_graph_mobilenetv5::build() { + + fprintf(stderr, "\n--- START build_mobilenetv5 ---\n"); + + ggml_tensor * inp = build_inp_raw(); + + // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) + ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding + + // ggml_tensor * mobilenet_stem_conv_w_fixed = fix_1x1_weight(model.mobilenet_stem_conv_w); + + cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 + if (model.mobilenet_stem_conv_b) { + // Bias is [C, 1, 1, 1], need to reshape to [1, 1, C, 1] for broadcasting to [W, H, C, B] + ggml_tensor * bias = ggml_reshape_4d(ctx0, model.mobilenet_stem_conv_b, 1, 1, cur->ne[2], 1); + cur = ggml_add(ctx0, cur, bias); + } + if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w); + cur = ggml_gelu(ctx0, cur); + + + // 2. Blocks + std::vector intermediate_features; + const int total_blocks = model.mobilenet_blocks.size(); + + auto is_stage_start = [&](int i) { + if (i == 0) return true; + for (int end_idx : model.mobilenet_stage_ends) { + if (i == end_idx + 1) return true; + } + return false; + }; + + auto is_fusion_point = [&](int i) { + if (model.mobilenet_stage_ends.size() >= 4) { + if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2 + if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3 + } else { + if (i == total_blocks - 1) return true; + } + return false; + }; + + for (int i = 0; i < total_blocks; i++) { + const auto & block = model.mobilenet_blocks[i]; + int stride = is_stage_start(i) ? 2 : 1; + + // Debug block type + const char* block_type = block.s0_conv_exp_w ? "edge_residual" : + block.attn_q_w ? 
"attention" : "inverted_residual"; + + // // Debug input for problematic blocks + // if (i >= 50 && i <= 54) { + // fprintf(stderr, "DEBUG: Block %d (%s) input shape: [%ld, %ld, %ld, %ld], stride=%d\n", + // i, block_type, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], stride); + // } + + if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride, i); + else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block, i); + else cur = build_inverted_residual(cur, block, stride, i); + + // Register block output for debugging + char block_name[64]; + + if (is_fusion_point(i)) { + + intermediate_features.push_back(cur); + } + } + + // 3. Multi-Scale Fusion Adapter (MSFA) - REPLICATED & FIXED + if (!intermediate_features.empty()) { + + // A. Reference Resolution: PyTorch implementation uses inputs[0] + // We assume intermediate_features[0] is the "High Resolution" target. + // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32). + ggml_tensor* target_feat = intermediate_features[0]; + int high_res_w = target_feat->ne[0]; + int high_res_h = target_feat->ne[1]; + + std::vector resized_feats; + + // B. Resize inputs to match inputs[0] (High Resolution) + for (auto feat : intermediate_features) { + int feat_w = feat->ne[0]; + int feat_h = feat->ne[1]; + + // PyTorch: if feat_size < high_resolution: interpolate + if (feat_w < high_res_w || feat_h < high_res_h) { + // Calculate scale factor. + // Note: PyTorch 'nearest' works on arbitrary float scales. + // ggml_upscale generally takes integer factors or target sizes depending on helper. + // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). 
+ int scale_w = high_res_w / feat_w; + int scale_h = high_res_h / feat_h; + + // Safety check for non-integer scaling if strictly replicating + if (high_res_w % feat_w != 0) { + fprintf(stderr, "Warning: Non-integer scaling detected in MSFA\n"); + } + + // Upsample (Nearest Neighbor) + // 2 is the scale factor + feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); + } + resized_feats.push_back(feat); + } + + // C. Concatenate at High Resolution (Channel Dim = 2 in ggml) + cur = resized_feats[0]; + for (size_t k = 1; k < resized_feats.size(); ++k) { + cur = ggml_concat(ctx0, cur, resized_feats[k], 2); + } + + // D. FFN (UniversalInvertedResidual) + // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm + + // 1. Expansion + if (model.msfa_ffn_expand_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); + + // MISSING IN YOUR CODE: Expansion Norm + if (model.msfa_ffn_expand_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); // Helper to apply RMSNorm + } + + cur = ggml_gelu(ctx0, cur); + + } + + // 2. Projection (No DW because kernel_size=0) + if (model.msfa_ffn_project_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); + + // MISSING IN YOUR CODE: Projection Norm + // UniversalInvertedResidual typically has a norm after projection + if (model.msfa_ffn_project_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); + } + + } + + // E. Final Downsample to Target Resolution (Output Resolution) + // PyTorch: matches self.output_resolution (e.g. 
16x16) + const int target_out_res = 16; + int current_w = cur->ne[0]; + + if (current_w > target_out_res) { + int s = current_w / target_out_res; + + // PyTorch Logic: + // if divisible: avg_pool + // if not divisible: bilinear interpolate (hard to do in pure ggml, usually assumed divisible here) + + if (current_w % target_out_res == 0) { + // Avg Pool: Kernel=s, Stride=s + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); + } else { + // Fallback or Error: ggml doesn't easily support bilinear downsampling + // without custom ops, but standard models usually stick to integer strides. + fprintf(stderr, "Error: Irregular downsampling stride required.\n"); + } + + } + + // F. Final Norm + if (model.msfa_concat_norm_w) { + cur = rms_norm_2d(cur, model.msfa_concat_norm_w); + + } + } + + // 4. Gemma 3n Multimodal Projection (Embedder) - FULL FIX + // Input: 'cur' is [Width, Height, Channels, Batch] + int W = cur->ne[0]; + int H = cur->ne[1]; + int C = cur->ne[2]; // Should be 2048 + int B = cur->ne[3]; + + // 1. Permute and Flatten to [Channels, Tokens, Batch] + // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // -> [C, W, H, B] + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_3d(ctx0, cur, C, W*H, B); + cur = ggml_cont(ctx0, cur); + + + // 2. FEATURE SCALING (Missing in your original code) + // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 + // This prevents the signal from vanishing during the subsequent RMSNorm. + const float scale_factor = sqrtf((float)C); + cur = ggml_scale(ctx0, cur, scale_factor); + + + // 3. SOFT EMBEDDING NORM + // PyTorch: self._norm(x) * self.weight + // We must normalize regardless, then multiply if weight exists. 
+ { + const float eps = 1e-6f; // Gemma3n uses 1e-6 + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_soft_emb_norm_w) { + // Weight shape is (2048,) -> Element-wise broadcast multiply + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + } + + } + + // 4. PROJECTION + // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) + // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] + // Need to transpose for ggml_mul_mat which computes A^T * B + // This matches Gemma3's projection at line ~1319 which also transposes + if (model.mm_input_proj_w) { + // cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } + + // 5. POST PROJECTION NORM + // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False) + // with_scale=False means weight is registered as buffer with value 1.0 + // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1 + // NOTE: Vision embeddings intentionally have magnitude ~1, different from + // text embeddings at ~sqrt(n_embd). The model was trained with this mismatch. 
+ { + const float eps = 1e-6f; + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_post_proj_norm_w) { + // If weight is loaded, multiply (should be ~1.0 anyway) + cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w); + } + } + + + // cur = ggml_scale(ctx0, cur, scale_factor); + + ggml_build_forward_expand(gf, cur); + return gf; +} \ No newline at end of file diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 8d6d4ef67be..3875285fe92 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -66,3 +66,8 @@ struct clip_graph_glm4v : clip_graph { clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_mobilenetv5 : clip_graph { + clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b9c4fa90980..2d970cf45c2 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -266,7 +266,7 @@ struct mtmd_context { } // set boi/eoi - if (proj == PROJECTOR_TYPE_GEMMA3) { + if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3N) { // ... (image embeddings) ... 
img_beg = ""; img_end = ""; @@ -858,7 +858,8 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { } bool mtmd_decode_use_non_causal(mtmd_context * ctx) { - if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) { + if (ctx->ctx_v && + (clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3 || clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3N)) { return true; } return false; From f57705478749497f561d793e8fb7b2e0a2712b8f Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 20 Dec 2025 20:46:21 +0000 Subject: [PATCH 03/20] Fix comments, remove unused vars --- src/models/gemma3n-iswa.cpp | 11 +-- tools/mtmd/clip-graph.h | 12 +-- tools/mtmd/clip-model.h | 12 +-- tools/mtmd/clip.cpp | 157 ++++-------------------------- tools/mtmd/models/mobilenetv5.cpp | 44 ++------- 5 files changed, 36 insertions(+), 200 deletions(-) diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index 7a6a446eb20..e172b9a79f8 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -260,7 +260,7 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { cb(inp_per_layer, "inp_per_layer_selected", -1); } else { // For embedding inputs (e.g., from vision encoder) - // CRITICAL FIX: Vision tokens should use the padding token (ID=0) embedding + // Vision tokens should use the padding token (ID=0) embedding // from tok_embd_per_layer, NOT project the vision embeddings. // The projection happens later in project_per_layer_inputs(). 
// This matches PyTorch behavior: @@ -270,15 +270,6 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); ggml_set_input(inp->embd); - // For vision, we need per_layer_inputs from padding token (ID=0) - // We CANNOT use inp->tokens because batch allows EITHER tokens OR embeddings - // - // The challenge: We need to broadcast padding token embedding from [embd_size, 1] to [embd_size, n_tokens] - // but ggml_repeat+ggml_dup doesn't work in no_alloc mode (creates views without backing memory). - // - // Solution: Use ggml_add to broadcast! GGML automatically broadcasts along compatible dimensions. - // We create zeros of shape [embd_size, n_tokens], then add padding_emb [embd_size, 1] which broadcasts. - // tok_embd_per_layer shape: [embd_size, vocab_size] where embd_size = n_embd_altup * n_layer const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 5d8c46862bd..6a9efb933e5 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -73,8 +73,7 @@ struct clip_graph { ggml_tensor * rms_norm_2d( ggml_tensor * inp, ggml_tensor * weight, - float eps = 1e-6f, - int block_idx=-1); + float eps = 1e-6f); ggml_tensor* pad_same_2d( ggml_tensor* inp, @@ -88,19 +87,16 @@ struct clip_graph { ggml_tensor * build_edge_residual( ggml_tensor * inp, const mobilenetv5_block & block, - int stride, - int block_idx = -1); + int stride); ggml_tensor * build_inverted_residual( ggml_tensor * inp, const mobilenetv5_block & block, - int stride, - int block_idx = -1); + int stride); ggml_tensor * build_mobilenet_attn( ggml_tensor * inp, - const mobilenetv5_block & block, - int block_idx = -1); + const mobilenetv5_block & block); ggml_tensor * build_norm( ggml_tensor * cur, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index e03f455b1b5..be168b97ef2 100644 --- a/tools/mtmd/clip-model.h +++ 
b/tools/mtmd/clip-model.h @@ -329,19 +329,19 @@ struct clip_model { // mobilenetv5 for gemma3n std::vector mobilenet_blocks; - std::vector mobilenet_stage_ends; // NEW: Track end indices of stages + std::vector mobilenet_stage_ends; ggml_tensor * mobilenet_stem_conv_w = nullptr; ggml_tensor * mobilenet_stem_conv_b = nullptr; ggml_tensor * mobilenet_stem_norm_w = nullptr; ggml_tensor * mm_post_proj_norm_w = nullptr; // Multi-Scale Fusion Adapter (MSFA) components - ggml_tensor * msfa_concat_conv_w = nullptr; // Concatenated feature processing + ggml_tensor * msfa_concat_conv_w = nullptr; ggml_tensor * msfa_concat_norm_w = nullptr; - ggml_tensor * msfa_ffn_expand_w = nullptr; // FFN expansion - ggml_tensor * msfa_ffn_project_w = nullptr; // FFN projection - ggml_tensor * msfa_ffn_expand_bn = nullptr; // NEW: FFN expansion batch norm - ggml_tensor * msfa_ffn_project_bn = nullptr; // NEW: FFN projection batch norm + ggml_tensor * msfa_ffn_expand_w = nullptr; + ggml_tensor * msfa_ffn_project_w = nullptr; + ggml_tensor * msfa_ffn_expand_bn = nullptr; + ggml_tensor * msfa_ffn_project_bn = nullptr; // pixtral, glm4v diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 4c357aab19e..9e4519c502b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -263,69 +263,26 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } } -// Helper: Normalize over the Channel dimension (dim 2 in [W, H, C, B]) +// --- Helpers for MobileNetV5 Blocks --- // RMS Norm 2D - normalizes over channels for each spatial position -// PyTorch: v = torch.mean(x.pow(2), dim=1) - mean over C for each (N,H,W) -// We need to normalize each spatial position across its C channels -ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps, int block_idx) { +ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { // inp: [W, H, C, B] - const int64_t W = inp->ne[0]; - const int64_t H = inp->ne[1]; - const 
int64_t C = inp->ne[2]; - const int64_t B = inp->ne[3]; - // Step 1: Permute [W, H, C, B] -> [C, W, H, B] - // Puts Channels in ne[0] (contiguous) ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); cur = ggml_cont(ctx0, cur); - - // Step 2: Reshape [C, W, H, B] -> [C, W*H*B] - // We now have a 2D matrix where columns are Channels (ne[0]) - // and rows are Spatial/Batch (ne[1]). - // cur = ggml_reshape_2d(ctx0, cur, C, W * H * B); - - // REMOVED Step 3 (Transpose). - // We WANT ne[0] to be C so rms_norm reduces over it. - - // Step 4: Apply RMS Norm - // Normalizes ne[0] (C) for every element in ne[1] (Spatial/Batch). cur = ggml_rms_norm(ctx0, cur, eps); - - // Step 5: Apply weight if present + if (weight) { - // weight is [C] - // cur is [C, W*H*B] - // ggml_mul broadcasts automatically along higher dims. - // It multiplies element i of weight with element i of cur's ne[0]. cur = ggml_mul(ctx0, cur, weight); } - // REMOVED Step 6 (Transpose back). We never transposed. - - // Step 7: Reshape back to [C, W, H, B] - // cur = ggml_reshape_4d(ctx0, cur, C, W, H, B); - - // Step 8: Permute back to [W, H, C, B] - // ne[0]=C, ne[1]=W, ne[2]=H, ne[3]=B - // We want new ne[0] to be old ne[1] (W) - // We want new ne[1] to be old ne[2] (H) - // We want new ne[2] to be old ne[0] (C) - // We want new ne[3] to be old ne[3] (B) cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); - - // cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - // Note: The second permute in your original code was likely redundant/incorrect - // after the first one. A single permute is sufficient to restore order. 
cur = ggml_cont(ctx0, cur); return cur; } - -// ------------------------------------------------------------------------ // Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) -// ------------------------------------------------------------------------ ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { const int64_t ih = inp->ne[1]; // height const int64_t iw = inp->ne[0]; // width @@ -358,31 +315,20 @@ ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_ return inp; } -// ------------------------------------------------------------------------ -// Edge Residual Block (Stage 0) - CORRECTED -// ------------------------------------------------------------------------ -ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { + +// Edge Residual Block (Stage 0) +ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { ggml_tensor * cur = inp; // 1. Expansion Conv (3x3) - // -------------------------------------------------------------------- - // LOGIC FIX: - // Block 0 (stride=2): Uses "Conv2dSame". We must manually pad, then conv with pad=0. - // Block 1,2 (stride=1): Uses standard "Conv2d" with padding=(1,1). 
- // -------------------------------------------------------------------- - if (stride == 2) { // Case: Downsampling (Block 0) // Replicates Conv2dSame(kernel=3, stride=2) - // We calculate asymmetric padding dynamically cur = pad_same_2d(cur, 3, 3, stride, stride); - - // Perform conv with 0 padding because we just applied it manually cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); } else { // Case: Normal 3x3 Block (Block 1, 2) // Replicates Conv2d(kernel=3, stride=1, padding=1) - // Standard symmetric padding of 1 is sufficient for 3x3 s1 to keep dims same cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); } @@ -404,7 +350,7 @@ ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenet return cur; } -ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { +ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { ggml_tensor * cur = inp; // 1. 
Depthwise Start (Optional) @@ -412,7 +358,6 @@ ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobil if (block.dw_start_w) { int k = block.dw_start_w->ne[0]; // 3 or 5 int p = k / 2; - // cur = ggml_conv_2d_dw_direct(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); } @@ -462,8 +407,6 @@ ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobil bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); bool same_channel = (inp->ne[2] == cur->ne[2]); if (same_spatial && same_channel) { - // --- FIXED LAYER SCALING --- - // --------------------------- cur = ggml_add(ctx0, cur, inp); } @@ -471,24 +414,12 @@ ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobil } // MobileNetV5 Builder (Gemma 3n) - Attention Block -ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block, int block_idx) { - - // ... [Debug Helpers kept same as original] ... - // auto DEBUG_SHAPE = [&](const char* label, ggml_tensor* t) { /* ... */ }; - // auto REGISTER_DEBUG = [&](const std::string& name, ggml_tensor* t) { /* ... */ }; - - // // Debug input - // if (block_idx == 33 || block_idx == 50 || block_idx == 52) { - // char debug_name[128]; - // snprintf(debug_name, sizeof(debug_name), "block%d_input", block_idx); - // REGISTER_DEBUG(debug_name, inp); - // } - +ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { ggml_tensor * cur = inp; // --- Norm --- if (block.attn_norm_w) { - cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f, block_idx); + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); } // --- 1. 
Q Calculation --- @@ -502,7 +433,7 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 if (block.attn_k_norm_w) { - k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f, block_idx); + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); } } ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); @@ -515,13 +446,11 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 if (block.attn_v_norm_w) { - v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f, block_idx); + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); } } ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); - // --- Reshape & Permute Logic --- - const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; const int D = k->ne[2]; // Head dimension const int n_head = q->ne[2] / D; @@ -543,8 +472,6 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene k = ggml_cont(ctx0, k); // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] - // NOTE: We keep V as [M, D] because ggml_mul_mat expects src0^T * src1. - // To get output [D, N], we will need [M, D]^T * [M, N]. v = ggml_reshape_3d(ctx0, v, M, D, B); v = ggml_reshape_4d(ctx0, v, M, D, 1, B); v = ggml_cont(ctx0, v); // [M, D, 1, B] @@ -553,82 +480,32 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene float scale = 1.0f / sqrtf((float)D); // Step 1: Compute Q @ K.T - // Q: [D, N, n_head, B] - // K: [D, M, 1, B] - // ggml_mul_mat computes K^T * Q -> [D, M]^T * [D, N] -> [M, D] * [D, N] -> [M, N] - // Implicit Broadcast: K has 1 head, Q has n_head. 
ggml handles this automatically. - ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); // Result: [M, N, n_head, B] (in ggml layout) - - // // Debug scores - // if (block_idx == 33) { - // char debug_name[128]; - // snprintf(debug_name, sizeof(debug_name), "block%d_scores_raw", block_idx); - // REGISTER_DEBUG(debug_name, scores); - // } + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); scores = ggml_scale(ctx0, scores, scale); - // Step 2: Softmax - // scores is [M, N, n_head, B] (ne0=M, ne1=N) - // We need softmax over M (keys). - // ggml_soft_max applies to dim 0, which is M. Perfect - no permute needed! scores = ggml_soft_max(ctx0, scores); - // Step 3: Compute Attn @ V - // V: [M, D, 1, B] (ne0=M, ne1=D) - // Scores: [M, N, n_head, B] (ne0=M, ne1=N) - // - // ggml_mul_mat computes V^T * Scores -> [M, D]^T * [M, N] -> [D, M] * [M, N] -> [D, N] - // Implicit Broadcast: V has 1 head, Scores has n_head. ggml handles this automatically. - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); // Result: [N, D, n_head, B] - - // // Debug kqv - // if (block_idx == 33) { - // char debug_name[128]; - // snprintf(debug_name, sizeof(debug_name), "block%d_kqv_out", block_idx); - // REGISTER_DEBUG(debug_name, kqv); - // } - - // --- Reshape back to spatial layout --- - // kqv is [N, D, n_head, B]. We want [D, N, n_head, B] to merge heads. 
- kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); // [D, N, n_head, B] + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); + + kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); kqv = ggml_cont(ctx0, kqv); - // Reshape to [N, D*n_head, B] then [W, H, C, B] + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); kqv = ggml_cont(ctx0, kqv); -// Output projection + // Output projection cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); // --- Residual & Layer Scale (FIXED) --- if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { if (block.layer_scale_w) { - // FIX: Simplified Layer Scale. No permute needed. - // Tensor is [W, H, C, B]. Weight is [C]. - // We reshape Weight to [1, 1, C, 1]. - // GGML will broadcast W and H dimensions automatically. - - // Debug print shape of block.layer_scale_w - // fprintf(stderr, "DEBUG: block %d layer_scale_w shape: [%ld x %ld x %ld x %ld]\n", block_idx, block.layer_scale_w->ne[0], block.layer_scale_w->ne[1], block.layer_scale_w->ne[2], block.layer_scale_w->ne[3]); - - // Debug print shape of cur before scaling - // fprintf(stderr, "DEBUG: block %d cur shape before scaling: [%ld x %ld x %ld x %ld]\n", block_idx, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]); - - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, 1, 1, block.layer_scale_w->ne[0], 1); - - // Debug print shape of scale_w_reshaped - // fprintf(stderr, "DEBUG: block %d scale_w_reshaped shape: [%ld x %ld x %ld x %ld]\n", block_idx, scale_w_reshaped->ne[0], scale_w_reshaped->ne[1], scale_w_reshaped->ne[2], scale_w_reshaped->ne[3]); - cur = ggml_mul(ctx0, cur, scale_w_reshaped); } - - // Residual Addition - // 'cur' is the pointer to the graph node of the attention output. - // 'inp' is the pointer to the graph node of the block input. 
cur = ggml_add(ctx0, cur, inp); } diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 9946ca6afa8..88bd1e6fcb9 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -9,8 +9,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding - // ggml_tensor * mobilenet_stem_conv_w_fixed = fix_1x1_weight(model.mobilenet_stem_conv_w); - cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 if (model.mobilenet_stem_conv_b) { // Bias is [C, 1, 1, 1], need to reshape to [1, 1, C, 1] for broadcasting to [W, H, C, B] @@ -47,22 +45,9 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { const auto & block = model.mobilenet_blocks[i]; int stride = is_stage_start(i) ? 2 : 1; - // Debug block type - const char* block_type = block.s0_conv_exp_w ? "edge_residual" : - block.attn_q_w ? "attention" : "inverted_residual"; - - // // Debug input for problematic blocks - // if (i >= 50 && i <= 54) { - // fprintf(stderr, "DEBUG: Block %d (%s) input shape: [%ld, %ld, %ld, %ld], stride=%d\n", - // i, block_type, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], stride); - // } - - if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride, i); - else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block, i); - else cur = build_inverted_residual(cur, block, stride, i); - - // Register block output for debugging - char block_name[64]; + if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride); + else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block); + else cur = build_inverted_residual(cur, block, stride); if (is_fusion_point(i)) { @@ -94,7 +79,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // ggml_upscale generally takes integer factors or target sizes depending on helper. 
// Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). int scale_w = high_res_w / feat_w; - int scale_h = high_res_h / feat_h; + // int scale_h = high_res_h / feat_h; // Safety check for non-integer scaling if strictly replicating if (high_res_w % feat_w != 0) { @@ -122,9 +107,8 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); - // MISSING IN YOUR CODE: Expansion Norm if (model.msfa_ffn_expand_bn) { - cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); // Helper to apply RMSNorm + cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); } cur = ggml_gelu(ctx0, cur); @@ -136,7 +120,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); - // MISSING IN YOUR CODE: Projection Norm // UniversalInvertedResidual typically has a norm after projection if (model.msfa_ffn_project_bn) { cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); @@ -151,17 +134,11 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { if (current_w > target_out_res) { int s = current_w / target_out_res; - - // PyTorch Logic: - // if divisible: avg_pool - // if not divisible: bilinear interpolate (hard to do in pure ggml, usually assumed divisible here) - + if (current_w % target_out_res == 0) { // Avg Pool: Kernel=s, Stride=s cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); } else { - // Fallback or Error: ggml doesn't easily support bilinear downsampling - // without custom ops, but standard models usually stick to integer strides. fprintf(stderr, "Error: Irregular downsampling stride required.\n"); } @@ -174,7 +151,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { } } - // 4. Gemma 3n Multimodal Projection (Embedder) - FULL FIX + // 4. 
Gemma 3n Multimodal Projection (Embedder) // Input: 'cur' is [Width, Height, Channels, Batch] int W = cur->ne[0]; int H = cur->ne[1]; @@ -189,7 +166,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { cur = ggml_cont(ctx0, cur); - // 2. FEATURE SCALING (Missing in your original code) + // 2. FEATURE SCALING // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 // This prevents the signal from vanishing during the subsequent RMSNorm. const float scale_factor = sqrtf((float)C); @@ -227,8 +204,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False) // with_scale=False means weight is registered as buffer with value 1.0 // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1 - // NOTE: Vision embeddings intentionally have magnitude ~1, different from - // text embeddings at ~sqrt(n_embd). The model was trained with this mismatch. { const float eps = 1e-6f; cur = ggml_rms_norm(ctx0, cur, eps); @@ -239,9 +214,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { } } - - // cur = ggml_scale(ctx0, cur, scale_factor); - ggml_build_forward_expand(gf, cur); return gf; } \ No newline at end of file From 4589d3eb748c48a33446f6d1465cb8b9a65d3635 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 10:59:15 +0000 Subject: [PATCH 04/20] Fix permute and remove transpose of projection weights --- tools/mtmd/models/mobilenetv5.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 88bd1e6fcb9..6dd1a3d465d 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -160,7 +160,8 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1. 
Permute and Flatten to [Channels, Tokens, Batch] // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // -> [C, W, H, B] + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B] + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B] cur = ggml_cont(ctx0, cur); cur = ggml_reshape_3d(ctx0, cur, C, W*H, B); cur = ggml_cont(ctx0, cur); @@ -193,11 +194,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Need to transpose for ggml_mul_mat which computes A^T * B // This matches Gemma3's projection at line ~1319 which also transposes if (model.mm_input_proj_w) { - // cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); - cur = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), - cur); - + cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); } // 5. POST PROJECTION NORM From 47423a295ba1c272d38b85f98e6da89be995b7c0 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 11:50:10 +0000 Subject: [PATCH 05/20] Fix comments, remove debugging prints from hf_to_gguf --- convert_hf_to_gguf.py | 17 +++++------------ tools/mtmd/models/mobilenetv5.cpp | 4 +--- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 36a7ed000af..dd94efe7ed0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6018,22 +6018,16 @@ def find_vparam(self, keys: list[str], optional: bool = False) -> Any: return super().find_vparam(keys, optional) def set_gguf_parameters(self): - # MobileNetV5 requires ImageNet normalization values - # Override preprocessor_config to ensure correct values before calling super() - # IMAGENET_MEAN = [0.485, 0.456, 0.406] - # IMAGENET_STD = [0.229, 0.224, 0.225] + # MobileNetV5 does not use normalisation at all IMAGENET_MEAN = [0.5 , 0.5 , 0.5 ] IMAGENET_STD = [0.5 , 0.5 , 0.5 ] - print("test") - # Check if preprocessor_config has incorrect normalization values 
if "image_mean" in self.preprocessor_config: current_mean = self.preprocessor_config["image_mean"] if current_mean != IMAGENET_MEAN: logger.warning(f"Overriding image_mean from {current_mean} to ImageNet standard {IMAGENET_MEAN}") self.preprocessor_config["image_mean"] = IMAGENET_MEAN - print("test2") else: logger.info(f"Setting image_mean to ImageNet standard {IMAGENET_MEAN}") self.preprocessor_config["image_mean"] = IMAGENET_MEAN @@ -6060,7 +6054,6 @@ def set_gguf_parameters(self): # Image sequence length (256 tokens = 16x16 for Gemma3n) image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - # Note: Additional metadata can be added as needed def tensor_force_quant(self, name, new_name, bid, n_dims): # Force quantization settings for specific tensor types @@ -6110,17 +6103,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def map_tensor_name(self, name: str) -> str: """Map Gemma3n tensor names to GGUF format""" # Projector tensors (from embed_vision) - use mm. prefix like Gemma3 - # IMPORTANT: Keep the .weight suffix to match C++ expectations + # IMPORTANT: Keep the .weight suffix to match ggml expectations if name == "embedding.weight": return "mm.embedding.weight" if name == "embedding_projection.weight": - return "mm.input_projection.weight" # Main projection used by C++ + return "mm.input_projection.weight" # Main projection if name == "hard_emb_norm.weight": return "mm.hard_emb_norm.weight" # Hard embedding normalization if name == "soft_emb_norm.weight": - return "mm.soft_emb_norm.weight" # Soft embedding normalization (used by C++) + return "mm.soft_emb_norm.weight" # Soft embedding normalization if name == "post_proj_norm.weight": - return "mm.post_proj_norm.weight" # Post projection normalization (CRITICAL for Gemma3n) + return "mm.post_proj_norm.weight" # Post projection normalization (if exists) # Vision tower tensors - add v.enc. 
prefix for MobileNetV5 encoder if name.startswith("vision_tower."): diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 6dd1a3d465d..930da38e302 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -55,7 +55,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { } } - // 3. Multi-Scale Fusion Adapter (MSFA) - REPLICATED & FIXED + // 3. Multi-Scale Fusion Adapter (MSFA) if (!intermediate_features.empty()) { // A. Reference Resolution: PyTorch implementation uses inputs[0] @@ -191,8 +191,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 4. PROJECTION // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] - // Need to transpose for ggml_mul_mat which computes A^T * B - // This matches Gemma3's projection at line ~1319 which also transposes if (model.mm_input_proj_w) { cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); } From 67801e5b62a68db509ef879a0c47b5bf096df785 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 19:13:47 +0000 Subject: [PATCH 06/20] 1. Hard-code image_mean = 0 and image_std = 1 2. Use available tensor mapping logic 3. 
Remove redundant chat template replacement of soft tokens placeholder with media placeholder --- convert_hf_to_gguf.py | 113 +++++---------------------------- gguf-py/gguf/constants.py | 9 +++ gguf-py/gguf/tensor_mapping.py | 21 ++++++ 3 files changed, 46 insertions(+), 97 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dd94efe7ed0..55e82fe9128 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5966,9 +5966,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") class Gemma3nVisionModel(MmprojModel): """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" - - # MobileNetV5 doesn't have transformer layers, so we don't need block count - # Set n_block_keys to empty list to skip the find_hparam check n_block_keys = [] def find_hparam(self, keys: list[str], optional: bool = False) -> Any: @@ -5984,34 +5981,17 @@ def __init__(self, *args, **kwargs): def find_vparam(self, keys: list[str], optional: bool = False) -> Any: """Override to provide hardcoded MobileNetV5 parameters that aren't in config""" - # MobileNetV5 hardcodes these values in the architecture definition - # rather than storing them in config.json - # Handle empty keys list (n_block_keys) - return 0 for CNN architecture if not keys: return 0 - # Check if we're looking for image_size - if "image_size" in keys: - # MobileNetV5 300m_enc uses 768x768 input - return 768 - - # Check if we're looking for patch_size - if "patch_size" in keys: - # MobileNetV5 is CNN-based, doesn't use patches - # Set to 1 for compatibility - return 1 - - # Check if we're looking for intermediate_size if "intermediate_size" in keys: - # MobileNetV5 uses expansion ratios in inverted residual blocks # Typical expansion is 4x the embedding dimension hidden_size = self.hparams_vision.get("hidden_size", 2048) return hidden_size * 4 - # Check if we're looking for 
num_attention_heads if "num_attention_heads" in keys or "num_heads" in keys: - # MobileNetV5 uses Multi-Query Attention with 8 heads + # Multi-Query Attention with 8 heads return 8 # For other parameters, use parent implementation @@ -6019,41 +5999,25 @@ def find_vparam(self, keys: list[str], optional: bool = False) -> Any: def set_gguf_parameters(self): # MobileNetV5 does not use normalisation at all - IMAGENET_MEAN = [0.5 , 0.5 , 0.5 ] - IMAGENET_STD = [0.5 , 0.5 , 0.5 ] - - # Check if preprocessor_config has incorrect normalization values - if "image_mean" in self.preprocessor_config: - current_mean = self.preprocessor_config["image_mean"] - if current_mean != IMAGENET_MEAN: - logger.warning(f"Overriding image_mean from {current_mean} to ImageNet standard {IMAGENET_MEAN}") - self.preprocessor_config["image_mean"] = IMAGENET_MEAN - else: - logger.info(f"Setting image_mean to ImageNet standard {IMAGENET_MEAN}") - self.preprocessor_config["image_mean"] = IMAGENET_MEAN - - if "image_std" in self.preprocessor_config: - current_std = self.preprocessor_config["image_std"] - if current_std != IMAGENET_STD: - logger.warning(f"Overriding image_std from {current_std} to ImageNet standard {IMAGENET_STD}") - self.preprocessor_config["image_std"] = IMAGENET_STD - else: - logger.info(f"Setting image_std to ImageNet standard {IMAGENET_STD}") - self.preprocessor_config["image_std"] = IMAGENET_STD + self.preprocessor_config["image_mean"] = [0.0 , 0.0 , 0.0 ] + self.preprocessor_config["image_std"] = [1.0 , 1.0 , 1.0 ] + self.hparams_vision["image_size"] = self.preprocessor_config.get( + "size", {"height": 768, "width": 768} + )["height"] + + # Image sequence length (256 tokens = 16x16 for Gemma3n) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + image_size = self.hparams_vision["image_size"] + self.hparams_vision["patch_size"] = image_size // image_seq_length # Now call parent which will use the corrected values super().set_gguf_parameters() - hparams 
= self.hparams # Set projector type to GEMMA3N self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) # MobileNetV5 specific parameters - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_vision_use_gelu(True) # MobileNetV5 uses approximate GELU - - # Image sequence length (256 tokens = 16x16 for Gemma3n) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) def tensor_force_quant(self, name, new_name, bid, n_dims): # Force quantization settings for specific tensor types @@ -6090,7 +6054,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Handle normalization layer naming name = name.replace("hard_embedding_norm", "hard_emb_norm") name = name.replace("soft_embedding_norm", "soft_emb_norm") - # name = name.replace("embedding_post_projection_norm", "post_proj_norm") # Gemma3n uses Gemma3p5RMSNorm which has scale_shift=0, so no correction needed # Unlike Gemma3 which uses Gemma3RMSNorm with scale_shift=1 @@ -6098,37 +6061,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # No correction needed for Gemma3n pass - return [(self.map_tensor_name(name), data_torch)] - - def map_tensor_name(self, name: str) -> str: - """Map Gemma3n tensor names to GGUF format""" - # Projector tensors (from embed_vision) - use mm. 
prefix like Gemma3 - # IMPORTANT: Keep the .weight suffix to match ggml expectations - if name == "embedding.weight": - return "mm.embedding.weight" - if name == "embedding_projection.weight": - return "mm.input_projection.weight" # Main projection - if name == "hard_emb_norm.weight": - return "mm.hard_emb_norm.weight" # Hard embedding normalization - if name == "soft_emb_norm.weight": - return "mm.soft_emb_norm.weight" # Soft embedding normalization - if name == "post_proj_norm.weight": - return "mm.post_proj_norm.weight" # Post projection normalization (if exists) - - # Vision tower tensors - add v.enc. prefix for MobileNetV5 encoder if name.startswith("vision_tower."): - # Remove vision_tower prefix and add v.enc. prefix - tensor_suffix = name[13:] # Remove "vision_tower." - return f"v.enc.{tensor_suffix}" - - # If no match, try parent implementation - try: - return super().map_tensor_name(name) - except ValueError: - # If parent also can't map it, provide a sensible default - # This shouldn't happen, but provides a fallback - logger.warning(f"Using fallback mapping for tensor: {name}") - return f"v.{name}" + tensor_suffix = name[13:] + return [(f"v.enc.{tensor_suffix}", data_torch)] + else: + return [(self.map_tensor_name(name), data_torch)] @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") @@ -6173,24 +6110,6 @@ def set_vocab(self): if vocab_size_per_layer_input is not None: self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input - # Fix chat template for Gemma3n multimodal: replace special token placeholders with mtmd markers - # The mtmd library uses <__media__> as the default marker for images/audio - # but Gemma3n's chat template uses and - chat_template_key = "tokenizer.chat_template" - for kv_dict in self.gguf_writer.kv_data: - if chat_template_key in kv_dict: - template_value = kv_dict[chat_template_key].value - - # Replace soft token placeholders with mtmd markers - if '' in template_value or '' in 
template_value: - logger.info("Fixing Gemma3n chat template: replacing soft token placeholders with mtmd markers") - template_value = template_value.replace('', '<__media__>') - template_value = template_value.replace('', '<__media__>') - - # Update the value in place - kv_dict[chat_template_key].value = template_value - break - def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 41654b22b5d..869a8582b12 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -667,6 +667,9 @@ class MODEL_TENSOR(IntEnum): V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_MM_EMBEDDING = auto() # gemma3n + V_MM_HARD_EMB_NORM = auto() # gemma3n + V_MM_POST_PROJ_NORM = auto() # gemma3n V_RESMPL_POS_EMBD_K = auto() # minicpmv V_RESMPL_ATTN_Q = auto() # minicpmv V_RESMPL_ATTN_K = auto() # minicpmv @@ -1059,6 +1062,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", + MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", + MODEL_TENSOR.V_MM_POST_PROJ_NORM: "mm.post_proj_norm", MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", @@ -1157,6 +1163,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_INP_PROJ, MODEL_TENSOR.V_MM_INP_NORM, MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_MM_EMBEDDING, + MODEL_TENSOR.V_MM_HARD_EMB_NORM, + MODEL_TENSOR.V_MM_POST_PROJ_NORM, MODEL_TENSOR.V_RESMPL_POS_EMBD_K, MODEL_TENSOR.V_RESMPL_ATTN_Q, MODEL_TENSOR.V_RESMPL_ATTN_K, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 301aafa9102..3e1cf8a136f 100644 --- 
a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -119,6 +119,27 @@ class TensorNameMap: MODEL_TENSOR.CONV1D: ( "backbone.embed", # roberta ), + + # Vision multimodal projector tensors (non-block) for gemma3n + MODEL_TENSOR.V_MM_INP_PROJ: ( + "embedding_projection", # gemma3n + ), + + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( + "soft_emb_norm", # gemma3n + ), + + MODEL_TENSOR.V_MM_EMBEDDING: ( + "embedding", # gemma3n + ), + + MODEL_TENSOR.V_MM_HARD_EMB_NORM: ( + "hard_emb_norm", # gemma3n + ), + + MODEL_TENSOR.V_MM_POST_PROJ_NORM: ( + "post_proj_norm", # gemma3n + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { From 04947c7f9e355914e7e4d6cdf79c28193218d988 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 19:19:47 +0000 Subject: [PATCH 07/20] 1. Move mobilenetv5 helpers declarations to `clip_graph_mobilenetv5` struct and definitions to mobilenetv5.cpp 2.Remove unused `clip_is_gemma3n` func declarations and definitions 3. Remove redundant `rescale_image_u8_to_f32` func and use `normalize_image_u8_to_f32` with zero mean and unit std 4. 
Calculate n_patches using image_size / patch_size --- tools/mtmd/clip-graph.h | 28 --- tools/mtmd/clip.cpp | 271 +----------------------------- tools/mtmd/clip.h | 1 - tools/mtmd/models/mobilenetv5.cpp | 249 +++++++++++++++++++++++++++ tools/mtmd/models/models.h | 28 +++ 5 files changed, 279 insertions(+), 298 deletions(-) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 6a9efb933e5..2b1915779f2 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -70,34 +70,6 @@ struct clip_graph { ggml_tensor * build_inp_raw(int channels = 3); - ggml_tensor * rms_norm_2d( - ggml_tensor * inp, - ggml_tensor * weight, - float eps = 1e-6f); - - ggml_tensor* pad_same_2d( - ggml_tensor* inp, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int dilation_h = 1, - int dilation_w = 1); - - ggml_tensor * build_edge_residual( - ggml_tensor * inp, - const mobilenetv5_block & block, - int stride); - - ggml_tensor * build_inverted_residual( - ggml_tensor * inp, - const mobilenetv5_block & block, - int stride); - - ggml_tensor * build_mobilenet_attn( - ggml_tensor * inp, - const mobilenetv5_block & block); - ggml_tensor * build_norm( ggml_tensor * cur, ggml_tensor * mw, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9e4519c502b..e86a09bb5c1 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -263,255 +263,6 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } } -// --- Helpers for MobileNetV5 Blocks --- -// RMS Norm 2D - normalizes over channels for each spatial position -ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { - // inp: [W, H, C, B] - - ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); - cur = ggml_cont(ctx0, cur); - cur = ggml_rms_norm(ctx0, cur, eps); - - if (weight) { - cur = ggml_mul(ctx0, cur, weight); - } - - cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); - cur = ggml_cont(ctx0, cur); - - return cur; -} - -// Helper for Conv2dSame 
padding (asymmetric SAME padding like PyTorch/TF) -ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { - const int64_t ih = inp->ne[1]; // height - const int64_t iw = inp->ne[0]; // width - - // Calculate output size (ceil division) - const int64_t oh = (ih + stride_h - 1) / stride_h; - const int64_t ow = (iw + stride_w - 1) / stride_w; - - // Calculate padding needed - const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); - const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); - - // Split padding asymmetrically - const int pad_h_top = pad_h / 2; - const int pad_h_bottom = pad_h - pad_h_top; - const int pad_w_left = pad_w / 2; - const int pad_w_right = pad_w - pad_w_left; - - // Apply padding if needed - // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) - // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch - if (pad_h > 0 || pad_w > 0) { - inp = ggml_pad_ext(ctx0, inp, - pad_w_left, pad_w_right, // width padding (dim 0) - pad_h_top, pad_h_bottom, // height padding (dim 1) - 0, 0, // no channel padding (dim 2) - 0, 0); // no batch padding (dim 3) - } - - return inp; -} - - -// Edge Residual Block (Stage 0) -ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { - ggml_tensor * cur = inp; - - // 1. 
Expansion Conv (3x3) - if (stride == 2) { - // Case: Downsampling (Block 0) - // Replicates Conv2dSame(kernel=3, stride=2) - cur = pad_same_2d(cur, 3, 3, stride, stride); - cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); - } else { - // Case: Normal 3x3 Block (Block 1, 2) - // Replicates Conv2d(kernel=3, stride=1, padding=1) - cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); - } - - // BN + Activation - if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); - cur = ggml_gelu(ctx0, cur); - - // 2. Pointwise Linear Conv (1x1) - // 1x1 Convs usually have padding=0 and stride=1 - cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); - if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); - - // 3. Residual Connection - // Only apply residual if spatial dimensions and channels match (stride 1) - if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { - cur = ggml_add(ctx0, cur, inp); - } - - return cur; -} - -ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { - ggml_tensor * cur = inp; - - // 1. Depthwise Start (Optional) - // NOTE: dw_start always has stride=1 (no downsampling here) - if (block.dw_start_w) { - int k = block.dw_start_w->ne[0]; // 3 or 5 - int p = k / 2; - cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); - if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); - } - - // 2. Pointwise Expansion (1x1) - if (block.pw_exp_w) { - // Standard 1x1 conv, pad=0, stride=1 - cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); - if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); - cur = ggml_gelu(ctx0, cur); - } - - // 3. 
Depthwise Mid (Optional) - // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) - if (block.dw_mid_w) { - int k = block.dw_mid_w->ne[0]; // 3 or 5 - - if (stride > 1) { - // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding - cur = pad_same_2d(cur, k, k, stride, stride); - cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 - } else { - // Case: Stride 1 -> Use Standard Symmetric Padding - int p = k / 2; - cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); - } - - if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); - cur = ggml_gelu(ctx0, cur); - } - - // 4. Pointwise Projection (1x1) - if (block.pw_proj_w) { - cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); - if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); - } - - // Apply Layer Scaling if present - if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - - cur = ggml_mul(ctx0, cur, scale_w_reshaped); - } - - // 5. Residual Connection - bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); - bool same_channel = (inp->ne[2] == cur->ne[2]); - if (same_spatial && same_channel) { - cur = ggml_add(ctx0, cur, inp); - } - - return cur; -} - -// MobileNetV5 Builder (Gemma 3n) - Attention Block -ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { - ggml_tensor * cur = inp; - - // --- Norm --- - if (block.attn_norm_w) { - cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); - } - - // --- 1. Q Calculation --- - ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); - - // --- 2. 
K Calculation (Downsampled) --- - // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) - ggml_tensor * k_inp = cur; - if (block.attn_k_dw_w) { - int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 - k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding - k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 - if (block.attn_k_norm_w) { - k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); - } - } - ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); - - // --- 3. V Calculation (Downsampled) --- - // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) - ggml_tensor * v_inp = cur; - if (block.attn_v_dw_w) { - int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 - v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding - v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 - if (block.attn_v_norm_w) { - v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); - } - } - ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); - - const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; - const int D = k->ne[2]; // Head dimension - const int n_head = q->ne[2] / D; - const int N = W * H; - - // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] - q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); - q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); - q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] - q = ggml_cont(ctx0, q); - - const int Wk = k->ne[0]; const int Hk = k->ne[1]; - const int M = Wk * Hk; - - // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] - k = ggml_reshape_3d(ctx0, k, M, D, B); - k = ggml_reshape_4d(ctx0, k, M, D, 1, B); - k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] - k = ggml_cont(ctx0, k); - - // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] - v = ggml_reshape_3d(ctx0, v, M, D, B); - v = ggml_reshape_4d(ctx0, v, M, D, 1, 
B); - v = ggml_cont(ctx0, v); // [M, D, 1, B] - - // --- Multi-Query Attention --- - float scale = 1.0f / sqrtf((float)D); - - // Step 1: Compute Q @ K.T - ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); - - scores = ggml_scale(ctx0, scores, scale); - - scores = ggml_soft_max(ctx0, scores); - - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); - - kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); - kqv = ggml_cont(ctx0, kqv); - - - kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); - kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); - kqv = ggml_cont(ctx0, kqv); - - // Output projection - cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); - - // --- Residual & Layer Scale (FIXED) --- - if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { - if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - cur = ggml_mul(ctx0, cur, scale_w_reshaped); - } - cur = ggml_add(ctx0, cur, inp); - } - - return cur; -} - // siglip2 naflex ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; @@ -2414,18 +2165,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } -// Rescale image from u8 to f32 without normalization (for models like GEMMA3N that use SiglipImageProcessorFast) -// This only converts from [0, 255] to [0.0, 1.0] range without applying mean/std normalization -static void rescale_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(src.buf[i]) / 255.0f; - } -} - // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not static void 
normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { dst.nx = src.nx; @@ -3123,13 +2862,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_GEMMA3N: { - // GEMMA3N uses SiglipImageProcessorFast which only rescales to [0.0, 1.0] without normalization - // Resize to 768x768 using bilinear interpolation, then rescale to f32 clip_image_u8 resized_image; int sz = params.image_size; img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false); clip_image_f32_ptr img_f32(clip_image_f32_init()); - rescale_image_u8_to_f32(resized_image, *img_f32); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(img_f32)); } break; @@ -3396,7 +3133,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution // regardless of input size (see architecture description) - n_patches = 16 * 16; // 256 tokens + n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size; } break; case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: @@ -3969,10 +3706,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; } -bool clip_is_gemma3n(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3N; -} - bool clip_has_vision_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_VISION; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index c244df2677f..68a0d6e857e 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -107,7 +107,6 @@ bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx); -bool clip_is_gemma3n(const struct clip_ctx * 
ctx); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 930da38e302..bc1185c10eb 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -1,5 +1,254 @@ #include "models.h" +// --- Helpers for MobileNetV5 Blocks --- +// RMS Norm 2D - normalizes over channels for each spatial position +ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { + // inp: [W, H, C, B] + + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_rms_norm(ctx0, cur, eps); + + if (weight) { + cur = ggml_mul(ctx0, cur, weight); + } + + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + + return cur; +} + +// Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) +ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + const int64_t ih = inp->ne[1]; // height + const int64_t iw = inp->ne[0]; // width + + // Calculate output size (ceil division) + const int64_t oh = (ih + stride_h - 1) / stride_h; + const int64_t ow = (iw + stride_w - 1) / stride_w; + + // Calculate padding needed + const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); + const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); + + // Split padding asymmetrically + const int pad_h_top = pad_h / 2; + const int pad_h_bottom = pad_h - pad_h_top; + const int pad_w_left = pad_w / 2; + const int pad_w_right = pad_w - pad_w_left; + + // Apply padding if needed + // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch + if (pad_h > 0 || pad_w > 0) { + 
inp = ggml_pad_ext(ctx0, inp, + pad_w_left, pad_w_right, // width padding (dim 0) + pad_h_top, pad_h_bottom, // height padding (dim 1) + 0, 0, // no channel padding (dim 2) + 0, 0); // no batch padding (dim 3) + } + + return inp; +} + + +// Edge Residual Block (Stage 0) +ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Expansion Conv (3x3) + if (stride == 2) { + // Case: Downsampling (Block 0) + // Replicates Conv2dSame(kernel=3, stride=2) + cur = pad_same_2d(cur, 3, 3, stride, stride); + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); + } else { + // Case: Normal 3x3 Block (Block 1, 2) + // Replicates Conv2d(kernel=3, stride=1, padding=1) + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); + } + + // BN + Activation + if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); + cur = ggml_gelu(ctx0, cur); + + // 2. Pointwise Linear Conv (1x1) + // 1x1 Convs usually have padding=0 and stride=1 + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); + if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); + + // 3. Residual Connection + // Only apply residual if spatial dimensions and channels match (stride 1) + if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Depthwise Start (Optional) + // NOTE: dw_start always has stride=1 (no downsampling here) + if (block.dw_start_w) { + int k = block.dw_start_w->ne[0]; // 3 or 5 + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); + } + + // 2. 
Pointwise Expansion (1x1) + if (block.pw_exp_w) { + // Standard 1x1 conv, pad=0, stride=1 + cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 3. Depthwise Mid (Optional) + // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) + if (block.dw_mid_w) { + int k = block.dw_mid_w->ne[0]; // 3 or 5 + + if (stride > 1) { + // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding + cur = pad_same_2d(cur, k, k, stride, stride); + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 + } else { + // Case: Stride 1 -> Use Standard Symmetric Padding + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); + } + + if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 4. Pointwise Projection (1x1) + if (block.pw_proj_w) { + cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); + } + + // Apply Layer Scaling if present + if (block.layer_scale_w) { + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + + // 5. Residual Connection + bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); + bool same_channel = (inp->ne[2] == cur->ne[2]); + if (same_spatial && same_channel) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// MobileNetV5 Builder (Gemma 3n) - Attention Block +ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { + ggml_tensor * cur = inp; + + // --- Norm --- + if (block.attn_norm_w) { + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); + } + + // --- 1. 
Q Calculation --- + ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); + + // --- 2. K Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * k_inp = cur; + if (block.attn_k_dw_w) { + int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 + k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding + k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_k_norm_w) { + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); + } + } + ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); + + // --- 3. V Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * v_inp = cur; + if (block.attn_v_dw_w) { + int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 + v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding + v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_v_norm_w) { + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); + } + } + ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); + + const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; + const int D = k->ne[2]; // Head dimension + const int n_head = q->ne[2] / D; + const int N = W * H; + + // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] + q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); + q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); + q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] + q = ggml_cont(ctx0, q); + + const int Wk = k->ne[0]; const int Hk = k->ne[1]; + const int M = Wk * Hk; + + // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] + k = ggml_reshape_3d(ctx0, k, M, D, B); + k = ggml_reshape_4d(ctx0, k, M, D, 1, B); + k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] + k = ggml_cont(ctx0, k); + + // 
Process V: [Wk, Hk, D, B] -> [M, D, 1, B] + v = ggml_reshape_3d(ctx0, v, M, D, B); + v = ggml_reshape_4d(ctx0, v, M, D, 1, B); + v = ggml_cont(ctx0, v); // [M, D, 1, B] + + // --- Multi-Query Attention --- + float scale = 1.0f / sqrtf((float)D); + + // Step 1: Compute Q @ K.T + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); + + scores = ggml_scale(ctx0, scores, scale); + + scores = ggml_soft_max(ctx0, scores); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); + + kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); + kqv = ggml_cont(ctx0, kqv); + + + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); + kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); + kqv = ggml_cont(ctx0, kqv); + + // Output projection + cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); + + // --- Residual & Layer Scale (FIXED) --- + if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { + if (block.layer_scale_w) { + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + ggml_cgraph * clip_graph_mobilenetv5::build() { fprintf(stderr, "\n--- START build_mobilenetv5 ---\n"); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 3875285fe92..54664d10ce3 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -70,4 +70,32 @@ struct clip_graph_glm4v : clip_graph { struct clip_graph_mobilenetv5 : clip_graph { clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + + ggml_tensor * rms_norm_2d( + ggml_tensor * inp, + ggml_tensor * weight, + float eps = 1e-6f); + + ggml_tensor* pad_same_2d( + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, + int dilation_w = 1); + + ggml_tensor * build_edge_residual( + ggml_tensor * inp, + const 
mobilenetv5_block & block, + int stride); + + ggml_tensor * build_inverted_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride); + + ggml_tensor * build_mobilenet_attn( + ggml_tensor * inp, + const mobilenetv5_block & block); }; From 86618c7c0a0a3aff2aa12294fb17b2ad15610c29 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Mon, 22 Dec 2025 13:45:24 +0000 Subject: [PATCH 08/20] Remove obsolete comments --- tools/mtmd/clip.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index e86a09bb5c1..dd778ea3c96 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1536,8 +1536,6 @@ struct clip_model_loader { model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); - // IMPORTANT: Your GGUF log shows 'v.enc.msfa.norm.weight' -> shape {2048} - // Ensure TN_MNV5_MSFA_NORM matches this string model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); // Dynamically load blocks stage by stage @@ -1620,8 +1618,6 @@ struct clip_model_loader { // Load projection weights (similar to Gemma3) model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); - // model.mm_post_proj_norm_w = get_tensor(TN_MM_POST_PROJ_N); // CRITICAL: Post projection norm - // Load additional Gemma3n projection tensors model.mm_0_w = get_tensor("mm.embedding.weight", false); // Input embedding model.mm_1_w = get_tensor("mm.hard_emb_norm.weight", false); // Hard embedding norm } break; From e2835e9fbe0870dfb642f54295256515a3fd5471 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 26 Dec 2025 19:41:05 +0000 Subject: [PATCH 09/20] - convert_hf_to_gguf.py & constants.py & tensor_mapping.py: Use explicit mapping: Custom map for double indexed blocks and tensor_mapping.py for rest - convert_hf_to_gguf.py: Unsqueeze Stem Bias and Layer scale tensors to correct shape while converting 
to gguf - mobilenetv5.cpp: Remove explicit reshaping of Stem Bias and Layer scale which are now handled while converting to gguf, replace fprintf with LOG_* - clip.cpp: Remove unused embedding and hard_emb_norm tensor loading --- convert_hf_to_gguf.py | 78 ++++++++++++++++++++----------- gguf-py/gguf/constants.py | 33 ++++++++++--- gguf-py/gguf/tensor_mapping.py | 42 +++++++++++------ tools/mtmd/clip.cpp | 3 -- tools/mtmd/models/mobilenetv5.cpp | 42 +++++++---------- 5 files changed, 122 insertions(+), 76 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 55e82fe9128..abd65101b52 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5968,6 +5968,32 @@ class Gemma3nVisionModel(MmprojModel): """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" n_block_keys = [] + # Double indexed mapping for MobileNetV5 blocks + block_tensor_mapping = { + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.enc.blocks.{bid}.{sid}.conv_exp.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.enc.blocks.{bid}.{sid}.bn1.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.enc.blocks.{bid}.{sid}.conv_pwl.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.enc.blocks.{bid}.{sid}.bn2.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_start.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_start.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.conv.weight", + 
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.enc.blocks.{bid}.{sid}.layer_scale.gamma", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.query.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.key.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.value.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.output.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.key.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.key.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.value.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.value.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.enc.blocks.{bid}.{sid}.norm.weight", + } + def find_hparam(self, keys: list[str], optional: bool = False) -> Any: """Override to return 0 for block count since MobileNetV5 is CNN-based""" if not keys: # If n_block_keys is empty (our case) @@ -6027,10 +6053,23 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) + 
def custom_map(self, name: str) -> str: + """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping.""" + parts = name.split(".") + # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix + if len(parts) >= 7: + bid, sid = parts[4], parts[5] + suffix = ".".join(parts[6:]) + template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}" + if template in self.block_tensor_mapping: + return self.block_tensor_mapping[template].format(bid=bid, sid=sid) + + raise ValueError(f"Unknown name: {name}") + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # Gemma3n uses different prefixes than other models: + # Gemma3n uses # - model.embed_vision.* for projection layers # - model.vision_tower.* for vision encoder # Skip non-vision tensors @@ -6038,34 +6077,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name.startswith("model.vision_tower.")): return [] - # Strip "model." prefix to match expected llama.cpp format - if name.startswith("model."): - name = name[6:] # Remove "model." 
prefix - - # Process MobileNetV5 and projection tensors - name = name.replace("_weight", ".weight") - - # Rename embed_vision to match our C++ implementation expectations - name = name.replace("embed_vision.", "") - - # Rename vision_tower.timm_model to vision_tower for cleaner naming - name = name.replace("vision_tower.timm_model.", "vision_tower.") - - # Handle normalization layer naming - name = name.replace("hard_embedding_norm", "hard_emb_norm") - name = name.replace("soft_embedding_norm", "soft_emb_norm") - - # Gemma3n uses Gemma3p5RMSNorm which has scale_shift=0, so no correction needed - # Unlike Gemma3 which uses Gemma3RMSNorm with scale_shift=1 - if "soft_emb_norm.weight" in name: - # No correction needed for Gemma3n - pass - - if name.startswith("vision_tower."): - tensor_suffix = name[13:] - return [(f"v.enc.{tensor_suffix}", data_torch)] + if name.startswith("model.vision_tower.timm_model.blocks."): + # Double-indexed block tensors through custom logic + new_name = self.custom_map(name) else: - return [(self.map_tensor_name(name), data_torch)] + # Route non-repeating (conv_stem, msfa, embedding, etc.) 
and uncaught through tensor_mapping.py + new_name = self.map_tensor_name(name) + + if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): + data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] + + yield (new_name, data_torch) @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 869a8582b12..975a99a61a0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -669,7 +669,14 @@ class MODEL_TENSOR(IntEnum): V_MM_SOFT_EMB_NORM = auto() # gemma3 V_MM_EMBEDDING = auto() # gemma3n V_MM_HARD_EMB_NORM = auto() # gemma3n - V_MM_POST_PROJ_NORM = auto() # gemma3n + V_ENC_CONV_STEM = auto() # gemma3n + V_ENC_CONV_STEM_BIAS = auto() # gemma3n + V_ENC_CONV_STEM_NORM = auto() # gemma3n + V_ENC_MSFA_EXP = auto() # gemma3n + V_ENC_MSFA_EXP_NORM = auto() # gemma3n + V_ENC_MSFA_PROJ = auto() # gemma3n + V_ENC_MSFA_PROJ_NORM = auto() # gemma3n + V_ENC_MSFA_NORM = auto() # gemma3n V_RESMPL_POS_EMBD_K = auto() # minicpmv V_RESMPL_ATTN_Q = auto() # minicpmv V_RESMPL_ATTN_K = auto() # minicpmv @@ -1061,10 +1068,17 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", - MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", - MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", - MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", - MODEL_TENSOR.V_MM_POST_PROJ_NORM: "mm.post_proj_norm", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n + MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n + MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM: "v.enc.conv_stem.conv", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.enc.conv_stem.conv_bias", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.enc.conv_stem.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP:
"v.enc.msfa.ffn.pw_exp.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.enc.msfa.ffn.pw_exp.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.enc.msfa.ffn.pw_proj.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.enc.msfa.ffn.pw_proj.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_NORM: "v.enc.msfa.norm", # gemma3n MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", @@ -1165,7 +1179,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_SOFT_EMB_NORM, MODEL_TENSOR.V_MM_EMBEDDING, MODEL_TENSOR.V_MM_HARD_EMB_NORM, - MODEL_TENSOR.V_MM_POST_PROJ_NORM, + MODEL_TENSOR.V_ENC_CONV_STEM, + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS, + MODEL_TENSOR.V_ENC_CONV_STEM_NORM, + MODEL_TENSOR.V_ENC_MSFA_EXP, + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM, + MODEL_TENSOR.V_ENC_MSFA_PROJ, + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM, + MODEL_TENSOR.V_ENC_MSFA_NORM, MODEL_TENSOR.V_RESMPL_POS_EMBD_K, MODEL_TENSOR.V_RESMPL_ATTN_Q, MODEL_TENSOR.V_RESMPL_ATTN_K, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 3e1cf8a136f..9b17cb1ef7d 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -120,25 +120,41 @@ class TensorNameMap: "backbone.embed", # roberta ), - # Vision multimodal projector tensors (non-block) for gemma3n + MODEL_TENSOR.V_MM_EMBEDDING: ( + "model.embed_vision.embedding", # gemma3n + ), + MODEL_TENSOR.V_MM_HARD_EMB_NORM: ( + "model.embed_vision.hard_embedding_norm", # gemma3n + ), MODEL_TENSOR.V_MM_INP_PROJ: ( - "embedding_projection", # gemma3n + "model.embed_vision.embedding_projection", # gemma3n ), - MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( - "soft_emb_norm", # gemma3n + "model.embed_vision.soft_embedding_norm", # gemma3n ), - - MODEL_TENSOR.V_MM_EMBEDDING: ( - "embedding", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM: ( + "model.vision_tower.timm_model.conv_stem.conv", # gemma3n ), - - MODEL_TENSOR.V_MM_HARD_EMB_NORM: ( - "hard_emb_norm", 
# gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: ( + "model.vision_tower.timm_model.conv_stem.conv.bias", # gemma3n ), - - MODEL_TENSOR.V_MM_POST_PROJ_NORM: ( - "post_proj_norm", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_NORM: ( + "model.vision_tower.timm_model.conv_stem.bn", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_EXP: ( + "model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: ( + "model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_PROJ: ( + "model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: ( + "model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_NORM: ( + "model.vision_tower.timm_model.msfa.norm", # gemma3n ), } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index dd778ea3c96..d54b893b61f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1615,11 +1615,8 @@ struct clip_model_loader { LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1); } } - // Load projection weights (similar to Gemma3) model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); - model.mm_0_w = get_tensor("mm.embedding.weight", false); // Input embedding - model.mm_1_w = get_tensor("mm.hard_emb_norm.weight", false); // Hard embedding norm } break; case PROJECTOR_TYPE_IDEFICS3: { diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index bc1185c10eb..1a5c61fb581 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -1,6 +1,6 @@ #include "models.h" -// --- Helpers for MobileNetV5 Blocks --- +// Helpers for MobileNetV5 Blocks // RMS Norm 2D - normalizes over channels for each spatial position ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { // inp: [W, H, C, B] @@ -19,7 
+19,7 @@ ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor return cur; } -// Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) +// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { const int64_t ih = inp->ne[1]; // height const int64_t iw = inp->ne[0]; // width @@ -87,6 +87,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, con return cur; } +// Universal Inverted Residual Block (Stage 1+) ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { ggml_tensor * cur = inp; @@ -133,11 +134,8 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, } // Apply Layer Scaling if present - if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - - cur = ggml_mul(ctx0, cur, scale_w_reshaped); + if (block.layer_scale_w) { + cur = ggml_mul(ctx0, cur, block.layer_scale_w); } // 5. Residual Connection @@ -150,19 +148,19 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, return cur; } -// MobileNetV5 Builder (Gemma 3n) - Attention Block +// Attention Block (MQA) ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { ggml_tensor * cur = inp; - // --- Norm --- + // Norm if (block.attn_norm_w) { cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); } - // --- 1. Q Calculation --- + // 1. Q Calculation ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); - // --- 2. K Calculation (Downsampled) --- + // 2. 
K Calculation (Downsampled) // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) ggml_tensor * k_inp = cur; if (block.attn_k_dw_w) { @@ -175,7 +173,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co } ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); - // --- 3. V Calculation (Downsampled) --- + // 3. V Calculation (Downsampled) // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) ggml_tensor * v_inp = cur; if (block.attn_v_dw_w) { @@ -213,7 +211,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co v = ggml_reshape_4d(ctx0, v, M, D, 1, B); v = ggml_cont(ctx0, v); // [M, D, 1, B] - // --- Multi-Query Attention --- + // Multi-Query Attention float scale = 1.0f / sqrtf((float)D); // Step 1: Compute Q @ K.T @@ -236,12 +234,10 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co // Output projection cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); - // --- Residual & Layer Scale (FIXED) --- + // Residual & Layer Scale if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - cur = ggml_mul(ctx0, cur, scale_w_reshaped); + cur = ggml_mul(ctx0, cur, block.layer_scale_w); } cur = ggml_add(ctx0, cur, inp); } @@ -250,9 +246,6 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co } ggml_cgraph * clip_graph_mobilenetv5::build() { - - fprintf(stderr, "\n--- START build_mobilenetv5 ---\n"); - ggml_tensor * inp = build_inp_raw(); // 1. 
Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) @@ -260,9 +253,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 if (model.mobilenet_stem_conv_b) { - // Bias is [C, 1, 1, 1], need to reshape to [1, 1, C, 1] for broadcasting to [W, H, C, B] - ggml_tensor * bias = ggml_reshape_4d(ctx0, model.mobilenet_stem_conv_b, 1, 1, cur->ne[2], 1); - cur = ggml_add(ctx0, cur, bias); + cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b); } if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w); cur = ggml_gelu(ctx0, cur); @@ -332,7 +323,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Safety check for non-integer scaling if strictly replicating if (high_res_w % feat_w != 0) { - fprintf(stderr, "Warning: Non-integer scaling detected in MSFA\n"); + LOG_WRN("%s: non-integer scaling detected\n", __func__); } // Upsample (Nearest Neighbor) @@ -388,7 +379,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Avg Pool: Kernel=s, Stride=s cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); } else { - fprintf(stderr, "Error: Irregular downsampling stride required.\n"); + LOG_ERR("%s: irregular downsampling stride required\n", __func__); } } @@ -418,7 +409,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 2. FEATURE SCALING // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 - // This prevents the signal from vanishing during the subsequent RMSNorm. const float scale_factor = sqrtf((float)C); cur = ggml_scale(ctx0, cur, scale_factor); From 632e29f55152b7e623e0a5016c8391ce2696bad3 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 26 Dec 2025 20:02:21 +0000 Subject: [PATCH 10/20] - Rename tensors to v.conv..., v.blk..., v.msfa... 
to better align with already existing terminology --- convert_hf_to_gguf.py | 44 ++++++++++++++-------------- gguf-py/gguf/constants.py | 16 +++++------ tools/mtmd/clip-impl.h | 60 +++++++++++++++++++-------------------- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index abd65101b52..d5d52b8bf17 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5970,28 +5970,28 @@ class Gemma3nVisionModel(MmprojModel): # Double indexed mapping for MobileNetV5 blocks block_tensor_mapping = { - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.enc.blocks.{bid}.{sid}.conv_exp.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.enc.blocks.{bid}.{sid}.bn1.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.enc.blocks.{bid}.{sid}.conv_pwl.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.enc.blocks.{bid}.{sid}.bn2.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_start.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_start.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.bn.weight", - 
"model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.enc.blocks.{bid}.{sid}.layer_scale.gamma", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.query.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.key.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.value.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.output.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.key.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.key.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.value.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.value.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.enc.blocks.{bid}.{sid}.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": 
"v.blk.{bid}.{sid}.dw_mid.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", } def find_hparam(self, keys: list[str], optional: bool = False) -> Any: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 975a99a61a0..40bb79400b0 100644 --- a/gguf-py/gguf/constants.py +++ 
b/gguf-py/gguf/constants.py @@ -1071,14 +1071,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM: "v.enc.conv_stem.conv", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.enc.conv_stem.conv_bias", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.enc.conv_stem.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_EXP: "v.enc.msfa.ffn.pw_exp.conv", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.enc.msfa.ffn.pw_exp.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.enc.msfa.ffn.pw_proj.conv", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.enc.msfa.ffn.pw_proj.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_NORM: "v.enc.msfa.norm", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv_bias", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.msfa.ffn.pw_proj.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.msfa.ffn.pw_proj.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_NORM: "v.msfa.norm", # gemma3n MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 24a1ef52d08..ce312711e5e 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -154,44 +154,44 @@ #define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" // mobilenetv5 (gemma3n) definitions -#define TN_MNV5_STEM_CONV "v.enc.conv_stem.conv.weight" -#define TN_MNV5_STEM_BIAS "v.enc.conv_stem.conv.bias" -#define TN_MNV5_STEM_BN "v.enc.conv_stem.bn.weight" +#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight" 
+#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias" +#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight" // Stage 0 Block (Edge Residual) -#define TN_MNV5_BLK_S0_EXP_W "v.enc.blocks.%d.%d.conv_exp.weight" -#define TN_MNV5_BLK_S0_BN1_W "v.enc.blocks.%d.%d.bn1.weight" -#define TN_MNV5_BLK_S0_PWL_W "v.enc.blocks.%d.%d.conv_pwl.weight" -#define TN_MNV5_BLK_S0_BN2_W "v.enc.blocks.%d.%d.bn2.weight" +#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight" +#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight" +#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight" +#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight" // Stage 1+ Block (Universal Inverted Residual) -#define TN_MNV5_BLK_DW_START_W "v.enc.blocks.%d.%d.dw_start.conv.weight" -#define TN_MNV5_BLK_DW_START_BN "v.enc.blocks.%d.%d.dw_start.bn.weight" -#define TN_MNV5_BLK_DW_MID_W "v.enc.blocks.%d.%d.dw_mid.conv.weight" -#define TN_MNV5_BLK_DW_MID_BN "v.enc.blocks.%d.%d.dw_mid.bn.weight" -#define TN_MNV5_BLK_PW_EXP_W "v.enc.blocks.%d.%d.pw_exp.conv.weight" -#define TN_MNV5_BLK_PW_EXP_BN "v.enc.blocks.%d.%d.pw_exp.bn.weight" -#define TN_MNV5_BLK_PW_PROJ_W "v.enc.blocks.%d.%d.pw_proj.conv.weight" -#define TN_MNV5_BLK_PW_PROJ_BN "v.enc.blocks.%d.%d.pw_proj.bn.weight" -#define TN_MNV5_BLK_LAYER_SCALE "v.enc.blocks.%d.%d.layer_scale.gamma" +#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight" +#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight" +#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight" +#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight" +#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight" +#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight" +#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight" +#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight" +#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma" // Attention Components -#define TN_MNV5_ATTN_Q_W "v.enc.blocks.%d.%d.attn.query.proj.weight" -#define 
TN_MNV5_ATTN_K_W "v.enc.blocks.%d.%d.attn.key.proj.weight" -#define TN_MNV5_ATTN_V_W "v.enc.blocks.%d.%d.attn.value.proj.weight" -#define TN_MNV5_ATTN_O_W "v.enc.blocks.%d.%d.attn.output.proj.weight" -#define TN_MNV5_ATTN_K_DW "v.enc.blocks.%d.%d.attn.key.down_conv.weight" -#define TN_MNV5_ATTN_K_NORM "v.enc.blocks.%d.%d.attn.key.norm.weight" -#define TN_MNV5_ATTN_V_DW "v.enc.blocks.%d.%d.attn.value.down_conv.weight" -#define TN_MNV5_ATTN_V_NORM "v.enc.blocks.%d.%d.attn.value.norm.weight" -#define TN_MNV5_ATTN_NORM "v.enc.blocks.%d.%d.norm.weight" // Block norm used in attn blocks +#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight" +#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight" +#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight" +#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight" +#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight" +#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight" +#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight" +#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight" +#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks // MSFA -#define TN_MNV5_MSFA_FFN_EXP_W "v.enc.msfa.ffn.pw_exp.conv.weight" -#define TN_MNV5_MSFA_FFN_EXP_BN "v.enc.msfa.ffn.pw_exp.bn.weight" -#define TN_MNV5_MSFA_FFN_PROJ_W "v.enc.msfa.ffn.pw_proj.conv.weight" -#define TN_MNV5_MSFA_FFN_PROJ_BN "v.enc.msfa.ffn.pw_proj.bn.weight" -#define TN_MNV5_MSFA_NORM "v.enc.msfa.norm.weight" +#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight" +#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight" +#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight" +#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight" +#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight" // align x to upper multiple of n From d37c22b2c5dd3e551e0e18e1061c2d89d1e8f8ff Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 26 Dec 2025 20:53:11 +0000 
Subject: [PATCH 11/20] Fix stem conv bias name --- gguf-py/gguf/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 40bb79400b0..962e66b5c23 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1072,7 +1072,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv_bias", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv.bias", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n From 58667f506cf1e3f0c14b4bf21042c1b1506202eb Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 27 Dec 2025 18:56:24 +0000 Subject: [PATCH 12/20] Remove explicit handling of bias term for stem conv --- gguf-py/gguf/constants.py | 3 --- gguf-py/gguf/tensor_mapping.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 962e66b5c23..984ff4fb11a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -670,7 +670,6 @@ class MODEL_TENSOR(IntEnum): V_MM_EMBEDDING = auto() # gemma3n V_MM_HARD_EMB_NORM = auto() # gemma3n V_ENC_CONV_STEM = auto() # gemma3n - V_ENC_CONV_STEM_BIAS = auto() # gemma3n V_ENC_CONV_STEM_NORM = auto() # gemma3n V_ENC_MSFA_EXP = auto() # gemma3n V_ENC_MSFA_EXP_NORM = auto() # gemma3n @@ -1072,7 +1071,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv.bias", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM_NORM: 
"v.conv_stem.bn", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n @@ -1180,7 +1178,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_EMBEDDING, MODEL_TENSOR.V_MM_HARD_EMB_NORM, MODEL_TENSOR.V_ENC_CONV_STEM, - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS, MODEL_TENSOR.V_ENC_CONV_STEM_NORM, MODEL_TENSOR.V_ENC_MSFA_EXP, MODEL_TENSOR.V_ENC_MSFA_EXP_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 9b17cb1ef7d..5e35134546f 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -135,9 +135,6 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_CONV_STEM: ( "model.vision_tower.timm_model.conv_stem.conv", # gemma3n ), - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: ( - "model.vision_tower.timm_model.conv_stem.conv.bias", # gemma3n - ), MODEL_TENSOR.V_ENC_CONV_STEM_NORM: ( "model.vision_tower.timm_model.conv_stem.bn", # gemma3n ), From 47b7dd13462793487c316d1401ce54917f5ca038 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 27 Dec 2025 19:00:54 +0000 Subject: [PATCH 13/20] - Change order of addition in "project_per_layer_inputs" to support broadcasting of vision inp_per_layer - Simplify the vision embeddings path of "get_per_layer_inputs" to output [n_embd_altup, n_layer, 1], broadcastable --- src/models/gemma3n-iswa.cpp | 54 +++++++------------------------------ 1 file changed, 10 insertions(+), 44 deletions(-) diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index e172b9a79f8..39633dc3504 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -248,9 +248,9 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) { // equivalent to get_per_layer_inputs() in python code // output shape: [n_embd_altup, n_layer, n_tokens] ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { - auto inp = std::make_unique(); ggml_tensor * inp_per_layer; if (ubatch.token) { + 
auto inp = std::make_unique(); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); ggml_set_input(inp->tokens); res->t_tokens = inp->tokens; @@ -258,54 +258,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens); inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); cb(inp_per_layer, "inp_per_layer_selected", -1); + res->add_input(std::move(inp)); } else { - // For embedding inputs (e.g., from vision encoder) - // Vision tokens should use the padding token (ID=0) embedding - // from tok_embd_per_layer, NOT project the vision embeddings. - // The projection happens later in project_per_layer_inputs(). - // This matches PyTorch behavior: - // per_layer_inputs_tokens = torch.where(mask, input_ids, torch.zeros_like(input_ids)) - // per_layer_inputs = EmbedPerLayer(per_layer_inputs_tokens) # Uses padding (0) for vision - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - ggml_set_input(inp->embd); - - // tok_embd_per_layer shape: [embd_size, vocab_size] where embd_size = n_embd_altup * n_layer + // Vision embedding path: use padding token (ID=0) embedding const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer - // Create zeros tensor [embd_size, n_tokens] by projecting vision embeddings and multiplying by 0 - // First, project inp->embd [n_embd, n_tokens] to per-layer space [embd_size, n_tokens] - ggml_tensor * zeros_per_layer = ggml_mul_mat(ctx0, model.per_layer_model_proj, inp->embd); - zeros_per_layer = ggml_scale(ctx0, zeros_per_layer, 0.0f); // Multiply by 0 to get zeros - ggml_set_name(zeros_per_layer, "zeros_per_layer"); + // Extract and dequantize padding token embedding (column 0) + ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0); + ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size); + inp_per_layer = 
ggml_cpy(ctx0, padding_q, padding_f32); - // Extract column 0 (padding token's embedding) as a vector: [embd_size] - // Note: tok_embd_per_layer is quantized (q8_0), so the view is also q8_0 - ggml_tensor * padding_embd_vec_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, - embd_size, // number of elements - 0); // offset (column 0) - ggml_set_name(padding_embd_vec_q, "padding_token_emb_q8"); - - // Dequantize to f32 using ggml_cpy - ggml_tensor * padding_embd_vec_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size); - ggml_tensor * padding_embd_vec = ggml_cpy(ctx0, padding_embd_vec_q, padding_embd_vec_f32); - ggml_set_name(padding_embd_vec, "padding_token_emb_f32"); - - // Reshape to [embd_size, 1] for broadcasting - ggml_tensor * padding_embd_col = ggml_reshape_2d(ctx0, padding_embd_vec, embd_size, 1); - - // Add: zeros [embd_size, n_tokens] + padding [embd_size, 1] = broadcasted padding [embd_size, n_tokens] - ggml_tensor * inp_per_layer_flat = ggml_add(ctx0, zeros_per_layer, padding_embd_col); - ggml_set_name(inp_per_layer_flat, "inp_per_layer_broadcasted"); - - // Reshape to [n_embd_altup, n_layer, n_tokens] for per-layer processing - inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer_flat, n_embd_altup, n_layer, n_tokens); - - // Apply same scaling as text tokens - // inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); + // Reshape to [n_embd_altup, n_layer, 1] + inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1); cb(inp_per_layer, "inp_per_layer_vision", -1); } - res->add_input(std::move(inp)); return inp_per_layer; } @@ -323,7 +289,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp -1); // [n_embd_altup, n_layer, n_tokens] cb(per_layer_proj, "per_layer_proj", -1); - inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj); + inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer); inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale); 
cb(inp_per_layer, "inp_per_layer", -1); From eea58817f5d40a064d5536ae7b9616eede3b62cf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 17:13:24 +0100 Subject: [PATCH 14/20] clean up conversion script --- convert_hf_to_gguf.py | 53 ++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d7df9f2d88a..4f79ceb77c7 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6041,12 +6041,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors + @ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") class Gemma3nVisionModel(MmprojModel): """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" - n_block_keys = [] - # Double indexed mapping for MobileNetV5 blocks + # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py) + # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py block_tensor_mapping = { "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", @@ -6072,39 +6073,24 @@ class Gemma3nVisionModel(MmprojModel): "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", } - def find_hparam(self, keys: list[str], optional: bool = False) -> Any: - """Override to return 0 for block count since MobileNetV5 is CNN-based""" - if not keys: # If n_block_keys is empty (our case) + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + # force n_layers to 0 in __init__() + # we have to do this because self.hparams_vision is not yet accessible for modification inside __init__() + if "n_layers" in list(keys): return 0 - # Otherwise use parent implementation return 
super().find_hparam(keys, optional) def __init__(self, *args, **kwargs): # Parent init will call find_hparam which now returns 0 for empty keys super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["n_layers"] = 0 + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size", 2048) * 4 + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) - def find_vparam(self, keys: list[str], optional: bool = False) -> Any: - """Override to provide hardcoded MobileNetV5 parameters that aren't in config""" - # Handle empty keys list (n_block_keys) - return 0 for CNN architecture - if not keys: - return 0 - - if "intermediate_size" in keys: - # Typical expansion is 4x the embedding dimension - hidden_size = self.hparams_vision.get("hidden_size", 2048) - return hidden_size * 4 - - if "num_attention_heads" in keys or "num_heads" in keys: - # Multi-Query Attention with 8 heads - return 8 - - # For other parameters, use parent implementation - return super().find_vparam(keys, optional) - - def set_gguf_parameters(self): - # MobileNetV5 does not use normalisation at all - self.preprocessor_config["image_mean"] = [0.0 , 0.0 , 0.0 ] - self.preprocessor_config["image_std"] = [1.0 , 1.0 , 1.0 ] + # MobileNetV5 does not use image_mean/std + self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0] + self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0] self.hparams_vision["image_size"] = self.preprocessor_config.get( "size", {"height": 768, "width": 768} )["height"] @@ -6114,13 +6100,9 @@ def set_gguf_parameters(self): image_size = self.hparams_vision["image_size"] self.hparams_vision["patch_size"] = image_size // image_seq_length - # Now call parent which will use the corrected values + def set_gguf_parameters(self): super().set_gguf_parameters() - - # Set projector type to GEMMA3N self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) - - # MobileNetV5 specific 
parameters self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) def tensor_force_quant(self, name, new_name, bid, n_dims): @@ -6151,8 +6133,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # - model.embed_vision.* for projection layers # - model.vision_tower.* for vision encoder # Skip non-vision tensors - if not (name.startswith("model.embed_vision.") or - name.startswith("model.vision_tower.")): + if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): return [] if name.startswith("model.vision_tower.timm_model.blocks."): @@ -6161,7 +6142,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py new_name = self.map_tensor_name(name) - + if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] From bfbb3158f1c63ff9803960124cf7a3d13a222bcd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 17:16:17 +0100 Subject: [PATCH 15/20] fix code style --- tools/mtmd/clip-model.h | 8 +++--- tools/mtmd/clip.cpp | 8 +++--- tools/mtmd/models/mobilenetv5.cpp | 46 +++++++++++++++---------------- tools/mtmd/models/models.h | 26 ++++++++--------- tools/mtmd/mtmd.cpp | 2 +- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 3264d759f62..d4ff9151bb0 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -184,7 +184,7 @@ struct mobilenetv5_block { // Stage 1+ (Universal Inverted Residual) ggml_tensor * dw_start_w = nullptr; ggml_tensor * dw_start_bn_w = nullptr; - + ggml_tensor * pw_exp_w = nullptr; ggml_tensor * pw_exp_bn_w = nullptr; @@ -201,13 +201,13 @@ struct mobilenetv5_block { ggml_tensor * attn_k_w = nullptr; ggml_tensor * 
attn_v_w = nullptr; ggml_tensor * attn_o_w = nullptr; - + // Optional downsampling/norm in attention ggml_tensor * attn_k_dw_w = nullptr; ggml_tensor * attn_k_norm_w = nullptr; ggml_tensor * attn_v_dw_w = nullptr; ggml_tensor * attn_v_norm_w = nullptr; - + // Block norm (often present in attention blocks) ggml_tensor * attn_norm_w = nullptr; }; @@ -342,7 +342,7 @@ struct clip_model { ggml_tensor * msfa_ffn_expand_w = nullptr; ggml_tensor * msfa_ffn_project_w = nullptr; ggml_tensor * msfa_ffn_expand_bn = nullptr; - ggml_tensor * msfa_ffn_project_bn = nullptr; + ggml_tensor * msfa_ffn_project_bn = nullptr; // pixtral, glm4v diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 04109a07786..f1b74d866f2 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1570,13 +1570,13 @@ struct clip_model_loader { model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); - + model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); // Dynamically load blocks stage by stage for (int stage = 0; stage < 4; ++stage) { int blocks_found_in_stage = 0; - + for (int blk_idx = 0; ; ++blk_idx) { bool found_block = false; mobilenetv5_block block; @@ -1588,7 +1588,7 @@ struct clip_model_loader { block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false); block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false); block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false); - } + } // 2. 
Check for UIR (Universal Inverted Residual) else { // Check for dw_start OR pw_exp (some UIR blocks skip dw_start) @@ -1643,7 +1643,7 @@ struct clip_model_loader { break; } } - + // Track where this stage ends in the flat vector if (blocks_found_in_stage > 0) { model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1); diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 1a5c61fb581..78db081ea32 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -8,7 +8,7 @@ ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); cur = ggml_cont(ctx0, cur); cur = ggml_rms_norm(ctx0, cur, eps); - + if (weight) { cur = ggml_mul(ctx0, cur, weight); } @@ -61,7 +61,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, con if (stride == 2) { // Case: Downsampling (Block 0) // Replicates Conv2dSame(kernel=3, stride=2) - cur = pad_same_2d(cur, 3, 3, stride, stride); + cur = pad_same_2d(cur, 3, 3, stride, stride); cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); } else { // Case: Normal 3x3 Block (Block 1, 2) @@ -112,7 +112,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) if (block.dw_mid_w) { int k = block.dw_mid_w->ne[0]; // 3 or 5 - + if (stride > 1) { // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding cur = pad_same_2d(cur, k, k, stride, stride); @@ -134,7 +134,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, } // Apply Layer Scaling if present - if (block.layer_scale_w) { + if (block.layer_scale_w) { cur = ggml_mul(ctx0, cur, block.layer_scale_w); } @@ -148,7 +148,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, return cur; } -// Attention Block (MQA) 
+// Attention Block (MQA) ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { ggml_tensor * cur = inp; @@ -198,7 +198,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co q = ggml_cont(ctx0, q); const int Wk = k->ne[0]; const int Hk = k->ne[1]; - const int M = Wk * Hk; + const int M = Wk * Hk; // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] k = ggml_reshape_3d(ctx0, k, M, D, B); @@ -225,7 +225,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); kqv = ggml_cont(ctx0, kqv); - + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); @@ -262,7 +262,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 2. Blocks std::vector intermediate_features; const int total_blocks = model.mobilenet_blocks.size(); - + auto is_stage_start = [&](int i) { if (i == 0) return true; for (int end_idx : model.mobilenet_stage_ends) { @@ -297,7 +297,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 3. Multi-Scale Fusion Adapter (MSFA) if (!intermediate_features.empty()) { - + // A. Reference Resolution: PyTorch implementation uses inputs[0] // We assume intermediate_features[0] is the "High Resolution" target. // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32). @@ -314,21 +314,21 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // PyTorch: if feat_size < high_resolution: interpolate if (feat_w < high_res_w || feat_h < high_res_h) { - // Calculate scale factor. - // Note: PyTorch 'nearest' works on arbitrary float scales. + // Calculate scale factor. + // Note: PyTorch 'nearest' works on arbitrary float scales. // ggml_upscale generally takes integer factors or target sizes depending on helper. // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). 
int scale_w = high_res_w / feat_w; // int scale_h = high_res_h / feat_h; - + // Safety check for non-integer scaling if strictly replicating - if (high_res_w % feat_w != 0) { + if (high_res_w % feat_w != 0) { LOG_WRN("%s: non-integer scaling detected\n", __func__); } // Upsample (Nearest Neighbor) // 2 is the scale factor - feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); + feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); } resized_feats.push_back(feat); } @@ -341,16 +341,16 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // D. FFN (UniversalInvertedResidual) // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm - + // 1. Expansion if (model.msfa_ffn_expand_w) { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); - + if (model.msfa_ffn_expand_bn) { cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); } - + cur = ggml_gelu(ctx0, cur); } @@ -359,7 +359,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { if (model.msfa_ffn_project_w) { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); - + // UniversalInvertedResidual typically has a norm after projection if (model.msfa_ffn_project_bn) { cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); @@ -369,7 +369,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // E. Final Downsample to Target Resolution (Output Resolution) // PyTorch: matches self.output_resolution (e.g. 16x16) - const int target_out_res = 16; + const int target_out_res = 16; int current_w = cur->ne[0]; if (current_w > target_out_res) { @@ -418,8 +418,8 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // We must normalize regardless, then multiply if weight exists. 
{ const float eps = 1e-6f; // Gemma3n uses 1e-6 - cur = ggml_rms_norm(ctx0, cur, eps); - + cur = ggml_rms_norm(ctx0, cur, eps); + if (model.mm_soft_emb_norm_w) { // Weight shape is (2048,) -> Element-wise broadcast multiply cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); @@ -431,7 +431,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] if (model.mm_input_proj_w) { - cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); + cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); } // 5. POST PROJECTION NORM @@ -450,4 +450,4 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { ggml_build_forward_expand(gf, cur); return gf; -} \ No newline at end of file +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index bb88d11a8a9..9970980c7bc 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -82,30 +82,30 @@ struct clip_graph_mobilenetv5 : clip_graph { ggml_cgraph * build() override; ggml_tensor * rms_norm_2d( - ggml_tensor * inp, - ggml_tensor * weight, + ggml_tensor * inp, + ggml_tensor * weight, float eps = 1e-6f); - + ggml_tensor* pad_same_2d( - ggml_tensor* inp, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int dilation_h = 1, + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, int dilation_w = 1); - + ggml_tensor * build_edge_residual( ggml_tensor * inp, const mobilenetv5_block & block, int stride); ggml_tensor * build_inverted_residual( - ggml_tensor * inp, - const mobilenetv5_block & block, + ggml_tensor * inp, + const mobilenetv5_block & block, int stride); ggml_tensor * build_mobilenet_attn( - ggml_tensor * inp, + ggml_tensor * inp, const mobilenetv5_block & block); }; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 363873d3ff7..23cc8ffd30d 100644 --- 
a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -862,7 +862,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { } bool mtmd_decode_use_non_causal(mtmd_context * ctx) { - if (ctx->ctx_v && + if (ctx->ctx_v && (clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3 || clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3N)) { return true; } From 395d2d412bc78d1432bc6bdfe5aff4738ecd37e0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 18:43:54 +0100 Subject: [PATCH 16/20] also preserve audio tensors --- convert_hf_to_gguf.py | 162 ++++++++++++++++++--------------- gguf-py/gguf/constants.py | 39 +++++++- gguf-py/gguf/tensor_mapping.py | 61 +++++++++++++ tools/mtmd/clip.cpp | 11 ++- 4 files changed, 195 insertions(+), 78 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4f79ceb77c7..cefaaa712d9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6042,9 +6042,72 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors -@ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") -class Gemma3nVisionModel(MmprojModel): - """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" +class ConformerAudioModel(MmprojModel): + _batch_norm_tensors: list[dict[str, Tensor]] | None = None + + @staticmethod + def is_audio_tensor(name: str): + return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ConformerAudioModel.is_audio_tensor(name): + if ".conv" in name or "_conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # skip language model tensors + if name.startswith("lfm."): + return [] + + 
# for training only + if any(p in name for p in ["audio_loss_weight"]): + return [] + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return [] + + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + + if len(self._batch_norm_tensors[bid]) < 5: + return [] + + weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] + bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] + running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] + running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] + eps = 1e-5 # default value + + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + return [ + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), + ] + + # reshape conv weights + if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): + data_torch = data_torch[:, None, None] + if "conv.depthwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[1] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) + if "conv.pointwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[2] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) + + return [(self.map_tensor_name(name), data_torch)] + + +@ModelBase.register("Gemma3nForConditionalGeneration") +class Gemma3nVisionAudioModel(ConformerAudioModel): + has_audio_encoder = True + has_vision_encoder = True # Double indexed mapping for MobileNetV5 blocks 
(not supported by tensor_mapping.py) # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py @@ -6073,19 +6136,12 @@ class Gemma3nVisionModel(MmprojModel): "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", } - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - # force n_layers to 0 in __init__() - # we have to do this because self.hparams_vision is not yet accessible for modification inside __init__() - if "n_layers" in list(keys): - return 0 - return super().find_hparam(keys, optional) - def __init__(self, *args, **kwargs): # Parent init will call find_hparam which now returns 0 for empty keys super().__init__(*args, **kwargs) assert self.hparams_vision is not None - self.hparams_vision["n_layers"] = 0 - self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size", 2048) * 4 + self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4 self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) # MobileNetV5 does not use image_mean/std @@ -6100,11 +6156,25 @@ def __init__(self, *args, **kwargs): image_size = self.hparams_vision["image_size"] self.hparams_vision["patch_size"] = image_size // image_seq_length + # remap audio hparams + assert self.hparams_audio is not None + self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"] + self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"] + self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144) + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) + + # vision 
params self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + # audio params + assert self.hparams_audio is not None + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + def tensor_force_quant(self, name, new_name, bid, n_dims): # Force quantization settings for specific tensor types if "input_projection" in name or "input_proj" in name: @@ -6127,7 +6197,9 @@ def custom_map(self, name: str) -> str: raise ValueError(f"Unknown name: {name}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused + if (ConformerAudioModel.is_audio_tensor(name)): + name = name.replace("model.audio_tower.conformer.", "conformer.layers.") + return super().modify_tensors(data_torch, name, bid) # Gemma3n uses # - model.embed_vision.* for projection layers @@ -6146,7 +6218,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] - yield (new_name, data_torch) + return [(new_name, data_torch)] @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") @@ -10088,7 +10160,7 @@ def set_gguf_parameters(self): self._add_feed_forward_length() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self._is_vision_tensor(name) or self._is_audio_tensor(name): + if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name): # skip multimodal tensors return [] @@ -10104,9 +10176,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def _is_vision_tensor(self, name: str) -> bool: return "vision_tower" in name or "multi_modal_projector" in name - def _is_audio_tensor(self, name: str): - return any(p in 
name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) - @ModelBase.register("Lfm2Model") class LFM2ColBertModel(LFM2Model): @@ -10234,13 +10303,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Lfm2AudioForConditionalGeneration") -class LFM2AudioModel(MmprojModel): +class LFM2AudioModel(ConformerAudioModel): has_vision_encoder = False has_audio_encoder = True model_name = "Lfm2AudioEncoder" - _batch_norm_tensors: list[dict[str, Tensor]] | None = None - def get_audio_config(self) -> dict[str, Any] | None: return self.global_config.get("encoder") @@ -10254,59 +10321,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip language model tensors - if name.startswith("lfm."): - return [] - - # for training only - if any(p in name for p in ["audio_loss_weight"]): - return [] - - # for audio output - if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return [] - - # fold running_mean, running_var and eps into weight and bias for batch_norm - if "batch_norm" in name: - if self._batch_norm_tensors is None: - self._batch_norm_tensors = [{} for _ in range(self.block_count)] - assert bid is not None - self._batch_norm_tensors[bid][name] = data_torch - - if len(self._batch_norm_tensors[bid]) < 5: - return [] - - weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] - bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] - 
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] - running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] - eps = 1e-5 # default value - - a = weight / torch.sqrt(running_var + eps) - b = bias - running_mean * a - return [ - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), - ] - - # reshape conv weights - if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): - data_torch = data_torch[:, None, None] - if "conv.depthwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[1] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) - if "conv.pointwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[2] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) - - return [(self.map_tensor_name(name), data_torch)] - @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 44ca7d553cc..d0605d4d939 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -712,26 +712,37 @@ class MODEL_TENSOR(IntEnum): A_ENC_EMBD_NORM = auto() A_ENC_EMBD_TO_LOGITS = auto() A_ENC_CONV1D = auto() + A_ENC_CONV1D_NORM = auto() # gemma3n A_PRE_NORM = auto() A_POST_NORM = auto() + A_ENC_LAYER_PRE_NORM = auto() # gemma3n A_ENC_ATTN_Q = auto() A_ENC_ATTN_K = auto() A_ENC_ATTN_V = auto() + A_ENC_PER_DIM_SCALE = auto() # gemma3n A_ENC_INPUT_NORM = auto() A_ENC_OUTPUT = auto() A_ENC_OUTPUT_NORM = auto() A_ENC_FFN_UP = auto() A_ENC_FFN_NORM = auto() + A_ENC_FFN_POST_NORM = auto() # gemma3n + A_ENC_FFN_SCALE = auto() # gemma3n A_ENC_FFN_GATE = auto() A_ENC_FFN_DOWN = auto() A_ENC_FFN_UP_1 = auto() A_ENC_FFN_NORM_1 = auto() + A_ENC_FFN_POST_NORM_1 = auto() # gemma3n + 
A_ENC_FFN_SCALE_1 = auto() # gemma3n A_ENC_FFN_GATE_1 = auto() A_ENC_FFN_DOWN_1 = auto() A_MMPROJ = auto() A_MMPROJ_FC = auto() A_MM_NORM_PRE = auto() A_MM_NORM_MID = auto() + A_MM_EMBEDDING = auto() # gemma3n + A_MM_HARD_EMB_NORM = auto() # gemma3n + A_MM_SOFT_EMB_NORM = auto() # gemma3n + A_MM_INP_PROJ = auto() # gemma3n # nextn/mtp NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto() @@ -1081,9 +1092,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", - MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n - MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n - MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n + MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n + MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n @@ -1119,19 +1130,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm", MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", + MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm", MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", MODEL_TENSOR.A_POST_NORM: "a.post_ln", + MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: "a.blk.{bid}.layer_pre_norm", MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q", MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k", MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v", + MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1", MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out", MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2", MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm", + 
MODEL_TENSOR.A_ENC_FFN_POST_NORM: "a.blk.{bid}.ffn_post_norm", + MODEL_TENSOR.A_ENC_FFN_SCALE: "a.blk.{bid}.ffn_scale", MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up", MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate", MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down", MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1", + MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: "a.blk.{bid}.ffn_post_norm_1", + MODEL_TENSOR.A_ENC_FFN_SCALE_1: "a.blk.{bid}.ffn_scale_1", MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1", MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1", MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1", @@ -1139,6 +1157,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc", MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre", MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid", + MODEL_TENSOR.A_MM_INP_PROJ: "mm.a.input_projection", # gemma3n + MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n + MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n + MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n # lfm2 audio MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv", MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos", @@ -1225,19 +1247,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_EMBD_NORM, MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS, MODEL_TENSOR.A_ENC_CONV1D, + MODEL_TENSOR.A_ENC_CONV1D_NORM, MODEL_TENSOR.A_PRE_NORM, MODEL_TENSOR.A_POST_NORM, + MODEL_TENSOR.A_ENC_LAYER_PRE_NORM, MODEL_TENSOR.A_ENC_ATTN_Q, MODEL_TENSOR.A_ENC_ATTN_K, MODEL_TENSOR.A_ENC_ATTN_V, + MODEL_TENSOR.A_ENC_PER_DIM_SCALE, MODEL_TENSOR.A_ENC_INPUT_NORM, MODEL_TENSOR.A_ENC_OUTPUT, MODEL_TENSOR.A_ENC_OUTPUT_NORM, MODEL_TENSOR.A_ENC_FFN_NORM, + MODEL_TENSOR.A_ENC_FFN_POST_NORM, + MODEL_TENSOR.A_ENC_FFN_SCALE, MODEL_TENSOR.A_ENC_FFN_UP, MODEL_TENSOR.A_ENC_FFN_GATE, MODEL_TENSOR.A_ENC_FFN_DOWN, MODEL_TENSOR.A_ENC_FFN_NORM_1, + MODEL_TENSOR.A_ENC_FFN_POST_NORM_1, + MODEL_TENSOR.A_ENC_FFN_SCALE_1, MODEL_TENSOR.A_ENC_FFN_UP_1, 
MODEL_TENSOR.A_ENC_FFN_GATE_1, MODEL_TENSOR.A_ENC_FFN_DOWN_1, @@ -1254,6 +1283,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_CONV_NORM, MODEL_TENSOR.A_ENC_CONV_PW1, MODEL_TENSOR.A_ENC_CONV_PW2, + MODEL_TENSOR.A_MM_INP_PROJ, + MODEL_TENSOR.A_MM_SOFT_EMB_NORM, + MODEL_TENSOR.A_MM_EMBEDDING, + MODEL_TENSOR.A_MM_HARD_EMB_NORM, ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 396beb6221d..003d986941f 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1609,6 +1609,11 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV1D: ( "audio_tower.conv{bid}", # ultravox "conformer.pre_encode.conv.{bid}", # lfm2 + "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n + ), + + MODEL_TENSOR.A_ENC_CONV1D_NORM: ( + "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n ), MODEL_TENSOR.A_PRE_NORM: (), @@ -1621,40 +1626,64 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_ATTN_Q: ( "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_q", # lfm2 + "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n ), MODEL_TENSOR.A_ENC_ATTN_K: ( "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_k", # lfm2 + "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n ), MODEL_TENSOR.A_ENC_ATTN_V: ( "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_v", # lfm2 + "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n + ), + + MODEL_TENSOR.A_ENC_PER_DIM_SCALE: ( + "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n + ), + + MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: ( + "conformer.layers.{bid}.norm", # gemma3n ), MODEL_TENSOR.A_ENC_INPUT_NORM: ( "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox "conformer.layers.{bid}.norm_self_att", # lfm2 + "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n ), 
MODEL_TENSOR.A_ENC_OUTPUT: ( "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_out", # lfm2 + "conformer.layers.{bid}.attention.post", # gemma3n ), MODEL_TENSOR.A_ENC_OUTPUT_NORM: ( "audio_tower.layers.{bid}.final_layer_norm", # ultravox "conformer.layers.{bid}.norm_out", # lfm2 + "conformer.layers.{bid}.attention.post_norm", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_NORM: ( "conformer.layers.{bid}.norm_feed_forward1", # lfm2 + "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_POST_NORM: ( + "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_SCALE: ( + "conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_UP: ( "audio_tower.layers.{bid}.fc1", # ultravox "conformer.layers.{bid}.feed_forward1.linear1", # lfm2 + "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_GATE: (), @@ -1662,22 +1691,35 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_FFN_DOWN: ( "audio_tower.layers.{bid}.fc2", # ultravox "conformer.layers.{bid}.feed_forward1.linear2", # lfm2 + "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_UP_1: ( "conformer.layers.{bid}.feed_forward2.linear1", # lfm2 + "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_DOWN_1: ( "conformer.layers.{bid}.feed_forward2.linear2", # lfm2 + "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_NORM_1: ( "conformer.layers.{bid}.norm_feed_forward2", # lfm2 + "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: ( + "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_SCALE_1: ( + "conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n ), MODEL_TENSOR.A_ENC_LINEAR_POS: ( 
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2 + "conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n ), MODEL_TENSOR.A_ENC_POS_BIAS_U: ( @@ -1690,6 +1732,7 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_OUT: ( "conformer.pre_encode.out", # lfm2 + "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n ), # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors @@ -1715,22 +1758,40 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV_DW: ( "conformer.layers.{bid}.conv.depthwise_conv", # lfm2 + "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_NORM: ( "conformer.layers.{bid}.conv.batch_norm", # lfm2 + "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_PW1: ( "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2 + "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_PW2: ( "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2 + "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n ), MODEL_TENSOR.A_ENC_NORM_CONV: ( "conformer.layers.{bid}.norm_conv", # lfm2 + "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n + ), + + MODEL_TENSOR.A_MM_EMBEDDING: ( + "model.embed_audio.embedding", # gemma3n + ), + MODEL_TENSOR.A_MM_HARD_EMB_NORM: ( + "model.embed_audio.hard_embedding_norm", # gemma3n + ), + MODEL_TENSOR.A_MM_INP_PROJ: ( + "model.embed_audio.embedding_projection", # gemma3n + ), + MODEL_TENSOR.A_MM_SOFT_EMB_NORM: ( + "model.embed_audio.soft_embedding_norm", # gemma3n ), # NextN/MTP tensors for GLM4_MOE diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f1b74d866f2..be383f01a1d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1346,6 +1346,10 @@ struct clip_model_loader { model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); + if (model.proj_type == PROJECTOR_TYPE_GEMMA3N) { + hparams.n_layer = 0; // gemma3n does not use 
normal layer structure + } + // layers model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { @@ -2108,6 +2112,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params try { clip_model_loader loader(fname); + bool skip_audio = false; if (loader.has_vision) { ctx_vision = new clip_ctx(ctx_params); @@ -2116,11 +2121,15 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params if (ctx_params.warmup) { loader.warmup(*ctx_vision); } + + // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors + // we can remove this check when we implement audio support for Gemma 3N + skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3N; // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); } - if (loader.has_audio) { + if (loader.has_audio && !skip_audio) { ctx_audio = new clip_ctx(ctx_params); loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); loader.load_tensors(*ctx_audio); From 6a68b35e7e73dfeb7d177b647231a85f6dd4242a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 18:44:45 +0100 Subject: [PATCH 17/20] trailing space --- tools/mtmd/clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index be383f01a1d..81a8ff04c1f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2121,7 +2121,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params if (ctx_params.warmup) { loader.warmup(*ctx_vision); } - + // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors // we can remove this check when we implement audio support for Gemma 3N skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3N; From e842b9314a76bb4d7c346c607f08338438f48bb6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 22:16:15 +0100 Subject: [PATCH 18/20] split arch A and V --- convert_hf_to_gguf.py | 30 
+++++++++++++++++------------- gguf-py/gguf/constants.py | 13 ++++++++----- gguf-py/gguf/gguf_writer.py | 6 ++++++ tools/mtmd/clip-impl.h | 6 ++++-- tools/mtmd/clip.cpp | 18 +++++++++--------- tools/mtmd/models/mobilenetv5.cpp | 18 ++++++++---------- tools/mtmd/mtmd.cpp | 14 +++++++++----- 7 files changed, 61 insertions(+), 44 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cefaaa712d9..ead180523c8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6056,18 +6056,6 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip language model tensors - if name.startswith("lfm."): - return [] - - # for training only - if any(p in name for p in ["audio_loss_weight"]): - return [] - - # for audio output - if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return [] - # fold running_mean, running_var and eps into weight and bias for batch_norm if "batch_norm" in name: if self._batch_norm_tensors is None: @@ -6165,13 +6153,14 @@ def __init__(self, *args, **kwargs): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) # vision params + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV) self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) # audio params assert self.hparams_audio is not None + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA) self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) @@ -10321,6 +10310,21 @@ def set_gguf_parameters(self): self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) 
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + def modify_tensors(self, data_torch, name, bid): + # skip language model tensors + if name.startswith("lfm."): + return [] + + # for training only + if any(p in name for p in ["audio_loss_weight"]): + return [] + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return [] + + return super().modify_tensors(data_torch, name, bid) + @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index d0605d4d939..3e549883577 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -276,12 +276,13 @@ class IMatrix: DATASETS = "imatrix.datasets" class Clip: - PROJECTOR_TYPE = "clip.projector_type" - HAS_VISION_ENCODER = "clip.has_vision_encoder" - HAS_AUDIO_ENCODER = "clip.has_audio_encoder" - HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_AUDIO_ENCODER = "clip.has_audio_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" class ClipVision: + PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models IMAGE_SIZE = "clip.vision.image_size" PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size" PATCH_SIZE = "clip.vision.patch_size" @@ -307,6 +308,7 @@ class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" class ClipAudio: + PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models NUM_MEL_BINS = "clip.audio.num_mel_bins" EMBEDDING_LENGTH = "clip.audio.embedding_length" FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length" @@ -3557,7 +3559,8 @@ def get_type(val: Any) -> GGUFValueType: class VisionProjectorType: GEMMA3 = "gemma3" - GEMMA3N = "gemma3n" + GEMMA3NV = "gemma3nv" + GEMMA3NA = "gemma3na" IDEFICS3 = "idefics3" PIXTRAL = "pixtral" LLAMA4 = "llama4" diff --git 
a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a7506aa7934..7fbb78866bc 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1086,6 +1086,9 @@ def add_clip_has_audio_encoder(self, value: bool) -> None: def add_clip_projector_type(self, value: str) -> None: self.add_string(Keys.Clip.PROJECTOR_TYPE, value) + def add_clip_vision_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) + def add_vision_projection_dim(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) @@ -1168,6 +1171,9 @@ def add_vision_window_size(self, value: int) -> None: # audio models + def add_clip_audio_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipAudio.PROJECTOR_TYPE, value) + def add_audio_projection_dim(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c6ace9d81bb..dd693623a26 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -212,7 +212,8 @@ enum projector_type { PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN3VL, PROJECTOR_TYPE_GEMMA3, - PROJECTOR_TYPE_GEMMA3N, + PROJECTOR_TYPE_GEMMA3NV, + PROJECTOR_TYPE_GEMMA3NA, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_QWEN25VL, @@ -245,7 +246,8 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, - { PROJECTOR_TYPE_GEMMA3N, "gemma3n"}, + { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, + { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 81a8ff04c1f..9e941638da7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -788,7 +788,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const 
clip_image_f32 { builder = std::make_unique(ctx, img); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { builder = std::make_unique(ctx, img); } break; @@ -1151,7 +1151,7 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16) // Similar configuration to Gemma3 @@ -1346,7 +1346,7 @@ struct clip_model_loader { model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); - if (model.proj_type == PROJECTOR_TYPE_GEMMA3N) { + if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) { hparams.n_layer = 0; // gemma3n does not use normal layer structure } @@ -1564,7 +1564,7 @@ struct clip_model_loader { model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false); @@ -2124,7 +2124,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors // we can remove this check when we implement audio support for Gemma 3N - skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3N; + skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV; // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); } @@ -2967,7 +2967,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { clip_image_u8 resized_image; int sz = params.image_size; @@ -3239,7 +3239,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int scale_factor = 
ctx->model.hparams.n_merge; n_patches /= (scale_factor * scale_factor); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution // regardless of input size (see architecture description) @@ -3637,7 +3637,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("patches", patches); } break; case PROJECTOR_TYPE_GEMMA3: - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_QWEN2A: @@ -3765,7 +3765,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { // main path + deepstack paths return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); case PROJECTOR_TYPE_GEMMA3: - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: return ctx->model.projection->ne[1]; diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 78db081ea32..593afa1ddce 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -322,9 +322,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // int scale_h = high_res_h / feat_h; // Safety check for non-integer scaling if strictly replicating - if (high_res_w % feat_w != 0) { - LOG_WRN("%s: non-integer scaling detected\n", __func__); - } + GGML_ASSERT(high_res_w % feat_w == 0); // Upsample (Nearest Neighbor) // 2 is the scale factor @@ -375,12 +373,10 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { if (current_w > target_out_res) { int s = current_w / target_out_res; - if (current_w % target_out_res == 0) { - // Avg Pool: Kernel=s, Stride=s - cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); - } else { - LOG_ERR("%s: irregular downsampling stride required\n", __func__); - } + GGML_ASSERT(current_w % target_out_res == 0); + + // Avg Pool: Kernel=s, Stride=s + cur = 
ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); } @@ -395,9 +391,11 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Input: 'cur' is [Width, Height, Channels, Batch] int W = cur->ne[0]; int H = cur->ne[1]; - int C = cur->ne[2]; // Should be 2048 + int C = cur->ne[2]; int B = cur->ne[3]; + GGML_ASSERT(C == hparams.n_embd); + // 1. Permute and Flatten to [Channels, Tokens, Batch] // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B] diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 23cc8ffd30d..b68de74296e 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -266,7 +266,7 @@ struct mtmd_context { } // set boi/eoi - if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3N) { + if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) { // ... (image embeddings) ... img_beg = ""; img_end = ""; @@ -862,11 +862,15 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { } bool mtmd_decode_use_non_causal(mtmd_context * ctx) { - if (ctx->ctx_v && - (clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3 || clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3N)) { - return true; + switch (ctx->proj_type_v()) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_YOUTUVL: + return true; + default: + return false; } - return false; } bool mtmd_decode_use_mrope(mtmd_context * ctx) { From 8f6dbbe4c1b3443455918d4be4ff27c7e955e1f9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 22:20:17 +0100 Subject: [PATCH 19/20] rm unused gemma3 func --- tools/mtmd/clip.cpp | 34 +++++++++++++++++++++------------- tools/mtmd/clip.h | 3 ++- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9e941638da7..97c83de5fb3 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3796,6 +3796,7 
@@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } int clip_is_minicpmv(const struct clip_ctx * ctx) { + // TODO: remove this function if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { return ctx->model.hparams.minicpmv_version; } @@ -3803,24 +3804,26 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) { } bool clip_is_glm(const struct clip_ctx * ctx) { + // TODO: remove this function return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; } bool clip_is_mrope(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL - || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL - || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL - || ctx->proj_type() == PROJECTOR_TYPE_GLM4V; + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + return true; + default: + return false; + } } bool clip_is_llava(const struct clip_ctx * ctx) { return ctx->model.hparams.has_llava_projector; } -bool clip_is_gemma3(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; -} - bool clip_has_vision_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_VISION; } @@ -3830,11 +3833,16 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { } bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX - || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A - || ctx->proj_type() == PROJECTOR_TYPE_GLMA - || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL - || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO; + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return true; + default: + return false; + } } bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { diff --git a/tools/mtmd/clip.h 
b/tools/mtmd/clip.h index 68a0d6e857e..79df0136ba7 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -106,7 +106,8 @@ int clip_is_minicpmv(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); -bool clip_is_gemma3(const struct clip_ctx * ctx); +// note for contributor: this clip_is_(model) pattern is deprecated +// do NOT add new functions like this bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); From 60c23c9a532bca6d123596710fe67c2e98892b87 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 22:27:42 +0100 Subject: [PATCH 20/20] fix alignment --- gguf-py/gguf/constants.py | 82 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3e549883577..b240e8e4a6b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -680,13 +680,13 @@ class MODEL_TENSOR(IntEnum): V_MM_SOFT_EMB_NORM = auto() # gemma3 V_MM_EMBEDDING = auto() # gemma3n V_MM_HARD_EMB_NORM = auto() # gemma3n - V_ENC_CONV_STEM = auto() # gemma3n - V_ENC_CONV_STEM_NORM = auto() # gemma3n - V_ENC_MSFA_EXP = auto() # gemma3n - V_ENC_MSFA_EXP_NORM = auto() # gemma3n - V_ENC_MSFA_PROJ = auto() # gemma3n - V_ENC_MSFA_PROJ_NORM = auto() # gemma3n - V_ENC_MSFA_NORM = auto() # gemma3n + V_ENC_CONV_STEM = auto() # gemma3n + V_ENC_CONV_STEM_NORM = auto() # gemma3n + V_ENC_MSFA_EXP = auto() # gemma3n + V_ENC_MSFA_EXP_NORM = auto() # gemma3n + V_ENC_MSFA_PROJ = auto() # gemma3n + V_ENC_MSFA_PROJ_NORM = auto() # gemma3n + V_ENC_MSFA_NORM = auto() # gemma3n V_RESMPL_POS_EMBD_K = auto() # minicpmv V_RESMPL_ATTN_Q = auto() # minicpmv V_RESMPL_ATTN_K = auto() # minicpmv @@ -710,41 +710,41 @@ class MODEL_TENSOR(IntEnum): V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm # audio (mtmd) - 
A_ENC_EMBD_POS = auto() - A_ENC_EMBD_NORM = auto() - A_ENC_EMBD_TO_LOGITS = auto() - A_ENC_CONV1D = auto() - A_ENC_CONV1D_NORM = auto() # gemma3n - A_PRE_NORM = auto() - A_POST_NORM = auto() - A_ENC_LAYER_PRE_NORM = auto() # gemma3n - A_ENC_ATTN_Q = auto() - A_ENC_ATTN_K = auto() - A_ENC_ATTN_V = auto() - A_ENC_PER_DIM_SCALE = auto() # gemma3n - A_ENC_INPUT_NORM = auto() - A_ENC_OUTPUT = auto() - A_ENC_OUTPUT_NORM = auto() - A_ENC_FFN_UP = auto() - A_ENC_FFN_NORM = auto() - A_ENC_FFN_POST_NORM = auto() # gemma3n - A_ENC_FFN_SCALE = auto() # gemma3n - A_ENC_FFN_GATE = auto() - A_ENC_FFN_DOWN = auto() - A_ENC_FFN_UP_1 = auto() - A_ENC_FFN_NORM_1 = auto() + A_ENC_EMBD_POS = auto() + A_ENC_EMBD_NORM = auto() + A_ENC_EMBD_TO_LOGITS = auto() # lfm2 + A_ENC_CONV1D = auto() + A_ENC_CONV1D_NORM = auto() # gemma3n + A_PRE_NORM = auto() + A_POST_NORM = auto() + A_ENC_LAYER_PRE_NORM = auto() # gemma3n + A_ENC_ATTN_Q = auto() + A_ENC_ATTN_K = auto() + A_ENC_ATTN_V = auto() + A_ENC_PER_DIM_SCALE = auto() # gemma3n + A_ENC_INPUT_NORM = auto() + A_ENC_OUTPUT = auto() + A_ENC_OUTPUT_NORM = auto() + A_ENC_FFN_UP = auto() + A_ENC_FFN_NORM = auto() + A_ENC_FFN_POST_NORM = auto() # gemma3n + A_ENC_FFN_SCALE = auto() # gemma3n + A_ENC_FFN_GATE = auto() + A_ENC_FFN_DOWN = auto() + A_ENC_FFN_UP_1 = auto() # lfm2, gemma3n + A_ENC_FFN_NORM_1 = auto() # lfm2, gemma3n (pre-norm) A_ENC_FFN_POST_NORM_1 = auto() # gemma3n - A_ENC_FFN_SCALE_1 = auto() # gemma3n - A_ENC_FFN_GATE_1 = auto() - A_ENC_FFN_DOWN_1 = auto() - A_MMPROJ = auto() - A_MMPROJ_FC = auto() - A_MM_NORM_PRE = auto() - A_MM_NORM_MID = auto() - A_MM_EMBEDDING = auto() # gemma3n - A_MM_HARD_EMB_NORM = auto() # gemma3n - A_MM_SOFT_EMB_NORM = auto() # gemma3n - A_MM_INP_PROJ = auto() # gemma3n + A_ENC_FFN_SCALE_1 = auto() # gemma3n + A_ENC_FFN_GATE_1 = auto() # lfm2, gemma3n + A_ENC_FFN_DOWN_1 = auto() # lfm2, gemma3n + A_MMPROJ = auto() + A_MMPROJ_FC = auto() + A_MM_NORM_PRE = auto() + A_MM_NORM_MID = auto() + A_MM_EMBEDDING = auto() 
# gemma3n + A_MM_HARD_EMB_NORM = auto() # gemma3n + A_MM_SOFT_EMB_NORM = auto() # gemma3n + A_MM_INP_PROJ = auto() # gemma3n # nextn/mtp NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto()