From 3e4c8f8faf497bdf2e02d40381fbc1a673516992 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 19 Dec 2025 20:07:14 +0000 Subject: [PATCH 01/20] Add Gemma3nVisionModel - MobileNetV5 vision encoder convertor to convert_hf_to_gguf.py. Add gemma3n to vision projectors in gguf-py/gguf/constants.py. --- convert_hf_to_gguf.py | 241 +++++++++++++++++++++++++++++++++++++- gguf-py/gguf/constants.py | 2 + 2 files changed, 241 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 432be599469..36a7ed000af 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -520,7 +520,11 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: return () def prepare_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) + if self.tensor_map.mapping: + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + else: + max_name_len = len("vision_encoder.weight,") # Default reasonable length for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): # we don't need these @@ -5959,8 +5963,182 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") +class Gemma3nVisionModel(MmprojModel): + """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" + + # MobileNetV5 doesn't have transformer layers, so we don't need block count + # Set n_block_keys to empty list to skip the find_hparam check + n_block_keys = [] + + def find_hparam(self, keys: list[str], optional: bool = False) -> Any: + """Override to return 0 for block count since MobileNetV5 is CNN-based""" + if not keys: # If n_block_keys is empty (our case) + return 0 + # Otherwise use parent implementation + return super().find_hparam(keys, 
optional) + + def __init__(self, *args, **kwargs): + # Parent init will call find_hparam which now returns 0 for empty keys + super().__init__(*args, **kwargs) + + def find_vparam(self, keys: list[str], optional: bool = False) -> Any: + """Override to provide hardcoded MobileNetV5 parameters that aren't in config""" + # MobileNetV5 hardcodes these values in the architecture definition + # rather than storing them in config.json + + # Handle empty keys list (n_block_keys) - return 0 for CNN architecture + if not keys: + return 0 + + # Check if we're looking for image_size + if "image_size" in keys: + # MobileNetV5 300m_enc uses 768x768 input + return 768 + + # Check if we're looking for patch_size + if "patch_size" in keys: + # MobileNetV5 is CNN-based, doesn't use patches + # Set to 1 for compatibility + return 1 + + # Check if we're looking for intermediate_size + if "intermediate_size" in keys: + # MobileNetV5 uses expansion ratios in inverted residual blocks + # Typical expansion is 4x the embedding dimension + hidden_size = self.hparams_vision.get("hidden_size", 2048) + return hidden_size * 4 + + # Check if we're looking for num_attention_heads + if "num_attention_heads" in keys or "num_heads" in keys: + # MobileNetV5 uses Multi-Query Attention with 8 heads + return 8 + + # For other parameters, use parent implementation + return super().find_vparam(keys, optional) + + def set_gguf_parameters(self): + # MobileNetV5 requires ImageNet normalization values + # Override preprocessor_config to ensure correct values before calling super() + # IMAGENET_MEAN = [0.485, 0.456, 0.406] + # IMAGENET_STD = [0.229, 0.224, 0.225] + IMAGENET_MEAN = [0.5 , 0.5 , 0.5 ] + IMAGENET_STD = [0.5 , 0.5 , 0.5 ] + + print("test") + + # Check if preprocessor_config has incorrect normalization values + if "image_mean" in self.preprocessor_config: + current_mean = self.preprocessor_config["image_mean"] + if current_mean != IMAGENET_MEAN: + logger.warning(f"Overriding image_mean from 
{current_mean} to ImageNet standard {IMAGENET_MEAN}") + self.preprocessor_config["image_mean"] = IMAGENET_MEAN + print("test2") + else: + logger.info(f"Setting image_mean to ImageNet standard {IMAGENET_MEAN}") + self.preprocessor_config["image_mean"] = IMAGENET_MEAN + + if "image_std" in self.preprocessor_config: + current_std = self.preprocessor_config["image_std"] + if current_std != IMAGENET_STD: + logger.warning(f"Overriding image_std from {current_std} to ImageNet standard {IMAGENET_STD}") + self.preprocessor_config["image_std"] = IMAGENET_STD + else: + logger.info(f"Setting image_std to ImageNet standard {IMAGENET_STD}") + self.preprocessor_config["image_std"] = IMAGENET_STD + + # Now call parent which will use the corrected values + super().set_gguf_parameters() + hparams = self.hparams + + # Set projector type to GEMMA3N + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) + + # MobileNetV5 specific parameters + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) # MobileNetV5 uses approximate GELU + + # Image sequence length (256 tokens = 16x16 for Gemma3n) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + # Note: Additional metadata can be added as needed + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # Force quantization settings for specific tensor types + if "input_projection" in name or "input_proj" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." 
in name or "stem" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # Gemma3n uses different prefixes than other models: + # - model.embed_vision.* for projection layers + # - model.vision_tower.* for vision encoder + # Skip non-vision tensors + if not (name.startswith("model.embed_vision.") or + name.startswith("model.vision_tower.")): + return [] + + # Strip "model." prefix to match expected llama.cpp format + if name.startswith("model."): + name = name[6:] # Remove "model." prefix + + # Process MobileNetV5 and projection tensors + name = name.replace("_weight", ".weight") + + # Rename embed_vision to match our C++ implementation expectations + name = name.replace("embed_vision.", "") + + # Rename vision_tower.timm_model to vision_tower for cleaner naming + name = name.replace("vision_tower.timm_model.", "vision_tower.") + + # Handle normalization layer naming + name = name.replace("hard_embedding_norm", "hard_emb_norm") + name = name.replace("soft_embedding_norm", "soft_emb_norm") + # name = name.replace("embedding_post_projection_norm", "post_proj_norm") + + # Gemma3n uses Gemma3p5RMSNorm which has scale_shift=0, so no correction needed + # Unlike Gemma3 which uses Gemma3RMSNorm with scale_shift=1 + if "soft_emb_norm.weight" in name: + # No correction needed for Gemma3n + pass + + return [(self.map_tensor_name(name), data_torch)] + + def map_tensor_name(self, name: str) -> str: + """Map Gemma3n tensor names to GGUF format""" + # Projector tensors (from embed_vision) - use mm. 
prefix like Gemma3 + # IMPORTANT: Keep the .weight suffix to match C++ expectations + if name == "embedding.weight": + return "mm.embedding.weight" + if name == "embedding_projection.weight": + return "mm.input_projection.weight" # Main projection used by C++ + if name == "hard_emb_norm.weight": + return "mm.hard_emb_norm.weight" # Hard embedding normalization + if name == "soft_emb_norm.weight": + return "mm.soft_emb_norm.weight" # Soft embedding normalization (used by C++) + if name == "post_proj_norm.weight": + return "mm.post_proj_norm.weight" # Post projection normalization (CRITICAL for Gemma3n) + + # Vision tower tensors - add v.enc. prefix for MobileNetV5 encoder + if name.startswith("vision_tower."): + # Remove vision_tower prefix and add v.enc. prefix + tensor_suffix = name[13:] # Remove "vision_tower." + return f"v.enc.{tensor_suffix}" + + # If no match, try parent implementation + try: + return super().map_tensor_name(name) + except ValueError: + # If parent also can't map it, provide a sensible default + # This shouldn't happen, but provides a fallback + logger.warning(f"Using fallback mapping for tensor: {name}") + return f"v.{name}" + -@ModelBase.register("Gemma3nForConditionalGeneration") +@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") class Gemma3NModel(Gemma3Model): model_arch = gguf.MODEL_ARCH.GEMMA3N norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code @@ -5983,8 +6161,43 @@ def __init__(self, *args, **kwargs): ] def set_vocab(self): + # For Gemma3n multimodal models, we need the FULL vocab_size (262400) + # which includes special tokens from 262144-262399 for vision/audio. + # The vocab_size_per_layer_input (262144) is only the embedding size per layer. + # Temporarily override the hparams lookup order to prioritize vocab_size. 
+ + # Store original vocab_size_per_layer_input if it exists + vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input") + + # Temporarily remove vocab_size_per_layer_input to force using vocab_size + if vocab_size_per_layer_input is not None: + del self.hparams["vocab_size_per_layer_input"] + + # Call parent set_vocab which will now use vocab_size (262400) super().set_vocab() + # Restore vocab_size_per_layer_input for later use + if vocab_size_per_layer_input is not None: + self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input + + # Fix chat template for Gemma3n multimodal: replace special token placeholders with mtmd markers + # The mtmd library uses <__media__> as the default marker for images/audio + # but Gemma3n's chat template uses and + chat_template_key = "tokenizer.chat_template" + for kv_dict in self.gguf_writer.kv_data: + if chat_template_key in kv_dict: + template_value = kv_dict[chat_template_key].value + + # Replace soft token placeholders with mtmd markers + if '' in template_value or '' in template_value: + logger.info("Fixing Gemma3n chat template: replacing soft token placeholders with mtmd markers") + template_value = template_value.replace('', '<__media__>') + template_value = template_value.replace('', '<__media__>') + + # Update the value in place + kv_dict[chat_template_key].value = template_value + break + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) @@ -6020,8 +6233,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "language_model." 
not in name: return [] # skip non-language model tensors + # Pad token embeddings for vision/audio special tokens (262144-262399) + if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: + # Move to CPU to avoid meta device issues during padding + data_torch = data_torch.to(device="cpu") + + vocab_size = self.hparams.get("vocab_size", 262400) + current_size = data_torch.shape[0] # First dimension is vocab_size + + if current_size < vocab_size: + # Pad with zeros for vision/audio tokens (they get embeddings from vision tower) + padding_size = vocab_size - current_size + tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings" + logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)") + + # Create padding with zeros (vision tokens won't use these embeddings) + padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device) + data_torch = torch.cat([data_torch, padding], dim=0) + + # Continue with normal processing + name = name.replace("language_model.", "") + return [(self.map_tensor_name(name), data_torch)] + if "altup_unembed_projections" in name: data_torch = data_torch.to(device="cpu") + # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based + # They should NOT be padded if ".0." in name: self._altup_unembd[0] = data_torch elif ".1." 
in name: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index cab8f2901ae..41654b22b5d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -456,6 +456,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): RESAMPLER = auto() GLM_EDGE = auto() MERGER = auto() + GEMMA3N = auto() GEMMA3 = auto() QWEN3VL = auto() COGVLM = auto() @@ -3397,6 +3398,7 @@ def get_type(val: Any) -> GGUFValueType: class VisionProjectorType: GEMMA3 = "gemma3" + GEMMA3N = "gemma3n" IDEFICS3 = "idefics3" PIXTRAL = "pixtral" LLAMA4 = "llama4" From ad5ed98d7068f50447867f238c4cb1a9e1e29f3c Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 20 Dec 2025 20:20:54 +0000 Subject: [PATCH 02/20] Add mobilenetv5 impl --- src/models/gemma3n-iswa.cpp | 55 +++- tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-graph.h | 32 ++ tools/mtmd/clip-impl.h | 43 +++ tools/mtmd/clip-model.h | 56 ++++ tools/mtmd/clip.cpp | 521 ++++++++++++++++++++++++++++++ tools/mtmd/clip.h | 1 + tools/mtmd/models/mobilenetv5.cpp | 247 ++++++++++++++ tools/mtmd/models/models.h | 5 + tools/mtmd/mtmd.cpp | 5 +- 10 files changed, 963 insertions(+), 3 deletions(-) create mode 100644 tools/mtmd/models/mobilenetv5.cpp diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index a0bdd6a15a1..7a6a446eb20 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -259,7 +259,60 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); cb(inp_per_layer, "inp_per_layer_selected", -1); } else { - GGML_ABORT("TODO: support embd input"); + // For embedding inputs (e.g., from vision encoder) + // CRITICAL FIX: Vision tokens should use the padding token (ID=0) embedding + // from tok_embd_per_layer, NOT project the vision embeddings. + // The projection happens later in project_per_layer_inputs(). 
+ // This matches PyTorch behavior: + // per_layer_inputs_tokens = torch.where(mask, input_ids, torch.zeros_like(input_ids)) + // per_layer_inputs = EmbedPerLayer(per_layer_inputs_tokens) # Uses padding (0) for vision + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(inp->embd); + + // For vision, we need per_layer_inputs from padding token (ID=0) + // We CANNOT use inp->tokens because batch allows EITHER tokens OR embeddings + // + // The challenge: We need to broadcast padding token embedding from [embd_size, 1] to [embd_size, n_tokens] + // but ggml_repeat+ggml_dup doesn't work in no_alloc mode (creates views without backing memory). + // + // Solution: Use ggml_add to broadcast! GGML automatically broadcasts along compatible dimensions. + // We create zeros of shape [embd_size, n_tokens], then add padding_emb [embd_size, 1] which broadcasts. + + // tok_embd_per_layer shape: [embd_size, vocab_size] where embd_size = n_embd_altup * n_layer + const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer + + // Create zeros tensor [embd_size, n_tokens] by projecting vision embeddings and multiplying by 0 + // First, project inp->embd [n_embd, n_tokens] to per-layer space [embd_size, n_tokens] + ggml_tensor * zeros_per_layer = ggml_mul_mat(ctx0, model.per_layer_model_proj, inp->embd); + zeros_per_layer = ggml_scale(ctx0, zeros_per_layer, 0.0f); // Multiply by 0 to get zeros + ggml_set_name(zeros_per_layer, "zeros_per_layer"); + + // Extract column 0 (padding token's embedding) as a vector: [embd_size] + // Note: tok_embd_per_layer is quantized (q8_0), so the view is also q8_0 + ggml_tensor * padding_embd_vec_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, + embd_size, // number of elements + 0); // offset (column 0) + ggml_set_name(padding_embd_vec_q, "padding_token_emb_q8"); + + // Dequantize to f32 using ggml_cpy + ggml_tensor * padding_embd_vec_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 
embd_size); + ggml_tensor * padding_embd_vec = ggml_cpy(ctx0, padding_embd_vec_q, padding_embd_vec_f32); + ggml_set_name(padding_embd_vec, "padding_token_emb_f32"); + + // Reshape to [embd_size, 1] for broadcasting + ggml_tensor * padding_embd_col = ggml_reshape_2d(ctx0, padding_embd_vec, embd_size, 1); + + // Add: zeros [embd_size, n_tokens] + padding [embd_size, 1] = broadcasted padding [embd_size, n_tokens] + ggml_tensor * inp_per_layer_flat = ggml_add(ctx0, zeros_per_layer, padding_embd_col); + ggml_set_name(inp_per_layer_flat, "inp_per_layer_broadcasted"); + + // Reshape to [n_embd_altup, n_layer, n_tokens] for per-layer processing + inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer_flat, n_embd_altup, n_layer, n_tokens); + + // Apply same scaling as text tokens + // inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); + cb(inp_per_layer, "inp_per_layer_vision", -1); } res->add_input(std::move(inp)); return inp_per_layer; diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 317d5f19fd9..a74b4bc2154 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -27,6 +27,7 @@ add_library(mtmd models/qwen3vl.cpp models/siglip.cpp models/whisper-enc.cpp + models/mobilenetv5.cpp ) set_target_properties(mtmd PROPERTIES diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 2b1915779f2..5d8c46862bd 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -70,6 +70,38 @@ struct clip_graph { ggml_tensor * build_inp_raw(int channels = 3); + ggml_tensor * rms_norm_2d( + ggml_tensor * inp, + ggml_tensor * weight, + float eps = 1e-6f, + int block_idx=-1); + + ggml_tensor* pad_same_2d( + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, + int dilation_w = 1); + + ggml_tensor * build_edge_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride, + int block_idx = -1); + + ggml_tensor * build_inverted_residual( + 
ggml_tensor * inp, + const mobilenetv5_block & block, + int stride, + int block_idx = -1); + + ggml_tensor * build_mobilenet_attn( + ggml_tensor * inp, + const mobilenetv5_block & block, + int block_idx = -1); + ggml_tensor * build_norm( ggml_tensor * cur, ggml_tensor * mw, diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index a0939865e3f..24a1ef52d08 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -153,6 +153,47 @@ #define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s" #define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" +// mobilenetv5 (gemma3n) definitions +#define TN_MNV5_STEM_CONV "v.enc.conv_stem.conv.weight" +#define TN_MNV5_STEM_BIAS "v.enc.conv_stem.conv.bias" +#define TN_MNV5_STEM_BN "v.enc.conv_stem.bn.weight" + +// Stage 0 Block (Edge Residual) +#define TN_MNV5_BLK_S0_EXP_W "v.enc.blocks.%d.%d.conv_exp.weight" +#define TN_MNV5_BLK_S0_BN1_W "v.enc.blocks.%d.%d.bn1.weight" +#define TN_MNV5_BLK_S0_PWL_W "v.enc.blocks.%d.%d.conv_pwl.weight" +#define TN_MNV5_BLK_S0_BN2_W "v.enc.blocks.%d.%d.bn2.weight" + +// Stage 1+ Block (Universal Inverted Residual) +#define TN_MNV5_BLK_DW_START_W "v.enc.blocks.%d.%d.dw_start.conv.weight" +#define TN_MNV5_BLK_DW_START_BN "v.enc.blocks.%d.%d.dw_start.bn.weight" +#define TN_MNV5_BLK_DW_MID_W "v.enc.blocks.%d.%d.dw_mid.conv.weight" +#define TN_MNV5_BLK_DW_MID_BN "v.enc.blocks.%d.%d.dw_mid.bn.weight" +#define TN_MNV5_BLK_PW_EXP_W "v.enc.blocks.%d.%d.pw_exp.conv.weight" +#define TN_MNV5_BLK_PW_EXP_BN "v.enc.blocks.%d.%d.pw_exp.bn.weight" +#define TN_MNV5_BLK_PW_PROJ_W "v.enc.blocks.%d.%d.pw_proj.conv.weight" +#define TN_MNV5_BLK_PW_PROJ_BN "v.enc.blocks.%d.%d.pw_proj.bn.weight" +#define TN_MNV5_BLK_LAYER_SCALE "v.enc.blocks.%d.%d.layer_scale.gamma" + +// Attention Components +#define TN_MNV5_ATTN_Q_W "v.enc.blocks.%d.%d.attn.query.proj.weight" +#define TN_MNV5_ATTN_K_W "v.enc.blocks.%d.%d.attn.key.proj.weight" +#define TN_MNV5_ATTN_V_W "v.enc.blocks.%d.%d.attn.value.proj.weight" +#define TN_MNV5_ATTN_O_W 
"v.enc.blocks.%d.%d.attn.output.proj.weight" +#define TN_MNV5_ATTN_K_DW "v.enc.blocks.%d.%d.attn.key.down_conv.weight" +#define TN_MNV5_ATTN_K_NORM "v.enc.blocks.%d.%d.attn.key.norm.weight" +#define TN_MNV5_ATTN_V_DW "v.enc.blocks.%d.%d.attn.value.down_conv.weight" +#define TN_MNV5_ATTN_V_NORM "v.enc.blocks.%d.%d.attn.value.norm.weight" +#define TN_MNV5_ATTN_NORM "v.enc.blocks.%d.%d.norm.weight" // Block norm used in attn blocks + +// MSFA +#define TN_MNV5_MSFA_FFN_EXP_W "v.enc.msfa.ffn.pw_exp.conv.weight" +#define TN_MNV5_MSFA_FFN_EXP_BN "v.enc.msfa.ffn.pw_exp.bn.weight" +#define TN_MNV5_MSFA_FFN_PROJ_W "v.enc.msfa.ffn.pw_proj.conv.weight" +#define TN_MNV5_MSFA_FFN_PROJ_BN "v.enc.msfa.ffn.pw_proj.bn.weight" +#define TN_MNV5_MSFA_NORM "v.enc.msfa.norm.weight" + + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -170,6 +211,7 @@ enum projector_type { PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN3VL, PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_GEMMA3N, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_QWEN25VL, @@ -200,6 +242,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_GEMMA3N, "gemma3n"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index b4c31cdde6b..e03f455b1b5 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -172,6 +172,45 @@ struct clip_layer { } }; +// Expanded MobileNetV5 block structure for Gemma3n vision encoder +struct mobilenetv5_block { + // Stage 0 (Edge Residual) + ggml_tensor * s0_conv_exp_w = nullptr; + ggml_tensor * s0_bn1_w = nullptr; + ggml_tensor * s0_conv_pwl_w = nullptr; + ggml_tensor * s0_bn2_w = nullptr; + + // Stage 1+ (Universal Inverted Residual) + ggml_tensor * dw_start_w = nullptr; + 
ggml_tensor * dw_start_bn_w = nullptr; + + ggml_tensor * pw_exp_w = nullptr; + ggml_tensor * pw_exp_bn_w = nullptr; + + ggml_tensor * dw_mid_w = nullptr; + ggml_tensor * dw_mid_bn_w = nullptr; + + ggml_tensor * pw_proj_w = nullptr; + ggml_tensor * pw_proj_bn_w = nullptr; + + ggml_tensor * layer_scale_w = nullptr; + + // Attention (MQA) components + ggml_tensor * attn_q_w = nullptr; + ggml_tensor * attn_k_w = nullptr; + ggml_tensor * attn_v_w = nullptr; + ggml_tensor * attn_o_w = nullptr; + + // Optional downsampling/norm in attention + ggml_tensor * attn_k_dw_w = nullptr; + ggml_tensor * attn_k_norm_w = nullptr; + ggml_tensor * attn_v_dw_w = nullptr; + ggml_tensor * attn_v_norm_w = nullptr; + + // Block norm (often present in attention blocks) + ggml_tensor * attn_norm_w = nullptr; +}; + struct clip_model { clip_modality modality = CLIP_MODALITY_VISION; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -288,6 +327,23 @@ struct clip_model { ggml_tensor * mm_input_proj_w = nullptr; ggml_tensor * mm_soft_emb_norm_w = nullptr; + // mobilenetv5 for gemma3n + std::vector mobilenet_blocks; + std::vector mobilenet_stage_ends; // NEW: Track end indices of stages + ggml_tensor * mobilenet_stem_conv_w = nullptr; + ggml_tensor * mobilenet_stem_conv_b = nullptr; + ggml_tensor * mobilenet_stem_norm_w = nullptr; + ggml_tensor * mm_post_proj_norm_w = nullptr; + + // Multi-Scale Fusion Adapter (MSFA) components + ggml_tensor * msfa_concat_conv_w = nullptr; // Concatenated feature processing + ggml_tensor * msfa_concat_norm_w = nullptr; + ggml_tensor * msfa_ffn_expand_w = nullptr; // FFN expansion + ggml_tensor * msfa_ffn_project_w = nullptr; // FFN projection + ggml_tensor * msfa_ffn_expand_bn = nullptr; // NEW: FFN expansion batch norm + ggml_tensor * msfa_ffn_project_bn = nullptr; // NEW: FFN projection batch norm + + // pixtral, glm4v ggml_tensor * token_embd_img_break = nullptr; ggml_tensor * mm_patch_merger_w = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp 
index 3ba0823defb..4c357aab19e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -263,6 +263,378 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } } +// Helper: Normalize over the Channel dimension (dim 2 in [W, H, C, B]) +// RMS Norm 2D - normalizes over channels for each spatial position +// PyTorch: v = torch.mean(x.pow(2), dim=1) - mean over C for each (N,H,W) +// We need to normalize each spatial position across its C channels +ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps, int block_idx) { + // inp: [W, H, C, B] + const int64_t W = inp->ne[0]; + const int64_t H = inp->ne[1]; + const int64_t C = inp->ne[2]; + const int64_t B = inp->ne[3]; + + // Step 1: Permute [W, H, C, B] -> [C, W, H, B] + // Puts Channels in ne[0] (contiguous) + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + + // Step 2: Reshape [C, W, H, B] -> [C, W*H*B] + // We now have a 2D matrix where columns are Channels (ne[0]) + // and rows are Spatial/Batch (ne[1]). + // cur = ggml_reshape_2d(ctx0, cur, C, W * H * B); + + // REMOVED Step 3 (Transpose). + // We WANT ne[0] to be C so rms_norm reduces over it. + + // Step 4: Apply RMS Norm + // Normalizes ne[0] (C) for every element in ne[1] (Spatial/Batch). + cur = ggml_rms_norm(ctx0, cur, eps); + + // Step 5: Apply weight if present + if (weight) { + // weight is [C] + // cur is [C, W*H*B] + // ggml_mul broadcasts automatically along higher dims. + // It multiplies element i of weight with element i of cur's ne[0]. + cur = ggml_mul(ctx0, cur, weight); + } + + // REMOVED Step 6 (Transpose back). We never transposed. 
+ + // Step 7: Reshape back to [C, W, H, B] + // cur = ggml_reshape_4d(ctx0, cur, C, W, H, B); + + // Step 8: Permute back to [W, H, C, B] + // ne[0]=C, ne[1]=W, ne[2]=H, ne[3]=B + // We want new ne[0] to be old ne[1] (W) + // We want new ne[1] to be old ne[2] (H) + // We want new ne[2] to be old ne[0] (C) + // We want new ne[3] to be old ne[3] (B) + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); + + // cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // Note: The second permute in your original code was likely redundant/incorrect + // after the first one. A single permute is sufficient to restore order. + cur = ggml_cont(ctx0, cur); + + return cur; +} + + +// ------------------------------------------------------------------------ +// Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) +// ------------------------------------------------------------------------ +ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + const int64_t ih = inp->ne[1]; // height + const int64_t iw = inp->ne[0]; // width + + // Calculate output size (ceil division) + const int64_t oh = (ih + stride_h - 1) / stride_h; + const int64_t ow = (iw + stride_w - 1) / stride_w; + + // Calculate padding needed + const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); + const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); + + // Split padding asymmetrically + const int pad_h_top = pad_h / 2; + const int pad_h_bottom = pad_h - pad_h_top; + const int pad_w_left = pad_w / 2; + const int pad_w_right = pad_w - pad_w_left; + + // Apply padding if needed + // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch + if (pad_h > 0 || pad_w > 0) { + inp = ggml_pad_ext(ctx0, inp, + pad_w_left, pad_w_right, // width padding (dim 0) 
+ pad_h_top, pad_h_bottom, // height padding (dim 1) + 0, 0, // no channel padding (dim 2) + 0, 0); // no batch padding (dim 3) + } + + return inp; +} + +// ------------------------------------------------------------------------ +// Edge Residual Block (Stage 0) - CORRECTED +// ------------------------------------------------------------------------ +ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { + ggml_tensor * cur = inp; + + // 1. Expansion Conv (3x3) + // -------------------------------------------------------------------- + // LOGIC FIX: + // Block 0 (stride=2): Uses "Conv2dSame". We must manually pad, then conv with pad=0. + // Block 1,2 (stride=1): Uses standard "Conv2d" with padding=(1,1). + // -------------------------------------------------------------------- + + if (stride == 2) { + // Case: Downsampling (Block 0) + // Replicates Conv2dSame(kernel=3, stride=2) + // We calculate asymmetric padding dynamically + cur = pad_same_2d(cur, 3, 3, stride, stride); + + // Perform conv with 0 padding because we just applied it manually + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); + } else { + // Case: Normal 3x3 Block (Block 1, 2) + // Replicates Conv2d(kernel=3, stride=1, padding=1) + // Standard symmetric padding of 1 is sufficient for 3x3 s1 to keep dims same + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); + } + + // BN + Activation + if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); + cur = ggml_gelu(ctx0, cur); + + // 2. Pointwise Linear Conv (1x1) + // 1x1 Convs usually have padding=0 and stride=1 + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); + if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); + + // 3. 
Residual Connection + // Only apply residual if spatial dimensions and channels match (stride 1) + if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { + ggml_tensor * cur = inp; + + // 1. Depthwise Start (Optional) + // NOTE: dw_start always has stride=1 (no downsampling here) + if (block.dw_start_w) { + int k = block.dw_start_w->ne[0]; // 3 or 5 + int p = k / 2; + // cur = ggml_conv_2d_dw_direct(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); + } + + // 2. Pointwise Expansion (1x1) + if (block.pw_exp_w) { + // Standard 1x1 conv, pad=0, stride=1 + cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 3. Depthwise Mid (Optional) + // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) + if (block.dw_mid_w) { + int k = block.dw_mid_w->ne[0]; // 3 or 5 + + if (stride > 1) { + // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding + cur = pad_same_2d(cur, k, k, stride, stride); + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 + } else { + // Case: Stride 1 -> Use Standard Symmetric Padding + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); + } + + if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 4. 
Pointwise Projection (1x1) + if (block.pw_proj_w) { + cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); + } + + // Apply Layer Scaling if present + if (block.layer_scale_w) { + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + + // 5. Residual Connection + bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); + bool same_channel = (inp->ne[2] == cur->ne[2]); + if (same_spatial && same_channel) { + // --- FIXED LAYER SCALING --- + // --------------------------- + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// MobileNetV5 Builder (Gemma 3n) - Attention Block +ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block, int block_idx) { + + // ... [Debug Helpers kept same as original] ... + // auto DEBUG_SHAPE = [&](const char* label, ggml_tensor* t) { /* ... */ }; + // auto REGISTER_DEBUG = [&](const std::string& name, ggml_tensor* t) { /* ... */ }; + + // // Debug input + // if (block_idx == 33 || block_idx == 50 || block_idx == 52) { + // char debug_name[128]; + // snprintf(debug_name, sizeof(debug_name), "block%d_input", block_idx); + // REGISTER_DEBUG(debug_name, inp); + // } + + ggml_tensor * cur = inp; + + // --- Norm --- + if (block.attn_norm_w) { + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f, block_idx); + } + + // --- 1. Q Calculation --- + ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); + + // --- 2. 
K Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * k_inp = cur; + if (block.attn_k_dw_w) { + int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 + k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding + k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_k_norm_w) { + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f, block_idx); + } + } + ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); + + // --- 3. V Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * v_inp = cur; + if (block.attn_v_dw_w) { + int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 + v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding + v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_v_norm_w) { + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f, block_idx); + } + } + ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); + + // --- Reshape & Permute Logic --- + + const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; + const int D = k->ne[2]; // Head dimension + const int n_head = q->ne[2] / D; + const int N = W * H; + + // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] + q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); + q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); + q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] + q = ggml_cont(ctx0, q); + + const int Wk = k->ne[0]; const int Hk = k->ne[1]; + const int M = Wk * Hk; + + // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] + k = ggml_reshape_3d(ctx0, k, M, D, B); + k = ggml_reshape_4d(ctx0, k, M, D, 1, B); + k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] + k = ggml_cont(ctx0, k); + + // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] + // NOTE: We keep V 
as [M, D] because ggml_mul_mat expects src0^T * src1. + // To get output [D, N], we will need [M, D]^T * [M, N]. + v = ggml_reshape_3d(ctx0, v, M, D, B); + v = ggml_reshape_4d(ctx0, v, M, D, 1, B); + v = ggml_cont(ctx0, v); // [M, D, 1, B] + + // --- Multi-Query Attention --- + float scale = 1.0f / sqrtf((float)D); + + // Step 1: Compute Q @ K.T + // Q: [D, N, n_head, B] + // K: [D, M, 1, B] + // ggml_mul_mat computes K^T * Q -> [D, M]^T * [D, N] -> [M, D] * [D, N] -> [M, N] + // Implicit Broadcast: K has 1 head, Q has n_head. ggml handles this automatically. + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); // Result: [M, N, n_head, B] (in ggml layout) + + // // Debug scores + // if (block_idx == 33) { + // char debug_name[128]; + // snprintf(debug_name, sizeof(debug_name), "block%d_scores_raw", block_idx); + // REGISTER_DEBUG(debug_name, scores); + // } + + scores = ggml_scale(ctx0, scores, scale); + + // Step 2: Softmax + // scores is [M, N, n_head, B] (ne0=M, ne1=N) + // We need softmax over M (keys). + // ggml_soft_max applies to dim 0, which is M. Perfect - no permute needed! + scores = ggml_soft_max(ctx0, scores); + + // Step 3: Compute Attn @ V + // V: [M, D, 1, B] (ne0=M, ne1=D) + // Scores: [M, N, n_head, B] (ne0=M, ne1=N) + // + // ggml_mul_mat computes V^T * Scores -> [M, D]^T * [M, N] -> [D, M] * [M, N] -> [D, N] + // Implicit Broadcast: V has 1 head, Scores has n_head. ggml handles this automatically. + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); // Result: [N, D, n_head, B] + + // // Debug kqv + // if (block_idx == 33) { + // char debug_name[128]; + // snprintf(debug_name, sizeof(debug_name), "block%d_kqv_out", block_idx); + // REGISTER_DEBUG(debug_name, kqv); + // } + + // --- Reshape back to spatial layout --- + // kqv is [N, D, n_head, B]. We want [D, N, n_head, B] to merge heads. 
+ kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); // [D, N, n_head, B] + kqv = ggml_cont(ctx0, kqv); + + // Reshape to [N, D*n_head, B] then [W, H, C, B] + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); + kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); + kqv = ggml_cont(ctx0, kqv); + +// Output projection + cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); + + // --- Residual & Layer Scale (FIXED) --- + if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { + if (block.layer_scale_w) { + // FIX: Simplified Layer Scale. No permute needed. + // Tensor is [W, H, C, B]. Weight is [C]. + // We reshape Weight to [1, 1, C, 1]. + // GGML will broadcast W and H dimensions automatically. + + // Debug print shape of block.layer_scale_w + // fprintf(stderr, "DEBUG: block %d layer_scale_w shape: [%ld x %ld x %ld x %ld]\n", block_idx, block.layer_scale_w->ne[0], block.layer_scale_w->ne[1], block.layer_scale_w->ne[2], block.layer_scale_w->ne[3]); + + // Debug print shape of cur before scaling + // fprintf(stderr, "DEBUG: block %d cur shape before scaling: [%ld x %ld x %ld x %ld]\n", block_idx, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]); + + + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + + // Debug print shape of scale_w_reshaped + // fprintf(stderr, "DEBUG: block %d scale_w_reshaped shape: [%ld x %ld x %ld x %ld]\n", block_idx, scale_w_reshaped->ne[0], scale_w_reshaped->ne[1], scale_w_reshaped->ne[2], scale_w_reshaped->ne[3]); + + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + + // Residual Addition + // 'cur' is the pointer to the graph node of the attention output. + // 'inp' is the pointer to the graph node of the block input. 
+ cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + // siglip2 naflex ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; @@ -788,6 +1160,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { @@ -1141,6 +1517,14 @@ struct clip_model_loader { // test model (tinygemma3) has a different value, we optionally read it get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; + + case PROJECTOR_TYPE_GEMMA3N: + { + // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16) + // Similar configuration to Gemma3 + hparams.n_merge = 1; // MobileNetV5 handles resizing internally + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: @@ -1381,6 +1765,7 @@ struct clip_model_loader { } } + switch (model.proj_type) { case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: @@ -1512,6 +1897,106 @@ struct clip_model_loader { model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); + model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false); + model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false); + + model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false); + model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded + model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); + model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); + + // IMPORTANT: Your GGUF log shows 
'v.enc.msfa.norm.weight' -> shape {2048} + // Ensure TN_MNV5_MSFA_NORM matches this string + model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); + + // Dynamically load blocks stage by stage + for (int stage = 0; stage < 4; ++stage) { + int blocks_found_in_stage = 0; + + for (int blk_idx = 0; ; ++blk_idx) { + bool found_block = false; + mobilenetv5_block block; + + // 1. Check for Edge Residual (S0) + block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false); + if (block.s0_conv_exp_w) { + found_block = true; + block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false); + block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false); + block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false); + } + // 2. Check for UIR (Universal Inverted Residual) + else { + // Check for dw_start OR pw_exp (some UIR blocks skip dw_start) + block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false); + block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false); + + if (block.dw_start_w || block.pw_exp_w) { + found_block = true; + if (block.dw_start_w) { + block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false); + } + if (block.pw_exp_w) { + block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false); + } + block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false); + if (block.dw_mid_w) { + block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false); + } + block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false); + if (block.pw_proj_w) { + block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false); + } + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, 
blk_idx), false); + } + } + + // 3. Check for Attention (MQA) + // Even if UIR/Edge check failed, this might be a pure attention block + ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false); + if (attn_q_check) { + found_block = true; + block.attn_q_w = attn_q_check; + block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false); + block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false); + block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false); + block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false); + block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false); + block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false); + block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false); + block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false); + // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check + if (!block.layer_scale_w) { + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + if (found_block) { + model.mobilenet_blocks.push_back(block); + blocks_found_in_stage++; + } else { + // End of blocks for this stage + break; + } + } + + // Track where this stage ends in the flat vector + if (blocks_found_in_stage > 0) { + model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1); + LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1); + } + } + // Load projection weights (similar to Gemma3) + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + // model.mm_post_proj_norm_w = get_tensor(TN_MM_POST_PROJ_N); // CRITICAL: Post projection norm + // Load additional Gemma3n 
projection tensors + model.mm_0_w = get_tensor("mm.embedding.weight", false); // Input embedding + model.mm_1_w = get_tensor("mm.hard_emb_norm.weight", false); // Hard embedding norm + } break; case PROJECTOR_TYPE_IDEFICS3: { model.projection = get_tensor(TN_MM_PROJECTOR); @@ -2052,6 +2537,18 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } +// Rescale image from u8 to f32 without normalization (for models like GEMMA3N that use SiglipImageProcessorFast) +// This only converts from [0, 255] to [0.0, 1.0] range without applying mean/std normalization +static void rescale_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); + + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(src.buf[i]) / 255.0f; + } +} + // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { dst.nx = src.nx; @@ -2747,6 +3244,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + // GEMMA3N uses SiglipImageProcessorFast which only rescales to [0.0, 1.0] without normalization + // Resize to 768x768 using bilinear interpolation, then rescale to f32 + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + rescale_image_u8_to_f32(resized_image, *img_f32); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + case PROJECTOR_TYPE_JANUS_PRO: { // Janus Pro preprocessing: pad to square with gray(127), 
resize to 384x384 @@ -3006,6 +3515,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int scale_factor = ctx->model.hparams.n_merge; n_patches /= (scale_factor * scale_factor); } break; + case PROJECTOR_TYPE_GEMMA3N: + { + // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution + // regardless of input size (see architecture description) + n_patches = 16 * 16; // 256 tokens + } break; case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: { @@ -3396,6 +3911,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("patches", patches); } break; case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3N: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_QWEN2A: @@ -3521,6 +4037,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { // main path + deepstack paths return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3N: return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: return ctx->model.projection->ne[1]; @@ -3575,6 +4092,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; } +bool clip_is_gemma3n(const struct clip_ctx * ctx) { + return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3N; +} + bool clip_has_vision_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_VISION; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 68a0d6e857e..c244df2677f 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -107,6 +107,7 @@ bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx); +bool clip_is_gemma3n(const struct clip_ctx * ctx); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); diff 
--git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp new file mode 100644 index 00000000000..9946ca6afa8 --- /dev/null +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -0,0 +1,247 @@ +#include "models.h" + +ggml_cgraph * clip_graph_mobilenetv5::build() { + + fprintf(stderr, "\n--- START build_mobilenetv5 ---\n"); + + ggml_tensor * inp = build_inp_raw(); + + // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) + ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding + + // ggml_tensor * mobilenet_stem_conv_w_fixed = fix_1x1_weight(model.mobilenet_stem_conv_w); + + cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 + if (model.mobilenet_stem_conv_b) { + // Bias is [C, 1, 1, 1], need to reshape to [1, 1, C, 1] for broadcasting to [W, H, C, B] + ggml_tensor * bias = ggml_reshape_4d(ctx0, model.mobilenet_stem_conv_b, 1, 1, cur->ne[2], 1); + cur = ggml_add(ctx0, cur, bias); + } + if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w); + cur = ggml_gelu(ctx0, cur); + + + // 2. Blocks + std::vector intermediate_features; + const int total_blocks = model.mobilenet_blocks.size(); + + auto is_stage_start = [&](int i) { + if (i == 0) return true; + for (int end_idx : model.mobilenet_stage_ends) { + if (i == end_idx + 1) return true; + } + return false; + }; + + auto is_fusion_point = [&](int i) { + if (model.mobilenet_stage_ends.size() >= 4) { + if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2 + if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3 + } else { + if (i == total_blocks - 1) return true; + } + return false; + }; + + for (int i = 0; i < total_blocks; i++) { + const auto & block = model.mobilenet_blocks[i]; + int stride = is_stage_start(i) ? 2 : 1; + + // Debug block type + const char* block_type = block.s0_conv_exp_w ? "edge_residual" : + block.attn_q_w ? 
"attention" : "inverted_residual"; + + // // Debug input for problematic blocks + // if (i >= 50 && i <= 54) { + // fprintf(stderr, "DEBUG: Block %d (%s) input shape: [%ld, %ld, %ld, %ld], stride=%d\n", + // i, block_type, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], stride); + // } + + if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride, i); + else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block, i); + else cur = build_inverted_residual(cur, block, stride, i); + + // Register block output for debugging + char block_name[64]; + + if (is_fusion_point(i)) { + + intermediate_features.push_back(cur); + } + } + + // 3. Multi-Scale Fusion Adapter (MSFA) - REPLICATED & FIXED + if (!intermediate_features.empty()) { + + // A. Reference Resolution: PyTorch implementation uses inputs[0] + // We assume intermediate_features[0] is the "High Resolution" target. + // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32). + ggml_tensor* target_feat = intermediate_features[0]; + int high_res_w = target_feat->ne[0]; + int high_res_h = target_feat->ne[1]; + + std::vector resized_feats; + + // B. Resize inputs to match inputs[0] (High Resolution) + for (auto feat : intermediate_features) { + int feat_w = feat->ne[0]; + int feat_h = feat->ne[1]; + + // PyTorch: if feat_size < high_resolution: interpolate + if (feat_w < high_res_w || feat_h < high_res_h) { + // Calculate scale factor. + // Note: PyTorch 'nearest' works on arbitrary float scales. + // ggml_upscale generally takes integer factors or target sizes depending on helper. + // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). 
+ int scale_w = high_res_w / feat_w; + int scale_h = high_res_h / feat_h; + + // Safety check for non-integer scaling if strictly replicating + if (high_res_w % feat_w != 0) { + fprintf(stderr, "Warning: Non-integer scaling detected in MSFA\n"); + } + + // Upsample (Nearest Neighbor) + // 2 is the scale factor + feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); + } + resized_feats.push_back(feat); + } + + // C. Concatenate at High Resolution (Channel Dim = 2 in ggml) + cur = resized_feats[0]; + for (size_t k = 1; k < resized_feats.size(); ++k) { + cur = ggml_concat(ctx0, cur, resized_feats[k], 2); + } + + // D. FFN (UniversalInvertedResidual) + // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm + + // 1. Expansion + if (model.msfa_ffn_expand_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); + + // MISSING IN YOUR CODE: Expansion Norm + if (model.msfa_ffn_expand_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); // Helper to apply RMSNorm + } + + cur = ggml_gelu(ctx0, cur); + + } + + // 2. Projection (No DW because kernel_size=0) + if (model.msfa_ffn_project_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); + + // MISSING IN YOUR CODE: Projection Norm + // UniversalInvertedResidual typically has a norm after projection + if (model.msfa_ffn_project_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); + } + + } + + // E. Final Downsample to Target Resolution (Output Resolution) + // PyTorch: matches self.output_resolution (e.g. 
16x16) + const int target_out_res = 16; + int current_w = cur->ne[0]; + + if (current_w > target_out_res) { + int s = current_w / target_out_res; + + // PyTorch Logic: + // if divisible: avg_pool + // if not divisible: bilinear interpolate (hard to do in pure ggml, usually assumed divisible here) + + if (current_w % target_out_res == 0) { + // Avg Pool: Kernel=s, Stride=s + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); + } else { + // Fallback or Error: ggml doesn't easily support bilinear downsampling + // without custom ops, but standard models usually stick to integer strides. + fprintf(stderr, "Error: Irregular downsampling stride required.\n"); + } + + } + + // F. Final Norm + if (model.msfa_concat_norm_w) { + cur = rms_norm_2d(cur, model.msfa_concat_norm_w); + + } + } + + // 4. Gemma 3n Multimodal Projection (Embedder) - FULL FIX + // Input: 'cur' is [Width, Height, Channels, Batch] + int W = cur->ne[0]; + int H = cur->ne[1]; + int C = cur->ne[2]; // Should be 2048 + int B = cur->ne[3]; + + // 1. Permute and Flatten to [Channels, Tokens, Batch] + // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // -> [C, W, H, B] + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_3d(ctx0, cur, C, W*H, B); + cur = ggml_cont(ctx0, cur); + + + // 2. FEATURE SCALING (Missing in your original code) + // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 + // This prevents the signal from vanishing during the subsequent RMSNorm. + const float scale_factor = sqrtf((float)C); + cur = ggml_scale(ctx0, cur, scale_factor); + + + // 3. SOFT EMBEDDING NORM + // PyTorch: self._norm(x) * self.weight + // We must normalize regardless, then multiply if weight exists. 
+ { + const float eps = 1e-6f; // Gemma3n uses 1e-6 + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_soft_emb_norm_w) { + // Weight shape is (2048,) -> Element-wise broadcast multiply + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + } + + } + + // 4. PROJECTION + // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) + // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] + // Need to transpose for ggml_mul_mat which computes A^T * B + // This matches Gemma3's projection at line ~1319 which also transposes + if (model.mm_input_proj_w) { + // cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } + + // 5. POST PROJECTION NORM + // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False) + // with_scale=False means weight is registered as buffer with value 1.0 + // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1 + // NOTE: Vision embeddings intentionally have magnitude ~1, different from + // text embeddings at ~sqrt(n_embd). The model was trained with this mismatch. 
+ { + const float eps = 1e-6f; + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_post_proj_norm_w) { + // If weight is loaded, multiply (should be ~1.0 anyway) + cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w); + } + } + + + // cur = ggml_scale(ctx0, cur, scale_factor); + + ggml_build_forward_expand(gf, cur); + return gf; +} \ No newline at end of file diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 8d6d4ef67be..3875285fe92 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -66,3 +66,8 @@ struct clip_graph_glm4v : clip_graph { clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; + +struct clip_graph_mobilenetv5 : clip_graph { + clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b9c4fa90980..2d970cf45c2 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -266,7 +266,7 @@ struct mtmd_context { } // set boi/eoi - if (proj == PROJECTOR_TYPE_GEMMA3) { + if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3N) { // ... (image embeddings) ... 
img_beg = ""; img_end = ""; @@ -858,7 +858,8 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { } bool mtmd_decode_use_non_causal(mtmd_context * ctx) { - if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) { + if (ctx->ctx_v && + (clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3 || clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3N)) { return true; } return false; From f57705478749497f561d793e8fb7b2e0a2712b8f Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 20 Dec 2025 20:46:21 +0000 Subject: [PATCH 03/20] Fix comments, remove unused vars --- src/models/gemma3n-iswa.cpp | 11 +-- tools/mtmd/clip-graph.h | 12 +-- tools/mtmd/clip-model.h | 12 +-- tools/mtmd/clip.cpp | 157 ++++-------------------------- tools/mtmd/models/mobilenetv5.cpp | 44 ++------- 5 files changed, 36 insertions(+), 200 deletions(-) diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index 7a6a446eb20..e172b9a79f8 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -260,7 +260,7 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { cb(inp_per_layer, "inp_per_layer_selected", -1); } else { // For embedding inputs (e.g., from vision encoder) - // CRITICAL FIX: Vision tokens should use the padding token (ID=0) embedding + // Vision tokens should use the padding token (ID=0) embedding // from tok_embd_per_layer, NOT project the vision embeddings. // The projection happens later in project_per_layer_inputs(). 
// This matches PyTorch behavior: @@ -270,15 +270,6 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); ggml_set_input(inp->embd); - // For vision, we need per_layer_inputs from padding token (ID=0) - // We CANNOT use inp->tokens because batch allows EITHER tokens OR embeddings - // - // The challenge: We need to broadcast padding token embedding from [embd_size, 1] to [embd_size, n_tokens] - // but ggml_repeat+ggml_dup doesn't work in no_alloc mode (creates views without backing memory). - // - // Solution: Use ggml_add to broadcast! GGML automatically broadcasts along compatible dimensions. - // We create zeros of shape [embd_size, n_tokens], then add padding_emb [embd_size, 1] which broadcasts. - // tok_embd_per_layer shape: [embd_size, vocab_size] where embd_size = n_embd_altup * n_layer const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 5d8c46862bd..6a9efb933e5 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -73,8 +73,7 @@ struct clip_graph { ggml_tensor * rms_norm_2d( ggml_tensor * inp, ggml_tensor * weight, - float eps = 1e-6f, - int block_idx=-1); + float eps = 1e-6f); ggml_tensor* pad_same_2d( ggml_tensor* inp, @@ -88,19 +87,16 @@ struct clip_graph { ggml_tensor * build_edge_residual( ggml_tensor * inp, const mobilenetv5_block & block, - int stride, - int block_idx = -1); + int stride); ggml_tensor * build_inverted_residual( ggml_tensor * inp, const mobilenetv5_block & block, - int stride, - int block_idx = -1); + int stride); ggml_tensor * build_mobilenet_attn( ggml_tensor * inp, - const mobilenetv5_block & block, - int block_idx = -1); + const mobilenetv5_block & block); ggml_tensor * build_norm( ggml_tensor * cur, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index e03f455b1b5..be168b97ef2 100644 --- a/tools/mtmd/clip-model.h +++ 
b/tools/mtmd/clip-model.h @@ -329,19 +329,19 @@ struct clip_model { // mobilenetv5 for gemma3n std::vector mobilenet_blocks; - std::vector mobilenet_stage_ends; // NEW: Track end indices of stages + std::vector mobilenet_stage_ends; ggml_tensor * mobilenet_stem_conv_w = nullptr; ggml_tensor * mobilenet_stem_conv_b = nullptr; ggml_tensor * mobilenet_stem_norm_w = nullptr; ggml_tensor * mm_post_proj_norm_w = nullptr; // Multi-Scale Fusion Adapter (MSFA) components - ggml_tensor * msfa_concat_conv_w = nullptr; // Concatenated feature processing + ggml_tensor * msfa_concat_conv_w = nullptr; ggml_tensor * msfa_concat_norm_w = nullptr; - ggml_tensor * msfa_ffn_expand_w = nullptr; // FFN expansion - ggml_tensor * msfa_ffn_project_w = nullptr; // FFN projection - ggml_tensor * msfa_ffn_expand_bn = nullptr; // NEW: FFN expansion batch norm - ggml_tensor * msfa_ffn_project_bn = nullptr; // NEW: FFN projection batch norm + ggml_tensor * msfa_ffn_expand_w = nullptr; + ggml_tensor * msfa_ffn_project_w = nullptr; + ggml_tensor * msfa_ffn_expand_bn = nullptr; + ggml_tensor * msfa_ffn_project_bn = nullptr; // pixtral, glm4v diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 4c357aab19e..9e4519c502b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -263,69 +263,26 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } } -// Helper: Normalize over the Channel dimension (dim 2 in [W, H, C, B]) +// --- Helpers for MobileNetV5 Blocks --- // RMS Norm 2D - normalizes over channels for each spatial position -// PyTorch: v = torch.mean(x.pow(2), dim=1) - mean over C for each (N,H,W) -// We need to normalize each spatial position across its C channels -ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps, int block_idx) { +ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { // inp: [W, H, C, B] - const int64_t W = inp->ne[0]; - const int64_t H = inp->ne[1]; - const 
int64_t C = inp->ne[2]; - const int64_t B = inp->ne[3]; - // Step 1: Permute [W, H, C, B] -> [C, W, H, B] - // Puts Channels in ne[0] (contiguous) ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); cur = ggml_cont(ctx0, cur); - - // Step 2: Reshape [C, W, H, B] -> [C, W*H*B] - // We now have a 2D matrix where columns are Channels (ne[0]) - // and rows are Spatial/Batch (ne[1]). - // cur = ggml_reshape_2d(ctx0, cur, C, W * H * B); - - // REMOVED Step 3 (Transpose). - // We WANT ne[0] to be C so rms_norm reduces over it. - - // Step 4: Apply RMS Norm - // Normalizes ne[0] (C) for every element in ne[1] (Spatial/Batch). cur = ggml_rms_norm(ctx0, cur, eps); - - // Step 5: Apply weight if present + if (weight) { - // weight is [C] - // cur is [C, W*H*B] - // ggml_mul broadcasts automatically along higher dims. - // It multiplies element i of weight with element i of cur's ne[0]. cur = ggml_mul(ctx0, cur, weight); } - // REMOVED Step 6 (Transpose back). We never transposed. - - // Step 7: Reshape back to [C, W, H, B] - // cur = ggml_reshape_4d(ctx0, cur, C, W, H, B); - - // Step 8: Permute back to [W, H, C, B] - // ne[0]=C, ne[1]=W, ne[2]=H, ne[3]=B - // We want new ne[0] to be old ne[1] (W) - // We want new ne[1] to be old ne[2] (H) - // We want new ne[2] to be old ne[0] (C) - // We want new ne[3] to be old ne[3] (B) cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); - - // cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - // Note: The second permute in your original code was likely redundant/incorrect - // after the first one. A single permute is sufficient to restore order. 
cur = ggml_cont(ctx0, cur); return cur; } - -// ------------------------------------------------------------------------ // Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) -// ------------------------------------------------------------------------ ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { const int64_t ih = inp->ne[1]; // height const int64_t iw = inp->ne[0]; // width @@ -358,31 +315,20 @@ ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_ return inp; } -// ------------------------------------------------------------------------ -// Edge Residual Block (Stage 0) - CORRECTED -// ------------------------------------------------------------------------ -ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { + +// Edge Residual Block (Stage 0) +ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { ggml_tensor * cur = inp; // 1. Expansion Conv (3x3) - // -------------------------------------------------------------------- - // LOGIC FIX: - // Block 0 (stride=2): Uses "Conv2dSame". We must manually pad, then conv with pad=0. - // Block 1,2 (stride=1): Uses standard "Conv2d" with padding=(1,1). 
- // -------------------------------------------------------------------- - if (stride == 2) { // Case: Downsampling (Block 0) // Replicates Conv2dSame(kernel=3, stride=2) - // We calculate asymmetric padding dynamically cur = pad_same_2d(cur, 3, 3, stride, stride); - - // Perform conv with 0 padding because we just applied it manually cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); } else { // Case: Normal 3x3 Block (Block 1, 2) // Replicates Conv2d(kernel=3, stride=1, padding=1) - // Standard symmetric padding of 1 is sufficient for 3x3 s1 to keep dims same cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); } @@ -404,7 +350,7 @@ ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenet return cur; } -ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride, int block_idx) { +ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { ggml_tensor * cur = inp; // 1. 
Depthwise Start (Optional) @@ -412,7 +358,6 @@ ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobil if (block.dw_start_w) { int k = block.dw_start_w->ne[0]; // 3 or 5 int p = k / 2; - // cur = ggml_conv_2d_dw_direct(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); } @@ -462,8 +407,6 @@ ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobil bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); bool same_channel = (inp->ne[2] == cur->ne[2]); if (same_spatial && same_channel) { - // --- FIXED LAYER SCALING --- - // --------------------------- cur = ggml_add(ctx0, cur, inp); } @@ -471,24 +414,12 @@ ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobil } // MobileNetV5 Builder (Gemma 3n) - Attention Block -ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block, int block_idx) { - - // ... [Debug Helpers kept same as original] ... - // auto DEBUG_SHAPE = [&](const char* label, ggml_tensor* t) { /* ... */ }; - // auto REGISTER_DEBUG = [&](const std::string& name, ggml_tensor* t) { /* ... */ }; - - // // Debug input - // if (block_idx == 33 || block_idx == 50 || block_idx == 52) { - // char debug_name[128]; - // snprintf(debug_name, sizeof(debug_name), "block%d_input", block_idx); - // REGISTER_DEBUG(debug_name, inp); - // } - +ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { ggml_tensor * cur = inp; // --- Norm --- if (block.attn_norm_w) { - cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f, block_idx); + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); } // --- 1. 
Q Calculation --- @@ -502,7 +433,7 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 if (block.attn_k_norm_w) { - k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f, block_idx); + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); } } ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); @@ -515,13 +446,11 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 if (block.attn_v_norm_w) { - v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f, block_idx); + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); } } ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); - // --- Reshape & Permute Logic --- - const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; const int D = k->ne[2]; // Head dimension const int n_head = q->ne[2] / D; @@ -543,8 +472,6 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene k = ggml_cont(ctx0, k); // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] - // NOTE: We keep V as [M, D] because ggml_mul_mat expects src0^T * src1. - // To get output [D, N], we will need [M, D]^T * [M, N]. v = ggml_reshape_3d(ctx0, v, M, D, B); v = ggml_reshape_4d(ctx0, v, M, D, 1, B); v = ggml_cont(ctx0, v); // [M, D, 1, B] @@ -553,82 +480,32 @@ ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilene float scale = 1.0f / sqrtf((float)D); // Step 1: Compute Q @ K.T - // Q: [D, N, n_head, B] - // K: [D, M, 1, B] - // ggml_mul_mat computes K^T * Q -> [D, M]^T * [D, N] -> [M, D] * [D, N] -> [M, N] - // Implicit Broadcast: K has 1 head, Q has n_head. 
ggml handles this automatically. - ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); // Result: [M, N, n_head, B] (in ggml layout) - - // // Debug scores - // if (block_idx == 33) { - // char debug_name[128]; - // snprintf(debug_name, sizeof(debug_name), "block%d_scores_raw", block_idx); - // REGISTER_DEBUG(debug_name, scores); - // } + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); scores = ggml_scale(ctx0, scores, scale); - // Step 2: Softmax - // scores is [M, N, n_head, B] (ne0=M, ne1=N) - // We need softmax over M (keys). - // ggml_soft_max applies to dim 0, which is M. Perfect - no permute needed! scores = ggml_soft_max(ctx0, scores); - // Step 3: Compute Attn @ V - // V: [M, D, 1, B] (ne0=M, ne1=D) - // Scores: [M, N, n_head, B] (ne0=M, ne1=N) - // - // ggml_mul_mat computes V^T * Scores -> [M, D]^T * [M, N] -> [D, M] * [M, N] -> [D, N] - // Implicit Broadcast: V has 1 head, Scores has n_head. ggml handles this automatically. - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); // Result: [N, D, n_head, B] - - // // Debug kqv - // if (block_idx == 33) { - // char debug_name[128]; - // snprintf(debug_name, sizeof(debug_name), "block%d_kqv_out", block_idx); - // REGISTER_DEBUG(debug_name, kqv); - // } - - // --- Reshape back to spatial layout --- - // kqv is [N, D, n_head, B]. We want [D, N, n_head, B] to merge heads. 
- kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); // [D, N, n_head, B] + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); + + kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); kqv = ggml_cont(ctx0, kqv); - // Reshape to [N, D*n_head, B] then [W, H, C, B] + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); kqv = ggml_cont(ctx0, kqv); -// Output projection + // Output projection cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); // --- Residual & Layer Scale (FIXED) --- if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { if (block.layer_scale_w) { - // FIX: Simplified Layer Scale. No permute needed. - // Tensor is [W, H, C, B]. Weight is [C]. - // We reshape Weight to [1, 1, C, 1]. - // GGML will broadcast W and H dimensions automatically. - - // Debug print shape of block.layer_scale_w - // fprintf(stderr, "DEBUG: block %d layer_scale_w shape: [%ld x %ld x %ld x %ld]\n", block_idx, block.layer_scale_w->ne[0], block.layer_scale_w->ne[1], block.layer_scale_w->ne[2], block.layer_scale_w->ne[3]); - - // Debug print shape of cur before scaling - // fprintf(stderr, "DEBUG: block %d cur shape before scaling: [%ld x %ld x %ld x %ld]\n", block_idx, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]); - - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, 1, 1, block.layer_scale_w->ne[0], 1); - - // Debug print shape of scale_w_reshaped - // fprintf(stderr, "DEBUG: block %d scale_w_reshaped shape: [%ld x %ld x %ld x %ld]\n", block_idx, scale_w_reshaped->ne[0], scale_w_reshaped->ne[1], scale_w_reshaped->ne[2], scale_w_reshaped->ne[3]); - cur = ggml_mul(ctx0, cur, scale_w_reshaped); } - - // Residual Addition - // 'cur' is the pointer to the graph node of the attention output. - // 'inp' is the pointer to the graph node of the block input. 
cur = ggml_add(ctx0, cur, inp); } diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 9946ca6afa8..88bd1e6fcb9 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -9,8 +9,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding - // ggml_tensor * mobilenet_stem_conv_w_fixed = fix_1x1_weight(model.mobilenet_stem_conv_w); - cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 if (model.mobilenet_stem_conv_b) { // Bias is [C, 1, 1, 1], need to reshape to [1, 1, C, 1] for broadcasting to [W, H, C, B] @@ -47,22 +45,9 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { const auto & block = model.mobilenet_blocks[i]; int stride = is_stage_start(i) ? 2 : 1; - // Debug block type - const char* block_type = block.s0_conv_exp_w ? "edge_residual" : - block.attn_q_w ? "attention" : "inverted_residual"; - - // // Debug input for problematic blocks - // if (i >= 50 && i <= 54) { - // fprintf(stderr, "DEBUG: Block %d (%s) input shape: [%ld, %ld, %ld, %ld], stride=%d\n", - // i, block_type, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], stride); - // } - - if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride, i); - else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block, i); - else cur = build_inverted_residual(cur, block, stride, i); - - // Register block output for debugging - char block_name[64]; + if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride); + else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block); + else cur = build_inverted_residual(cur, block, stride); if (is_fusion_point(i)) { @@ -94,7 +79,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // ggml_upscale generally takes integer factors or target sizes depending on helper. 
// Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). int scale_w = high_res_w / feat_w; - int scale_h = high_res_h / feat_h; + // int scale_h = high_res_h / feat_h; // Safety check for non-integer scaling if strictly replicating if (high_res_w % feat_w != 0) { @@ -122,9 +107,8 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); - // MISSING IN YOUR CODE: Expansion Norm if (model.msfa_ffn_expand_bn) { - cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); // Helper to apply RMSNorm + cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); } cur = ggml_gelu(ctx0, cur); @@ -136,7 +120,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); - // MISSING IN YOUR CODE: Projection Norm // UniversalInvertedResidual typically has a norm after projection if (model.msfa_ffn_project_bn) { cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); @@ -151,17 +134,11 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { if (current_w > target_out_res) { int s = current_w / target_out_res; - - // PyTorch Logic: - // if divisible: avg_pool - // if not divisible: bilinear interpolate (hard to do in pure ggml, usually assumed divisible here) - + if (current_w % target_out_res == 0) { // Avg Pool: Kernel=s, Stride=s cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); } else { - // Fallback or Error: ggml doesn't easily support bilinear downsampling - // without custom ops, but standard models usually stick to integer strides. fprintf(stderr, "Error: Irregular downsampling stride required.\n"); } @@ -174,7 +151,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { } } - // 4. Gemma 3n Multimodal Projection (Embedder) - FULL FIX + // 4. 
Gemma 3n Multimodal Projection (Embedder) // Input: 'cur' is [Width, Height, Channels, Batch] int W = cur->ne[0]; int H = cur->ne[1]; @@ -189,7 +166,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { cur = ggml_cont(ctx0, cur); - // 2. FEATURE SCALING (Missing in your original code) + // 2. FEATURE SCALING // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 // This prevents the signal from vanishing during the subsequent RMSNorm. const float scale_factor = sqrtf((float)C); @@ -227,8 +204,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False) // with_scale=False means weight is registered as buffer with value 1.0 // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1 - // NOTE: Vision embeddings intentionally have magnitude ~1, different from - // text embeddings at ~sqrt(n_embd). The model was trained with this mismatch. { const float eps = 1e-6f; cur = ggml_rms_norm(ctx0, cur, eps); @@ -239,9 +214,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { } } - - // cur = ggml_scale(ctx0, cur, scale_factor); - ggml_build_forward_expand(gf, cur); return gf; } \ No newline at end of file From 4589d3eb748c48a33446f6d1465cb8b9a65d3635 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 10:59:15 +0000 Subject: [PATCH 04/20] Fix permute and remove transpose of projection weights --- tools/mtmd/models/mobilenetv5.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 88bd1e6fcb9..6dd1a3d465d 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -160,7 +160,8 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 1. 
Permute and Flatten to [Channels, Tokens, Batch] // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // -> [C, W, H, B] + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B] + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B] cur = ggml_cont(ctx0, cur); cur = ggml_reshape_3d(ctx0, cur, C, W*H, B); cur = ggml_cont(ctx0, cur); @@ -193,11 +194,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Need to transpose for ggml_mul_mat which computes A^T * B // This matches Gemma3's projection at line ~1319 which also transposes if (model.mm_input_proj_w) { - // cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); - cur = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), - cur); - + cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); } // 5. POST PROJECTION NORM From 47423a295ba1c272d38b85f98e6da89be995b7c0 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 11:50:10 +0000 Subject: [PATCH 05/20] Fix comments, remove debugging prints from hf_to_gguf --- convert_hf_to_gguf.py | 17 +++++------------ tools/mtmd/models/mobilenetv5.cpp | 4 +--- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 36a7ed000af..dd94efe7ed0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6018,22 +6018,16 @@ def find_vparam(self, keys: list[str], optional: bool = False) -> Any: return super().find_vparam(keys, optional) def set_gguf_parameters(self): - # MobileNetV5 requires ImageNet normalization values - # Override preprocessor_config to ensure correct values before calling super() - # IMAGENET_MEAN = [0.485, 0.456, 0.406] - # IMAGENET_STD = [0.229, 0.224, 0.225] + # MobileNetV5 does not use normalisation at all IMAGENET_MEAN = [0.5 , 0.5 , 0.5 ] IMAGENET_STD = [0.5 , 0.5 , 0.5 ] - print("test") - # Check if preprocessor_config has incorrect normalization values 
if "image_mean" in self.preprocessor_config: current_mean = self.preprocessor_config["image_mean"] if current_mean != IMAGENET_MEAN: logger.warning(f"Overriding image_mean from {current_mean} to ImageNet standard {IMAGENET_MEAN}") self.preprocessor_config["image_mean"] = IMAGENET_MEAN - print("test2") else: logger.info(f"Setting image_mean to ImageNet standard {IMAGENET_MEAN}") self.preprocessor_config["image_mean"] = IMAGENET_MEAN @@ -6060,7 +6054,6 @@ def set_gguf_parameters(self): # Image sequence length (256 tokens = 16x16 for Gemma3n) image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - # Note: Additional metadata can be added as needed def tensor_force_quant(self, name, new_name, bid, n_dims): # Force quantization settings for specific tensor types @@ -6110,17 +6103,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def map_tensor_name(self, name: str) -> str: """Map Gemma3n tensor names to GGUF format""" # Projector tensors (from embed_vision) - use mm. prefix like Gemma3 - # IMPORTANT: Keep the .weight suffix to match C++ expectations + # IMPORTANT: Keep the .weight suffix to match ggml expectations if name == "embedding.weight": return "mm.embedding.weight" if name == "embedding_projection.weight": - return "mm.input_projection.weight" # Main projection used by C++ + return "mm.input_projection.weight" # Main projection if name == "hard_emb_norm.weight": return "mm.hard_emb_norm.weight" # Hard embedding normalization if name == "soft_emb_norm.weight": - return "mm.soft_emb_norm.weight" # Soft embedding normalization (used by C++) + return "mm.soft_emb_norm.weight" # Soft embedding normalization if name == "post_proj_norm.weight": - return "mm.post_proj_norm.weight" # Post projection normalization (CRITICAL for Gemma3n) + return "mm.post_proj_norm.weight" # Post projection normalization (if exists) # Vision tower tensors - add v.enc. 
prefix for MobileNetV5 encoder if name.startswith("vision_tower."): diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 6dd1a3d465d..930da38e302 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -55,7 +55,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { } } - // 3. Multi-Scale Fusion Adapter (MSFA) - REPLICATED & FIXED + // 3. Multi-Scale Fusion Adapter (MSFA) if (!intermediate_features.empty()) { // A. Reference Resolution: PyTorch implementation uses inputs[0] @@ -191,8 +191,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 4. PROJECTION // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] - // Need to transpose for ggml_mul_mat which computes A^T * B - // This matches Gemma3's projection at line ~1319 which also transposes if (model.mm_input_proj_w) { cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); } From 67801e5b62a68db509ef879a0c47b5bf096df785 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 19:13:47 +0000 Subject: [PATCH 06/20] 1. Hard-code image_mean = 0 and image_std = 1 2. Use available tensor mapping logic 3. 
Remove redundant chat template replacement of soft tokens placeholder with media placeholder --- convert_hf_to_gguf.py | 113 +++++---------------------------- gguf-py/gguf/constants.py | 9 +++ gguf-py/gguf/tensor_mapping.py | 21 ++++++ 3 files changed, 46 insertions(+), 97 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dd94efe7ed0..55e82fe9128 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5966,9 +5966,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") class Gemma3nVisionModel(MmprojModel): """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" - - # MobileNetV5 doesn't have transformer layers, so we don't need block count - # Set n_block_keys to empty list to skip the find_hparam check n_block_keys = [] def find_hparam(self, keys: list[str], optional: bool = False) -> Any: @@ -5984,34 +5981,17 @@ def __init__(self, *args, **kwargs): def find_vparam(self, keys: list[str], optional: bool = False) -> Any: """Override to provide hardcoded MobileNetV5 parameters that aren't in config""" - # MobileNetV5 hardcodes these values in the architecture definition - # rather than storing them in config.json - # Handle empty keys list (n_block_keys) - return 0 for CNN architecture if not keys: return 0 - # Check if we're looking for image_size - if "image_size" in keys: - # MobileNetV5 300m_enc uses 768x768 input - return 768 - - # Check if we're looking for patch_size - if "patch_size" in keys: - # MobileNetV5 is CNN-based, doesn't use patches - # Set to 1 for compatibility - return 1 - - # Check if we're looking for intermediate_size if "intermediate_size" in keys: - # MobileNetV5 uses expansion ratios in inverted residual blocks # Typical expansion is 4x the embedding dimension hidden_size = self.hparams_vision.get("hidden_size", 2048) return hidden_size * 4 - # Check if we're looking for 
num_attention_heads if "num_attention_heads" in keys or "num_heads" in keys: - # MobileNetV5 uses Multi-Query Attention with 8 heads + # Multi-Query Attention with 8 heads return 8 # For other parameters, use parent implementation @@ -6019,41 +5999,25 @@ def find_vparam(self, keys: list[str], optional: bool = False) -> Any: def set_gguf_parameters(self): # MobileNetV5 does not use normalisation at all - IMAGENET_MEAN = [0.5 , 0.5 , 0.5 ] - IMAGENET_STD = [0.5 , 0.5 , 0.5 ] - - # Check if preprocessor_config has incorrect normalization values - if "image_mean" in self.preprocessor_config: - current_mean = self.preprocessor_config["image_mean"] - if current_mean != IMAGENET_MEAN: - logger.warning(f"Overriding image_mean from {current_mean} to ImageNet standard {IMAGENET_MEAN}") - self.preprocessor_config["image_mean"] = IMAGENET_MEAN - else: - logger.info(f"Setting image_mean to ImageNet standard {IMAGENET_MEAN}") - self.preprocessor_config["image_mean"] = IMAGENET_MEAN - - if "image_std" in self.preprocessor_config: - current_std = self.preprocessor_config["image_std"] - if current_std != IMAGENET_STD: - logger.warning(f"Overriding image_std from {current_std} to ImageNet standard {IMAGENET_STD}") - self.preprocessor_config["image_std"] = IMAGENET_STD - else: - logger.info(f"Setting image_std to ImageNet standard {IMAGENET_STD}") - self.preprocessor_config["image_std"] = IMAGENET_STD + self.preprocessor_config["image_mean"] = [0.0 , 0.0 , 0.0 ] + self.preprocessor_config["image_std"] = [1.0 , 1.0 , 1.0 ] + self.hparams_vision["image_size"] = self.preprocessor_config.get( + "size", {"height": 768, "width": 768} + )["height"] + + # Image sequence length (256 tokens = 16x16 for Gemma3n) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + image_size = self.hparams_vision["image_size"] + self.hparams_vision["patch_size"] = image_size // image_seq_length # Now call parent which will use the corrected values super().set_gguf_parameters() - hparams 
= self.hparams # Set projector type to GEMMA3N self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) # MobileNetV5 specific parameters - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_vision_use_gelu(True) # MobileNetV5 uses approximate GELU - - # Image sequence length (256 tokens = 16x16 for Gemma3n) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) def tensor_force_quant(self, name, new_name, bid, n_dims): # Force quantization settings for specific tensor types @@ -6090,7 +6054,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Handle normalization layer naming name = name.replace("hard_embedding_norm", "hard_emb_norm") name = name.replace("soft_embedding_norm", "soft_emb_norm") - # name = name.replace("embedding_post_projection_norm", "post_proj_norm") # Gemma3n uses Gemma3p5RMSNorm which has scale_shift=0, so no correction needed # Unlike Gemma3 which uses Gemma3RMSNorm with scale_shift=1 @@ -6098,37 +6061,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # No correction needed for Gemma3n pass - return [(self.map_tensor_name(name), data_torch)] - - def map_tensor_name(self, name: str) -> str: - """Map Gemma3n tensor names to GGUF format""" - # Projector tensors (from embed_vision) - use mm. 
prefix like Gemma3 - # IMPORTANT: Keep the .weight suffix to match ggml expectations - if name == "embedding.weight": - return "mm.embedding.weight" - if name == "embedding_projection.weight": - return "mm.input_projection.weight" # Main projection - if name == "hard_emb_norm.weight": - return "mm.hard_emb_norm.weight" # Hard embedding normalization - if name == "soft_emb_norm.weight": - return "mm.soft_emb_norm.weight" # Soft embedding normalization - if name == "post_proj_norm.weight": - return "mm.post_proj_norm.weight" # Post projection normalization (if exists) - - # Vision tower tensors - add v.enc. prefix for MobileNetV5 encoder if name.startswith("vision_tower."): - # Remove vision_tower prefix and add v.enc. prefix - tensor_suffix = name[13:] # Remove "vision_tower." - return f"v.enc.{tensor_suffix}" - - # If no match, try parent implementation - try: - return super().map_tensor_name(name) - except ValueError: - # If parent also can't map it, provide a sensible default - # This shouldn't happen, but provides a fallback - logger.warning(f"Using fallback mapping for tensor: {name}") - return f"v.{name}" + tensor_suffix = name[13:] + return [(f"v.enc.{tensor_suffix}", data_torch)] + else: + return [(self.map_tensor_name(name), data_torch)] @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") @@ -6173,24 +6110,6 @@ def set_vocab(self): if vocab_size_per_layer_input is not None: self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input - # Fix chat template for Gemma3n multimodal: replace special token placeholders with mtmd markers - # The mtmd library uses <__media__> as the default marker for images/audio - # but Gemma3n's chat template uses and - chat_template_key = "tokenizer.chat_template" - for kv_dict in self.gguf_writer.kv_data: - if chat_template_key in kv_dict: - template_value = kv_dict[chat_template_key].value - - # Replace soft token placeholders with mtmd markers - if '' in template_value or '' in 
template_value: - logger.info("Fixing Gemma3n chat template: replacing soft token placeholders with mtmd markers") - template_value = template_value.replace('', '<__media__>') - template_value = template_value.replace('', '<__media__>') - - # Update the value in place - kv_dict[chat_template_key].value = template_value - break - def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 41654b22b5d..869a8582b12 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -667,6 +667,9 @@ class MODEL_TENSOR(IntEnum): V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_MM_EMBEDDING = auto() # gemma3n + V_MM_HARD_EMB_NORM = auto() # gemma3n + V_MM_POST_PROJ_NORM = auto() # gemma3n V_RESMPL_POS_EMBD_K = auto() # minicpmv V_RESMPL_ATTN_Q = auto() # minicpmv V_RESMPL_ATTN_K = auto() # minicpmv @@ -1059,6 +1062,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", + MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", + MODEL_TENSOR.V_MM_POST_PROJ_NORM: "mm.post_proj_norm", MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", @@ -1157,6 +1163,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_INP_PROJ, MODEL_TENSOR.V_MM_INP_NORM, MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_MM_EMBEDDING, + MODEL_TENSOR.V_MM_HARD_EMB_NORM, + MODEL_TENSOR.V_MM_POST_PROJ_NORM, MODEL_TENSOR.V_RESMPL_POS_EMBD_K, MODEL_TENSOR.V_RESMPL_ATTN_Q, MODEL_TENSOR.V_RESMPL_ATTN_K, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 301aafa9102..3e1cf8a136f 100644 --- 
a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -119,6 +119,27 @@ class TensorNameMap: MODEL_TENSOR.CONV1D: ( "backbone.embed", # roberta ), + + # Vision multimodal projector tensors (non-block) for gemma3n + MODEL_TENSOR.V_MM_INP_PROJ: ( + "embedding_projection", # gemma3n + ), + + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( + "soft_emb_norm", # gemma3n + ), + + MODEL_TENSOR.V_MM_EMBEDDING: ( + "embedding", # gemma3n + ), + + MODEL_TENSOR.V_MM_HARD_EMB_NORM: ( + "hard_emb_norm", # gemma3n + ), + + MODEL_TENSOR.V_MM_POST_PROJ_NORM: ( + "post_proj_norm", # gemma3n + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { From 04947c7f9e355914e7e4d6cdf79c28193218d988 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sun, 21 Dec 2025 19:19:47 +0000 Subject: [PATCH 07/20] 1. Move mobilenetv5 helpers declarations to `clip_graph_mobilenetv5` struct and definitions to mobilenetv5.cpp 2.Remove unused `clip_is_gemma3n` func declarations and definitions 3. Remove redundant `rescale_image_u8_to_f32` func and use `normalize_image_u8_to_f32` with zero mean and unit std 4. 
Calculate n_patches using image_size / patch_size --- tools/mtmd/clip-graph.h | 28 --- tools/mtmd/clip.cpp | 271 +----------------------------- tools/mtmd/clip.h | 1 - tools/mtmd/models/mobilenetv5.cpp | 249 +++++++++++++++++++++++++++ tools/mtmd/models/models.h | 28 +++ 5 files changed, 279 insertions(+), 298 deletions(-) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 6a9efb933e5..2b1915779f2 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -70,34 +70,6 @@ struct clip_graph { ggml_tensor * build_inp_raw(int channels = 3); - ggml_tensor * rms_norm_2d( - ggml_tensor * inp, - ggml_tensor * weight, - float eps = 1e-6f); - - ggml_tensor* pad_same_2d( - ggml_tensor* inp, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int dilation_h = 1, - int dilation_w = 1); - - ggml_tensor * build_edge_residual( - ggml_tensor * inp, - const mobilenetv5_block & block, - int stride); - - ggml_tensor * build_inverted_residual( - ggml_tensor * inp, - const mobilenetv5_block & block, - int stride); - - ggml_tensor * build_mobilenet_attn( - ggml_tensor * inp, - const mobilenetv5_block & block); - ggml_tensor * build_norm( ggml_tensor * cur, ggml_tensor * mw, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9e4519c502b..e86a09bb5c1 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -263,255 +263,6 @@ void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { } } -// --- Helpers for MobileNetV5 Blocks --- -// RMS Norm 2D - normalizes over channels for each spatial position -ggml_tensor * clip_graph::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { - // inp: [W, H, C, B] - - ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); - cur = ggml_cont(ctx0, cur); - cur = ggml_rms_norm(ctx0, cur, eps); - - if (weight) { - cur = ggml_mul(ctx0, cur, weight); - } - - cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); - cur = ggml_cont(ctx0, cur); - - return cur; -} - -// Helper for Conv2dSame 
padding (asymmetric SAME padding like PyTorch/TF) -ggml_tensor* clip_graph::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { - const int64_t ih = inp->ne[1]; // height - const int64_t iw = inp->ne[0]; // width - - // Calculate output size (ceil division) - const int64_t oh = (ih + stride_h - 1) / stride_h; - const int64_t ow = (iw + stride_w - 1) / stride_w; - - // Calculate padding needed - const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); - const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); - - // Split padding asymmetrically - const int pad_h_top = pad_h / 2; - const int pad_h_bottom = pad_h - pad_h_top; - const int pad_w_left = pad_w / 2; - const int pad_w_right = pad_w - pad_w_left; - - // Apply padding if needed - // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) - // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch - if (pad_h > 0 || pad_w > 0) { - inp = ggml_pad_ext(ctx0, inp, - pad_w_left, pad_w_right, // width padding (dim 0) - pad_h_top, pad_h_bottom, // height padding (dim 1) - 0, 0, // no channel padding (dim 2) - 0, 0); // no batch padding (dim 3) - } - - return inp; -} - - -// Edge Residual Block (Stage 0) -ggml_tensor * clip_graph::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { - ggml_tensor * cur = inp; - - // 1. 
Expansion Conv (3x3) - if (stride == 2) { - // Case: Downsampling (Block 0) - // Replicates Conv2dSame(kernel=3, stride=2) - cur = pad_same_2d(cur, 3, 3, stride, stride); - cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); - } else { - // Case: Normal 3x3 Block (Block 1, 2) - // Replicates Conv2d(kernel=3, stride=1, padding=1) - cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); - } - - // BN + Activation - if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); - cur = ggml_gelu(ctx0, cur); - - // 2. Pointwise Linear Conv (1x1) - // 1x1 Convs usually have padding=0 and stride=1 - cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); - if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); - - // 3. Residual Connection - // Only apply residual if spatial dimensions and channels match (stride 1) - if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { - cur = ggml_add(ctx0, cur, inp); - } - - return cur; -} - -ggml_tensor * clip_graph::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { - ggml_tensor * cur = inp; - - // 1. Depthwise Start (Optional) - // NOTE: dw_start always has stride=1 (no downsampling here) - if (block.dw_start_w) { - int k = block.dw_start_w->ne[0]; // 3 or 5 - int p = k / 2; - cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); - if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); - } - - // 2. Pointwise Expansion (1x1) - if (block.pw_exp_w) { - // Standard 1x1 conv, pad=0, stride=1 - cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); - if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); - cur = ggml_gelu(ctx0, cur); - } - - // 3. 
Depthwise Mid (Optional) - // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) - if (block.dw_mid_w) { - int k = block.dw_mid_w->ne[0]; // 3 or 5 - - if (stride > 1) { - // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding - cur = pad_same_2d(cur, k, k, stride, stride); - cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 - } else { - // Case: Stride 1 -> Use Standard Symmetric Padding - int p = k / 2; - cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); - } - - if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); - cur = ggml_gelu(ctx0, cur); - } - - // 4. Pointwise Projection (1x1) - if (block.pw_proj_w) { - cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); - if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); - } - - // Apply Layer Scaling if present - if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - - cur = ggml_mul(ctx0, cur, scale_w_reshaped); - } - - // 5. Residual Connection - bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); - bool same_channel = (inp->ne[2] == cur->ne[2]); - if (same_spatial && same_channel) { - cur = ggml_add(ctx0, cur, inp); - } - - return cur; -} - -// MobileNetV5 Builder (Gemma 3n) - Attention Block -ggml_tensor * clip_graph::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { - ggml_tensor * cur = inp; - - // --- Norm --- - if (block.attn_norm_w) { - cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); - } - - // --- 1. Q Calculation --- - ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); - - // --- 2. 
K Calculation (Downsampled) --- - // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) - ggml_tensor * k_inp = cur; - if (block.attn_k_dw_w) { - int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 - k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding - k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 - if (block.attn_k_norm_w) { - k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); - } - } - ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); - - // --- 3. V Calculation (Downsampled) --- - // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) - ggml_tensor * v_inp = cur; - if (block.attn_v_dw_w) { - int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 - v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding - v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 - if (block.attn_v_norm_w) { - v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); - } - } - ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); - - const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; - const int D = k->ne[2]; // Head dimension - const int n_head = q->ne[2] / D; - const int N = W * H; - - // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] - q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); - q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); - q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] - q = ggml_cont(ctx0, q); - - const int Wk = k->ne[0]; const int Hk = k->ne[1]; - const int M = Wk * Hk; - - // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] - k = ggml_reshape_3d(ctx0, k, M, D, B); - k = ggml_reshape_4d(ctx0, k, M, D, 1, B); - k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] - k = ggml_cont(ctx0, k); - - // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] - v = ggml_reshape_3d(ctx0, v, M, D, B); - v = ggml_reshape_4d(ctx0, v, M, D, 1, 
B); - v = ggml_cont(ctx0, v); // [M, D, 1, B] - - // --- Multi-Query Attention --- - float scale = 1.0f / sqrtf((float)D); - - // Step 1: Compute Q @ K.T - ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); - - scores = ggml_scale(ctx0, scores, scale); - - scores = ggml_soft_max(ctx0, scores); - - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); - - kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); - kqv = ggml_cont(ctx0, kqv); - - - kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); - kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); - kqv = ggml_cont(ctx0, kqv); - - // Output projection - cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); - - // --- Residual & Layer Scale (FIXED) --- - if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { - if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - cur = ggml_mul(ctx0, cur, scale_w_reshaped); - } - cur = ggml_add(ctx0, cur, inp); - } - - return cur; -} - // siglip2 naflex ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; @@ -2414,18 +2165,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } -// Rescale image from u8 to f32 without normalization (for models like GEMMA3N that use SiglipImageProcessorFast) -// This only converts from [0, 255] to [0.0, 1.0] range without applying mean/std normalization -static void rescale_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(src.buf[i]) / 255.0f; - } -} - // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not static void 
normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { dst.nx = src.nx; @@ -3123,13 +2862,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_GEMMA3N: { - // GEMMA3N uses SiglipImageProcessorFast which only rescales to [0.0, 1.0] without normalization - // Resize to 768x768 using bilinear interpolation, then rescale to f32 clip_image_u8 resized_image; int sz = params.image_size; img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false); clip_image_f32_ptr img_f32(clip_image_f32_init()); - rescale_image_u8_to_f32(resized_image, *img_f32); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(img_f32)); } break; @@ -3396,7 +3133,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution // regardless of input size (see architecture description) - n_patches = 16 * 16; // 256 tokens + n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size; } break; case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: @@ -3969,10 +3706,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; } -bool clip_is_gemma3n(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3N; -} - bool clip_has_vision_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_VISION; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index c244df2677f..68a0d6e857e 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -107,7 +107,6 @@ bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx); -bool clip_is_gemma3n(const struct clip_ctx * 
ctx); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 930da38e302..bc1185c10eb 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -1,5 +1,254 @@ #include "models.h" +// --- Helpers for MobileNetV5 Blocks --- +// RMS Norm 2D - normalizes over channels for each spatial position +ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { + // inp: [W, H, C, B] + + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_rms_norm(ctx0, cur, eps); + + if (weight) { + cur = ggml_mul(ctx0, cur, weight); + } + + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + + return cur; +} + +// Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) +ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + const int64_t ih = inp->ne[1]; // height + const int64_t iw = inp->ne[0]; // width + + // Calculate output size (ceil division) + const int64_t oh = (ih + stride_h - 1) / stride_h; + const int64_t ow = (iw + stride_w - 1) / stride_w; + + // Calculate padding needed + const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); + const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); + + // Split padding asymmetrically + const int pad_h_top = pad_h / 2; + const int pad_h_bottom = pad_h - pad_h_top; + const int pad_w_left = pad_w / 2; + const int pad_w_right = pad_w - pad_w_left; + + // Apply padding if needed + // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch + if (pad_h > 0 || pad_w > 0) { + 
inp = ggml_pad_ext(ctx0, inp, + pad_w_left, pad_w_right, // width padding (dim 0) + pad_h_top, pad_h_bottom, // height padding (dim 1) + 0, 0, // no channel padding (dim 2) + 0, 0); // no batch padding (dim 3) + } + + return inp; +} + + +// Edge Residual Block (Stage 0) +ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Expansion Conv (3x3) + if (stride == 2) { + // Case: Downsampling (Block 0) + // Replicates Conv2dSame(kernel=3, stride=2) + cur = pad_same_2d(cur, 3, 3, stride, stride); + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); + } else { + // Case: Normal 3x3 Block (Block 1, 2) + // Replicates Conv2d(kernel=3, stride=1, padding=1) + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); + } + + // BN + Activation + if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); + cur = ggml_gelu(ctx0, cur); + + // 2. Pointwise Linear Conv (1x1) + // 1x1 Convs usually have padding=0 and stride=1 + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); + if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); + + // 3. Residual Connection + // Only apply residual if spatial dimensions and channels match (stride 1) + if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Depthwise Start (Optional) + // NOTE: dw_start always has stride=1 (no downsampling here) + if (block.dw_start_w) { + int k = block.dw_start_w->ne[0]; // 3 or 5 + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); + } + + // 2. 
Pointwise Expansion (1x1) + if (block.pw_exp_w) { + // Standard 1x1 conv, pad=0, stride=1 + cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 3. Depthwise Mid (Optional) + // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) + if (block.dw_mid_w) { + int k = block.dw_mid_w->ne[0]; // 3 or 5 + + if (stride > 1) { + // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding + cur = pad_same_2d(cur, k, k, stride, stride); + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 + } else { + // Case: Stride 1 -> Use Standard Symmetric Padding + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); + } + + if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 4. Pointwise Projection (1x1) + if (block.pw_proj_w) { + cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); + } + + // Apply Layer Scaling if present + if (block.layer_scale_w) { + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + + // 5. Residual Connection + bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); + bool same_channel = (inp->ne[2] == cur->ne[2]); + if (same_spatial && same_channel) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// MobileNetV5 Builder (Gemma 3n) - Attention Block +ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { + ggml_tensor * cur = inp; + + // --- Norm --- + if (block.attn_norm_w) { + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); + } + + // --- 1. 
Q Calculation --- + ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); + + // --- 2. K Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * k_inp = cur; + if (block.attn_k_dw_w) { + int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 + k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding + k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_k_norm_w) { + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); + } + } + ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); + + // --- 3. V Calculation (Downsampled) --- + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * v_inp = cur; + if (block.attn_v_dw_w) { + int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 + v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding + v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_v_norm_w) { + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); + } + } + ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); + + const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; + const int D = k->ne[2]; // Head dimension + const int n_head = q->ne[2] / D; + const int N = W * H; + + // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] + q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); + q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); + q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] + q = ggml_cont(ctx0, q); + + const int Wk = k->ne[0]; const int Hk = k->ne[1]; + const int M = Wk * Hk; + + // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] + k = ggml_reshape_3d(ctx0, k, M, D, B); + k = ggml_reshape_4d(ctx0, k, M, D, 1, B); + k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] + k = ggml_cont(ctx0, k); + + // 
Process V: [Wk, Hk, D, B] -> [M, D, 1, B] + v = ggml_reshape_3d(ctx0, v, M, D, B); + v = ggml_reshape_4d(ctx0, v, M, D, 1, B); + v = ggml_cont(ctx0, v); // [M, D, 1, B] + + // --- Multi-Query Attention --- + float scale = 1.0f / sqrtf((float)D); + + // Step 1: Compute Q @ K.T + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); + + scores = ggml_scale(ctx0, scores, scale); + + scores = ggml_soft_max(ctx0, scores); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); + + kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); + kqv = ggml_cont(ctx0, kqv); + + + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); + kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); + kqv = ggml_cont(ctx0, kqv); + + // Output projection + cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); + + // --- Residual & Layer Scale (FIXED) --- + if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { + if (block.layer_scale_w) { + ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, + 1, 1, block.layer_scale_w->ne[0], 1); + cur = ggml_mul(ctx0, cur, scale_w_reshaped); + } + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + ggml_cgraph * clip_graph_mobilenetv5::build() { fprintf(stderr, "\n--- START build_mobilenetv5 ---\n"); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 3875285fe92..54664d10ce3 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -70,4 +70,32 @@ struct clip_graph_glm4v : clip_graph { struct clip_graph_mobilenetv5 : clip_graph { clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + + ggml_tensor * rms_norm_2d( + ggml_tensor * inp, + ggml_tensor * weight, + float eps = 1e-6f); + + ggml_tensor* pad_same_2d( + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, + int dilation_w = 1); + + ggml_tensor * build_edge_residual( + ggml_tensor * inp, + const 
mobilenetv5_block & block, + int stride); + + ggml_tensor * build_inverted_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride); + + ggml_tensor * build_mobilenet_attn( + ggml_tensor * inp, + const mobilenetv5_block & block); }; From 86618c7c0a0a3aff2aa12294fb17b2ad15610c29 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Mon, 22 Dec 2025 13:45:24 +0000 Subject: [PATCH 08/20] Remove obsolete comments --- tools/mtmd/clip.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index e86a09bb5c1..dd778ea3c96 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1536,8 +1536,6 @@ struct clip_model_loader { model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); - // IMPORTANT: Your GGUF log shows 'v.enc.msfa.norm.weight' -> shape {2048} - // Ensure TN_MNV5_MSFA_NORM matches this string model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); // Dynamically load blocks stage by stage @@ -1620,8 +1618,6 @@ struct clip_model_loader { // Load projection weights (similar to Gemma3) model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); - // model.mm_post_proj_norm_w = get_tensor(TN_MM_POST_PROJ_N); // CRITICAL: Post projection norm - // Load additional Gemma3n projection tensors model.mm_0_w = get_tensor("mm.embedding.weight", false); // Input embedding model.mm_1_w = get_tensor("mm.hard_emb_norm.weight", false); // Hard embedding norm } break; From e2835e9fbe0870dfb642f54295256515a3fd5471 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 26 Dec 2025 19:41:05 +0000 Subject: [PATCH 09/20] - convert_hf_to_gguf.py & constants.py & tensor_mapping.py: Use explicit mapping: Custom map for double indexed blocks and tensor_mapping.py for rest - convert_hf_to_gguf.py: Unsqueeze Stem Bias and Layer scale tensors to correct shape while converting 
to gguf - mobilenetv5.cpp: Remove explicit reshaping of Stem Bias and Layer scale which are now handled while converting to gguf, replace fprintf with LOG_* - clip.cpp: Remove unused embedding and hard_emb_norm tensor loading --- convert_hf_to_gguf.py | 78 ++++++++++++++++++++----------- gguf-py/gguf/constants.py | 33 ++++++++++--- gguf-py/gguf/tensor_mapping.py | 42 +++++++++++------ tools/mtmd/clip.cpp | 3 -- tools/mtmd/models/mobilenetv5.cpp | 42 +++++++---------- 5 files changed, 122 insertions(+), 76 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 55e82fe9128..abd65101b52 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5968,6 +5968,32 @@ class Gemma3nVisionModel(MmprojModel): """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" n_block_keys = [] + # Double indexed mapping for MobileNetV5 blocks + block_tensor_mapping = { + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.enc.blocks.{bid}.{sid}.conv_exp.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.enc.blocks.{bid}.{sid}.bn1.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.enc.blocks.{bid}.{sid}.conv_pwl.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.enc.blocks.{bid}.{sid}.bn2.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_start.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_start.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.conv.weight", + 
"model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.enc.blocks.{bid}.{sid}.layer_scale.gamma", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.query.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.key.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.value.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.output.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.key.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.key.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.value.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.value.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.enc.blocks.{bid}.{sid}.norm.weight", + } + def find_hparam(self, keys: list[str], optional: bool = False) -> Any: """Override to return 0 for block count since MobileNetV5 is CNN-based""" if not keys: # If n_block_keys is empty (our case) @@ -6027,10 +6053,23 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) + 
def custom_map(self, name: str) -> str: + """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping.""" + parts = name.split(".") + # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix + if len(parts) >= 7: + bid, sid = parts[4], parts[5] + suffix = ".".join(parts[6:]) + template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}" + if template in self.block_tensor_mapping: + return self.block_tensor_mapping[template].format(bid=bid, sid=sid) + + raise ValueError(f"Unknown name: {name}") + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # Gemma3n uses different prefixes than other models: + # Gemma3n uses # - model.embed_vision.* for projection layers # - model.vision_tower.* for vision encoder # Skip non-vision tensors @@ -6038,34 +6077,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name.startswith("model.vision_tower.")): return [] - # Strip "model." prefix to match expected llama.cpp format - if name.startswith("model."): - name = name[6:] # Remove "model." 
prefix - - # Process MobileNetV5 and projection tensors - name = name.replace("_weight", ".weight") - - # Rename embed_vision to match our C++ implementation expectations - name = name.replace("embed_vision.", "") - - # Rename vision_tower.timm_model to vision_tower for cleaner naming - name = name.replace("vision_tower.timm_model.", "vision_tower.") - - # Handle normalization layer naming - name = name.replace("hard_embedding_norm", "hard_emb_norm") - name = name.replace("soft_embedding_norm", "soft_emb_norm") - - # Gemma3n uses Gemma3p5RMSNorm which has scale_shift=0, so no correction needed - # Unlike Gemma3 which uses Gemma3RMSNorm with scale_shift=1 - if "soft_emb_norm.weight" in name: - # No correction needed for Gemma3n - pass - - if name.startswith("vision_tower."): - tensor_suffix = name[13:] - return [(f"v.enc.{tensor_suffix}", data_torch)] + if name.startswith("model.vision_tower.timm_model.blocks."): + # Double-indexed block tensors through custom logic + new_name = self.custom_map(name) else: - return [(self.map_tensor_name(name), data_torch)] + # Route non-repeating (conv_stem, msfa, embedding, etc.) 
and uncaught through tensor_mapping.py + new_name = self.map_tensor_name(name) + + if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): + data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] + + yield (new_name, data_torch) @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 869a8582b12..975a99a61a0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -669,7 +669,14 @@ class MODEL_TENSOR(IntEnum): V_MM_SOFT_EMB_NORM = auto() # gemma3 V_MM_EMBEDDING = auto() # gemma3n V_MM_HARD_EMB_NORM = auto() # gemma3n - V_MM_POST_PROJ_NORM = auto() # gemma3n + V_ENC_CONV_STEM = auto() # gemma3n + V_ENC_CONV_STEM_BIAS = auto() # gemma3n + V_ENC_CONV_STEM_NORM = auto() # gemma3n + V_ENC_MSFA_EXP = auto() # gemma3n + V_ENC_MSFA_EXP_NORM = auto() # gemma3n + V_ENC_MSFA_PROJ = auto() # gemma3n + V_ENC_MSFA_PROJ_NORM = auto() # gemma3n + V_ENC_MSFA_NORM = auto() # gemma3n V_RESMPL_POS_EMBD_K = auto() # minicpmv V_RESMPL_ATTN_Q = auto() # minicpmv V_RESMPL_ATTN_K = auto() # minicpmv @@ -1061,10 +1068,17 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", - MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", - MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", - MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", - MODEL_TENSOR.V_MM_POST_PROJ_NORM: "mm.post_proj_norm", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n + MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n + MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM: "v.enc.conv_stem.conv", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.enc.conv_stem.conv_bias", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.enc.conv_stem.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP:
"v.enc.msfa.ffn.pw_exp.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.enc.msfa.ffn.pw_exp.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.enc.msfa.ffn.pw_proj.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.enc.msfa.ffn.pw_proj.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_NORM: "v.enc.msfa.norm", # gemma3n MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", @@ -1165,7 +1179,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_SOFT_EMB_NORM, MODEL_TENSOR.V_MM_EMBEDDING, MODEL_TENSOR.V_MM_HARD_EMB_NORM, - MODEL_TENSOR.V_MM_POST_PROJ_NORM, + MODEL_TENSOR.V_ENC_CONV_STEM, + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS, + MODEL_TENSOR.V_ENC_CONV_STEM_NORM, + MODEL_TENSOR.V_ENC_MSFA_EXP, + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM, + MODEL_TENSOR.V_ENC_MSFA_PROJ, + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM, + MODEL_TENSOR.V_ENC_MSFA_NORM, MODEL_TENSOR.V_RESMPL_POS_EMBD_K, MODEL_TENSOR.V_RESMPL_ATTN_Q, MODEL_TENSOR.V_RESMPL_ATTN_K, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 3e1cf8a136f..9b17cb1ef7d 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -120,25 +120,41 @@ class TensorNameMap: "backbone.embed", # roberta ), - # Vision multimodal projector tensors (non-block) for gemma3n + MODEL_TENSOR.V_MM_EMBEDDING: ( + "model.embed_vision.embedding", # gemma3n + ), + MODEL_TENSOR.V_MM_HARD_EMB_NORM: ( + "model.embed_vision.hard_embedding_norm", # gemma3n + ), MODEL_TENSOR.V_MM_INP_PROJ: ( - "embedding_projection", # gemma3n + "model.embed_vision.embedding_projection", # gemma3n ), - MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( - "soft_emb_norm", # gemma3n + "model.embed_vision.soft_embedding_norm", # gemma3n ), - - MODEL_TENSOR.V_MM_EMBEDDING: ( - "embedding", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM: ( + "model.vision_tower.timm_model.conv_stem.conv", # gemma3n ), - - MODEL_TENSOR.V_MM_HARD_EMB_NORM: ( - "hard_emb_norm", 
# gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: ( + "model.vision_tower.timm_model.conv_stem.conv.bias", # gemma3n ), - - MODEL_TENSOR.V_MM_POST_PROJ_NORM: ( - "post_proj_norm", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_NORM: ( + "model.vision_tower.timm_model.conv_stem.bn", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_EXP: ( + "model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: ( + "model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_PROJ: ( + "model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: ( + "model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n + ), + MODEL_TENSOR.V_ENC_MSFA_NORM: ( + "model.vision_tower.timm_model.msfa.norm", # gemma3n ), } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index dd778ea3c96..d54b893b61f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1615,11 +1615,8 @@ struct clip_model_loader { LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1); } } - // Load projection weights (similar to Gemma3) model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); - model.mm_0_w = get_tensor("mm.embedding.weight", false); // Input embedding - model.mm_1_w = get_tensor("mm.hard_emb_norm.weight", false); // Hard embedding norm } break; case PROJECTOR_TYPE_IDEFICS3: { diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index bc1185c10eb..1a5c61fb581 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -1,6 +1,6 @@ #include "models.h" -// --- Helpers for MobileNetV5 Blocks --- +// Helpers for MobileNetV5 Blocks // RMS Norm 2D - normalizes over channels for each spatial position ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { // inp: [W, H, C, B] @@ -19,7 
+19,7 @@ ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor return cur; } -// Helper for Conv2dSame padding (asymmetric SAME padding like PyTorch/TF) +// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { const int64_t ih = inp->ne[1]; // height const int64_t iw = inp->ne[0]; // width @@ -87,6 +87,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, con return cur; } +// Universal Inverted Residual Block (Stage 1+) ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { ggml_tensor * cur = inp; @@ -133,11 +134,8 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, } // Apply Layer Scaling if present - if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - - cur = ggml_mul(ctx0, cur, scale_w_reshaped); + if (block.layer_scale_w) { + cur = ggml_mul(ctx0, cur, block.layer_scale_w); } // 5. Residual Connection @@ -150,19 +148,19 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, return cur; } -// MobileNetV5 Builder (Gemma 3n) - Attention Block +// Attention Block (MQA) ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { ggml_tensor * cur = inp; - // --- Norm --- + // Norm if (block.attn_norm_w) { cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); } - // --- 1. Q Calculation --- + // 1. Q Calculation ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); - // --- 2. K Calculation (Downsampled) --- + // 2. 
K Calculation (Downsampled) // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) ggml_tensor * k_inp = cur; if (block.attn_k_dw_w) { @@ -175,7 +173,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co } ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); - // --- 3. V Calculation (Downsampled) --- + // 3. V Calculation (Downsampled) // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) ggml_tensor * v_inp = cur; if (block.attn_v_dw_w) { @@ -213,7 +211,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co v = ggml_reshape_4d(ctx0, v, M, D, 1, B); v = ggml_cont(ctx0, v); // [M, D, 1, B] - // --- Multi-Query Attention --- + // Multi-Query Attention float scale = 1.0f / sqrtf((float)D); // Step 1: Compute Q @ K.T @@ -236,12 +234,10 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co // Output projection cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); - // --- Residual & Layer Scale (FIXED) --- + // Residual & Layer Scale if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { if (block.layer_scale_w) { - ggml_tensor * scale_w_reshaped = ggml_reshape_4d(ctx0, block.layer_scale_w, - 1, 1, block.layer_scale_w->ne[0], 1); - cur = ggml_mul(ctx0, cur, scale_w_reshaped); + cur = ggml_mul(ctx0, cur, block.layer_scale_w); } cur = ggml_add(ctx0, cur, inp); } @@ -250,9 +246,6 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co } ggml_cgraph * clip_graph_mobilenetv5::build() { - - fprintf(stderr, "\n--- START build_mobilenetv5 ---\n"); - ggml_tensor * inp = build_inp_raw(); // 1. 
Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) @@ -260,9 +253,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 if (model.mobilenet_stem_conv_b) { - // Bias is [C, 1, 1, 1], need to reshape to [1, 1, C, 1] for broadcasting to [W, H, C, B] - ggml_tensor * bias = ggml_reshape_4d(ctx0, model.mobilenet_stem_conv_b, 1, 1, cur->ne[2], 1); - cur = ggml_add(ctx0, cur, bias); + cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b); } if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w); cur = ggml_gelu(ctx0, cur); @@ -332,7 +323,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Safety check for non-integer scaling if strictly replicating if (high_res_w % feat_w != 0) { - fprintf(stderr, "Warning: Non-integer scaling detected in MSFA\n"); + LOG_WRN("%s: non-integer scaling detected\n", __func__); } // Upsample (Nearest Neighbor) @@ -388,7 +379,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Avg Pool: Kernel=s, Stride=s cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); } else { - fprintf(stderr, "Error: Irregular downsampling stride required.\n"); + LOG_ERR("%s: irregular downsampling stride required\n", __func__); } } @@ -418,7 +409,6 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 2. FEATURE SCALING // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 - // This prevents the signal from vanishing during the subsequent RMSNorm. const float scale_factor = sqrtf((float)C); cur = ggml_scale(ctx0, cur, scale_factor); From 632e29f55152b7e623e0a5016c8391ce2696bad3 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 26 Dec 2025 20:02:21 +0000 Subject: [PATCH 10/20] - Rename tensors to v.conv..., v.blk..., v.msfa... 
to better align with already existing terminology --- convert_hf_to_gguf.py | 44 ++++++++++++++-------------- gguf-py/gguf/constants.py | 16 +++++------ tools/mtmd/clip-impl.h | 60 +++++++++++++++++++-------------------- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index abd65101b52..d5d52b8bf17 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5970,28 +5970,28 @@ class Gemma3nVisionModel(MmprojModel): # Double indexed mapping for MobileNetV5 blocks block_tensor_mapping = { - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.enc.blocks.{bid}.{sid}.conv_exp.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.enc.blocks.{bid}.{sid}.bn1.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.enc.blocks.{bid}.{sid}.conv_pwl.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.enc.blocks.{bid}.{sid}.bn2.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_start.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_start.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.enc.blocks.{bid}.{sid}.dw_mid.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_exp.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.enc.blocks.{bid}.{sid}.pw_proj.bn.weight", - 
"model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.enc.blocks.{bid}.{sid}.layer_scale.gamma", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.query.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.key.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.value.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.enc.blocks.{bid}.{sid}.attn.output.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.key.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.key.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.enc.blocks.{bid}.{sid}.attn.value.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.enc.blocks.{bid}.{sid}.attn.value.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.enc.blocks.{bid}.{sid}.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": 
"v.blk.{bid}.{sid}.dw_mid.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", } def find_hparam(self, keys: list[str], optional: bool = False) -> Any: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 975a99a61a0..40bb79400b0 100644 --- a/gguf-py/gguf/constants.py +++ 
b/gguf-py/gguf/constants.py @@ -1071,14 +1071,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM: "v.enc.conv_stem.conv", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.enc.conv_stem.conv_bias", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.enc.conv_stem.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_EXP: "v.enc.msfa.ffn.pw_exp.conv", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.enc.msfa.ffn.pw_exp.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.enc.msfa.ffn.pw_proj.conv", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.enc.msfa.ffn.pw_proj.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_NORM: "v.enc.msfa.norm", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv_bias", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.msfa.ffn.pw_proj.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.msfa.ffn.pw_proj.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_NORM: "v.msfa.norm", # gemma3n MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 24a1ef52d08..ce312711e5e 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -154,44 +154,44 @@ #define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" // mobilenetv5 (gemma3n) definitions -#define TN_MNV5_STEM_CONV "v.enc.conv_stem.conv.weight" -#define TN_MNV5_STEM_BIAS "v.enc.conv_stem.conv.bias" -#define TN_MNV5_STEM_BN "v.enc.conv_stem.bn.weight" +#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight" 
+#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias" +#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight" // Stage 0 Block (Edge Residual) -#define TN_MNV5_BLK_S0_EXP_W "v.enc.blocks.%d.%d.conv_exp.weight" -#define TN_MNV5_BLK_S0_BN1_W "v.enc.blocks.%d.%d.bn1.weight" -#define TN_MNV5_BLK_S0_PWL_W "v.enc.blocks.%d.%d.conv_pwl.weight" -#define TN_MNV5_BLK_S0_BN2_W "v.enc.blocks.%d.%d.bn2.weight" +#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight" +#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight" +#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight" +#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight" // Stage 1+ Block (Universal Inverted Residual) -#define TN_MNV5_BLK_DW_START_W "v.enc.blocks.%d.%d.dw_start.conv.weight" -#define TN_MNV5_BLK_DW_START_BN "v.enc.blocks.%d.%d.dw_start.bn.weight" -#define TN_MNV5_BLK_DW_MID_W "v.enc.blocks.%d.%d.dw_mid.conv.weight" -#define TN_MNV5_BLK_DW_MID_BN "v.enc.blocks.%d.%d.dw_mid.bn.weight" -#define TN_MNV5_BLK_PW_EXP_W "v.enc.blocks.%d.%d.pw_exp.conv.weight" -#define TN_MNV5_BLK_PW_EXP_BN "v.enc.blocks.%d.%d.pw_exp.bn.weight" -#define TN_MNV5_BLK_PW_PROJ_W "v.enc.blocks.%d.%d.pw_proj.conv.weight" -#define TN_MNV5_BLK_PW_PROJ_BN "v.enc.blocks.%d.%d.pw_proj.bn.weight" -#define TN_MNV5_BLK_LAYER_SCALE "v.enc.blocks.%d.%d.layer_scale.gamma" +#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight" +#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight" +#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight" +#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight" +#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight" +#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight" +#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight" +#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight" +#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma" // Attention Components -#define TN_MNV5_ATTN_Q_W "v.enc.blocks.%d.%d.attn.query.proj.weight" -#define 
TN_MNV5_ATTN_K_W "v.enc.blocks.%d.%d.attn.key.proj.weight" -#define TN_MNV5_ATTN_V_W "v.enc.blocks.%d.%d.attn.value.proj.weight" -#define TN_MNV5_ATTN_O_W "v.enc.blocks.%d.%d.attn.output.proj.weight" -#define TN_MNV5_ATTN_K_DW "v.enc.blocks.%d.%d.attn.key.down_conv.weight" -#define TN_MNV5_ATTN_K_NORM "v.enc.blocks.%d.%d.attn.key.norm.weight" -#define TN_MNV5_ATTN_V_DW "v.enc.blocks.%d.%d.attn.value.down_conv.weight" -#define TN_MNV5_ATTN_V_NORM "v.enc.blocks.%d.%d.attn.value.norm.weight" -#define TN_MNV5_ATTN_NORM "v.enc.blocks.%d.%d.norm.weight" // Block norm used in attn blocks +#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight" +#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight" +#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight" +#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight" +#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight" +#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight" +#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight" +#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight" +#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks // MSFA -#define TN_MNV5_MSFA_FFN_EXP_W "v.enc.msfa.ffn.pw_exp.conv.weight" -#define TN_MNV5_MSFA_FFN_EXP_BN "v.enc.msfa.ffn.pw_exp.bn.weight" -#define TN_MNV5_MSFA_FFN_PROJ_W "v.enc.msfa.ffn.pw_proj.conv.weight" -#define TN_MNV5_MSFA_FFN_PROJ_BN "v.enc.msfa.ffn.pw_proj.bn.weight" -#define TN_MNV5_MSFA_NORM "v.enc.msfa.norm.weight" +#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight" +#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight" +#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight" +#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight" +#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight" // align x to upper multiple of n From d37c22b2c5dd3e551e0e18e1061c2d89d1e8f8ff Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Fri, 26 Dec 2025 20:53:11 +0000 
Subject: [PATCH 11/20] Fix stem conv bias name --- gguf-py/gguf/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 40bb79400b0..962e66b5c23 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1072,7 +1072,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv_bias", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv.bias", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n From 58667f506cf1e3f0c14b4bf21042c1b1506202eb Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 27 Dec 2025 18:56:24 +0000 Subject: [PATCH 12/20] Remove explicit handling of bias term for stem conv --- gguf-py/gguf/constants.py | 3 --- gguf-py/gguf/tensor_mapping.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 962e66b5c23..984ff4fb11a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -670,7 +670,6 @@ class MODEL_TENSOR(IntEnum): V_MM_EMBEDDING = auto() # gemma3n V_MM_HARD_EMB_NORM = auto() # gemma3n V_ENC_CONV_STEM = auto() # gemma3n - V_ENC_CONV_STEM_BIAS = auto() # gemma3n V_ENC_CONV_STEM_NORM = auto() # gemma3n V_ENC_MSFA_EXP = auto() # gemma3n V_ENC_MSFA_EXP_NORM = auto() # gemma3n @@ -1072,7 +1071,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: "v.conv_stem.conv.bias", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM_NORM: 
"v.conv_stem.bn", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n @@ -1180,7 +1178,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_EMBEDDING, MODEL_TENSOR.V_MM_HARD_EMB_NORM, MODEL_TENSOR.V_ENC_CONV_STEM, - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS, MODEL_TENSOR.V_ENC_CONV_STEM_NORM, MODEL_TENSOR.V_ENC_MSFA_EXP, MODEL_TENSOR.V_ENC_MSFA_EXP_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 9b17cb1ef7d..5e35134546f 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -135,9 +135,6 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_CONV_STEM: ( "model.vision_tower.timm_model.conv_stem.conv", # gemma3n ), - MODEL_TENSOR.V_ENC_CONV_STEM_BIAS: ( - "model.vision_tower.timm_model.conv_stem.conv.bias", # gemma3n - ), MODEL_TENSOR.V_ENC_CONV_STEM_NORM: ( "model.vision_tower.timm_model.conv_stem.bn", # gemma3n ), From 47b7dd13462793487c316d1401ce54917f5ca038 Mon Sep 17 00:00:00 2001 From: Simranjeet Singh Date: Sat, 27 Dec 2025 19:00:54 +0000 Subject: [PATCH 13/20] - Change order of addition in "project_per_layer_inputs" to support broadcasting of vision inp_per_layer - Simplify the vision embeddings path of "get_per_layer_inputs" to output [n_embd_altup, n_layer, 1], broadcastable --- src/models/gemma3n-iswa.cpp | 54 +++++++------------------------------ 1 file changed, 10 insertions(+), 44 deletions(-) diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index e172b9a79f8..39633dc3504 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -248,9 +248,9 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) { // equivalent to get_per_layer_inputs() in python code // output shape: [n_embd_altup, n_layer, n_tokens] ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { - auto inp = std::make_unique(); ggml_tensor * inp_per_layer; if (ubatch.token) { + 
auto inp = std::make_unique(); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); ggml_set_input(inp->tokens); res->t_tokens = inp->tokens; @@ -258,54 +258,20 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() { inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens); inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); cb(inp_per_layer, "inp_per_layer_selected", -1); + res->add_input(std::move(inp)); } else { - // For embedding inputs (e.g., from vision encoder) - // Vision tokens should use the padding token (ID=0) embedding - // from tok_embd_per_layer, NOT project the vision embeddings. - // The projection happens later in project_per_layer_inputs(). - // This matches PyTorch behavior: - // per_layer_inputs_tokens = torch.where(mask, input_ids, torch.zeros_like(input_ids)) - // per_layer_inputs = EmbedPerLayer(per_layer_inputs_tokens) # Uses padding (0) for vision - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - ggml_set_input(inp->embd); - - // tok_embd_per_layer shape: [embd_size, vocab_size] where embd_size = n_embd_altup * n_layer + // Vision embedding path: use padding token (ID=0) embedding const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer - // Create zeros tensor [embd_size, n_tokens] by projecting vision embeddings and multiplying by 0 - // First, project inp->embd [n_embd, n_tokens] to per-layer space [embd_size, n_tokens] - ggml_tensor * zeros_per_layer = ggml_mul_mat(ctx0, model.per_layer_model_proj, inp->embd); - zeros_per_layer = ggml_scale(ctx0, zeros_per_layer, 0.0f); // Multiply by 0 to get zeros - ggml_set_name(zeros_per_layer, "zeros_per_layer"); + // Extract and dequantize padding token embedding (column 0) + ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0); + ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size); + inp_per_layer = 
ggml_cpy(ctx0, padding_q, padding_f32); - // Extract column 0 (padding token's embedding) as a vector: [embd_size] - // Note: tok_embd_per_layer is quantized (q8_0), so the view is also q8_0 - ggml_tensor * padding_embd_vec_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, - embd_size, // number of elements - 0); // offset (column 0) - ggml_set_name(padding_embd_vec_q, "padding_token_emb_q8"); - - // Dequantize to f32 using ggml_cpy - ggml_tensor * padding_embd_vec_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size); - ggml_tensor * padding_embd_vec = ggml_cpy(ctx0, padding_embd_vec_q, padding_embd_vec_f32); - ggml_set_name(padding_embd_vec, "padding_token_emb_f32"); - - // Reshape to [embd_size, 1] for broadcasting - ggml_tensor * padding_embd_col = ggml_reshape_2d(ctx0, padding_embd_vec, embd_size, 1); - - // Add: zeros [embd_size, n_tokens] + padding [embd_size, 1] = broadcasted padding [embd_size, n_tokens] - ggml_tensor * inp_per_layer_flat = ggml_add(ctx0, zeros_per_layer, padding_embd_col); - ggml_set_name(inp_per_layer_flat, "inp_per_layer_broadcasted"); - - // Reshape to [n_embd_altup, n_layer, n_tokens] for per-layer processing - inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer_flat, n_embd_altup, n_layer, n_tokens); - - // Apply same scaling as text tokens - // inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); + // Reshape to [n_embd_altup, n_layer, 1] + inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1); cb(inp_per_layer, "inp_per_layer_vision", -1); } - res->add_input(std::move(inp)); return inp_per_layer; } @@ -323,7 +289,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp -1); // [n_embd_altup, n_layer, n_tokens] cb(per_layer_proj, "per_layer_proj", -1); - inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj); + inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer); inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale); 
cb(inp_per_layer, "inp_per_layer", -1); From eea58817f5d40a064d5536ae7b9616eede3b62cf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 17:13:24 +0100 Subject: [PATCH 14/20] clean up conversion script --- convert_hf_to_gguf.py | 53 ++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d7df9f2d88a..4f79ceb77c7 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6041,12 +6041,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors + @ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") class Gemma3nVisionModel(MmprojModel): """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" - n_block_keys = [] - # Double indexed mapping for MobileNetV5 blocks + # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py) + # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py block_tensor_mapping = { "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", @@ -6072,39 +6073,24 @@ class Gemma3nVisionModel(MmprojModel): "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", } - def find_hparam(self, keys: list[str], optional: bool = False) -> Any: - """Override to return 0 for block count since MobileNetV5 is CNN-based""" - if not keys: # If n_block_keys is empty (our case) + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + # force n_layers to 0 in __init__() + # we have to do this because self.hparams_vision is not yet accessible for modification inside __init__() + if "n_layers" in list(keys): return 0 - # Otherwise use parent implementation return 
super().find_hparam(keys, optional) def __init__(self, *args, **kwargs): # Parent init will call find_hparam which now returns 0 for empty keys super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["n_layers"] = 0 + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size", 2048) * 4 + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) - def find_vparam(self, keys: list[str], optional: bool = False) -> Any: - """Override to provide hardcoded MobileNetV5 parameters that aren't in config""" - # Handle empty keys list (n_block_keys) - return 0 for CNN architecture - if not keys: - return 0 - - if "intermediate_size" in keys: - # Typical expansion is 4x the embedding dimension - hidden_size = self.hparams_vision.get("hidden_size", 2048) - return hidden_size * 4 - - if "num_attention_heads" in keys or "num_heads" in keys: - # Multi-Query Attention with 8 heads - return 8 - - # For other parameters, use parent implementation - return super().find_vparam(keys, optional) - - def set_gguf_parameters(self): - # MobileNetV5 does not use normalisation at all - self.preprocessor_config["image_mean"] = [0.0 , 0.0 , 0.0 ] - self.preprocessor_config["image_std"] = [1.0 , 1.0 , 1.0 ] + # MobileNetV5 does not use image_mean/std + self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0] + self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0] self.hparams_vision["image_size"] = self.preprocessor_config.get( "size", {"height": 768, "width": 768} )["height"] @@ -6114,13 +6100,9 @@ def set_gguf_parameters(self): image_size = self.hparams_vision["image_size"] self.hparams_vision["patch_size"] = image_size // image_seq_length - # Now call parent which will use the corrected values + def set_gguf_parameters(self): super().set_gguf_parameters() - - # Set projector type to GEMMA3N self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) - - # MobileNetV5 specific 
parameters self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) def tensor_force_quant(self, name, new_name, bid, n_dims): @@ -6151,8 +6133,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # - model.embed_vision.* for projection layers # - model.vision_tower.* for vision encoder # Skip non-vision tensors - if not (name.startswith("model.embed_vision.") or - name.startswith("model.vision_tower.")): + if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): return [] if name.startswith("model.vision_tower.timm_model.blocks."): @@ -6161,7 +6142,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py new_name = self.map_tensor_name(name) - + if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] From bfbb3158f1c63ff9803960124cf7a3d13a222bcd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 17:16:17 +0100 Subject: [PATCH 15/20] fix code style --- tools/mtmd/clip-model.h | 8 +++--- tools/mtmd/clip.cpp | 8 +++--- tools/mtmd/models/mobilenetv5.cpp | 46 +++++++++++++++---------------- tools/mtmd/models/models.h | 26 ++++++++--------- tools/mtmd/mtmd.cpp | 2 +- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 3264d759f62..d4ff9151bb0 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -184,7 +184,7 @@ struct mobilenetv5_block { // Stage 1+ (Universal Inverted Residual) ggml_tensor * dw_start_w = nullptr; ggml_tensor * dw_start_bn_w = nullptr; - + ggml_tensor * pw_exp_w = nullptr; ggml_tensor * pw_exp_bn_w = nullptr; @@ -201,13 +201,13 @@ struct mobilenetv5_block { ggml_tensor * attn_k_w = nullptr; ggml_tensor * 
attn_v_w = nullptr; ggml_tensor * attn_o_w = nullptr; - + // Optional downsampling/norm in attention ggml_tensor * attn_k_dw_w = nullptr; ggml_tensor * attn_k_norm_w = nullptr; ggml_tensor * attn_v_dw_w = nullptr; ggml_tensor * attn_v_norm_w = nullptr; - + // Block norm (often present in attention blocks) ggml_tensor * attn_norm_w = nullptr; }; @@ -342,7 +342,7 @@ struct clip_model { ggml_tensor * msfa_ffn_expand_w = nullptr; ggml_tensor * msfa_ffn_project_w = nullptr; ggml_tensor * msfa_ffn_expand_bn = nullptr; - ggml_tensor * msfa_ffn_project_bn = nullptr; + ggml_tensor * msfa_ffn_project_bn = nullptr; // pixtral, glm4v diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 04109a07786..f1b74d866f2 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1570,13 +1570,13 @@ struct clip_model_loader { model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); - + model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); // Dynamically load blocks stage by stage for (int stage = 0; stage < 4; ++stage) { int blocks_found_in_stage = 0; - + for (int blk_idx = 0; ; ++blk_idx) { bool found_block = false; mobilenetv5_block block; @@ -1588,7 +1588,7 @@ struct clip_model_loader { block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false); block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false); block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false); - } + } // 2. 
Check for UIR (Universal Inverted Residual) else { // Check for dw_start OR pw_exp (some UIR blocks skip dw_start) @@ -1643,7 +1643,7 @@ struct clip_model_loader { break; } } - + // Track where this stage ends in the flat vector if (blocks_found_in_stage > 0) { model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1); diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 1a5c61fb581..78db081ea32 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -8,7 +8,7 @@ ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); cur = ggml_cont(ctx0, cur); cur = ggml_rms_norm(ctx0, cur, eps); - + if (weight) { cur = ggml_mul(ctx0, cur, weight); } @@ -61,7 +61,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, con if (stride == 2) { // Case: Downsampling (Block 0) // Replicates Conv2dSame(kernel=3, stride=2) - cur = pad_same_2d(cur, 3, 3, stride, stride); + cur = pad_same_2d(cur, 3, 3, stride, stride); cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); } else { // Case: Normal 3x3 Block (Block 1, 2) @@ -112,7 +112,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) if (block.dw_mid_w) { int k = block.dw_mid_w->ne[0]; // 3 or 5 - + if (stride > 1) { // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding cur = pad_same_2d(cur, k, k, stride, stride); @@ -134,7 +134,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, } // Apply Layer Scaling if present - if (block.layer_scale_w) { + if (block.layer_scale_w) { cur = ggml_mul(ctx0, cur, block.layer_scale_w); } @@ -148,7 +148,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, return cur; } -// Attention Block (MQA) 
+// Attention Block (MQA) ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { ggml_tensor * cur = inp; @@ -198,7 +198,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co q = ggml_cont(ctx0, q); const int Wk = k->ne[0]; const int Hk = k->ne[1]; - const int M = Wk * Hk; + const int M = Wk * Hk; // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] k = ggml_reshape_3d(ctx0, k, M, D, B); @@ -225,7 +225,7 @@ ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, co kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); kqv = ggml_cont(ctx0, kqv); - + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); @@ -262,7 +262,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 2. Blocks std::vector intermediate_features; const int total_blocks = model.mobilenet_blocks.size(); - + auto is_stage_start = [&](int i) { if (i == 0) return true; for (int end_idx : model.mobilenet_stage_ends) { @@ -297,7 +297,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // 3. Multi-Scale Fusion Adapter (MSFA) if (!intermediate_features.empty()) { - + // A. Reference Resolution: PyTorch implementation uses inputs[0] // We assume intermediate_features[0] is the "High Resolution" target. // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32). @@ -314,21 +314,21 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // PyTorch: if feat_size < high_resolution: interpolate if (feat_w < high_res_w || feat_h < high_res_h) { - // Calculate scale factor. - // Note: PyTorch 'nearest' works on arbitrary float scales. + // Calculate scale factor. + // Note: PyTorch 'nearest' works on arbitrary float scales. // ggml_upscale generally takes integer factors or target sizes depending on helper. // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). 
int scale_w = high_res_w / feat_w; // int scale_h = high_res_h / feat_h; - + // Safety check for non-integer scaling if strictly replicating - if (high_res_w % feat_w != 0) { + if (high_res_w % feat_w != 0) { LOG_WRN("%s: non-integer scaling detected\n", __func__); } // Upsample (Nearest Neighbor) // 2 is the scale factor - feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); + feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); } resized_feats.push_back(feat); } @@ -341,16 +341,16 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // D. FFN (UniversalInvertedResidual) // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm - + // 1. Expansion if (model.msfa_ffn_expand_w) { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); - + if (model.msfa_ffn_expand_bn) { cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); } - + cur = ggml_gelu(ctx0, cur); } @@ -359,7 +359,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { if (model.msfa_ffn_project_w) { // 1x1 Conv cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); - + // UniversalInvertedResidual typically has a norm after projection if (model.msfa_ffn_project_bn) { cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); @@ -369,7 +369,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // E. Final Downsample to Target Resolution (Output Resolution) // PyTorch: matches self.output_resolution (e.g. 16x16) - const int target_out_res = 16; + const int target_out_res = 16; int current_w = cur->ne[0]; if (current_w > target_out_res) { @@ -418,8 +418,8 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // We must normalize regardless, then multiply if weight exists. 
{ const float eps = 1e-6f; // Gemma3n uses 1e-6 - cur = ggml_rms_norm(ctx0, cur, eps); - + cur = ggml_rms_norm(ctx0, cur, eps); + if (model.mm_soft_emb_norm_w) { // Weight shape is (2048,) -> Element-wise broadcast multiply cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); @@ -431,7 +431,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] if (model.mm_input_proj_w) { - cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); + cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); } // 5. POST PROJECTION NORM @@ -450,4 +450,4 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { ggml_build_forward_expand(gf, cur); return gf; -} \ No newline at end of file +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index bb88d11a8a9..9970980c7bc 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -82,30 +82,30 @@ struct clip_graph_mobilenetv5 : clip_graph { ggml_cgraph * build() override; ggml_tensor * rms_norm_2d( - ggml_tensor * inp, - ggml_tensor * weight, + ggml_tensor * inp, + ggml_tensor * weight, float eps = 1e-6f); - + ggml_tensor* pad_same_2d( - ggml_tensor* inp, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - int dilation_h = 1, + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, int dilation_w = 1); - + ggml_tensor * build_edge_residual( ggml_tensor * inp, const mobilenetv5_block & block, int stride); ggml_tensor * build_inverted_residual( - ggml_tensor * inp, - const mobilenetv5_block & block, + ggml_tensor * inp, + const mobilenetv5_block & block, int stride); ggml_tensor * build_mobilenet_attn( - ggml_tensor * inp, + ggml_tensor * inp, const mobilenetv5_block & block); }; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 363873d3ff7..23cc8ffd30d 100644 --- 
a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -862,7 +862,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { } bool mtmd_decode_use_non_causal(mtmd_context * ctx) { - if (ctx->ctx_v && + if (ctx->ctx_v && (clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3 || clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3N)) { return true; } From 395d2d412bc78d1432bc6bdfe5aff4738ecd37e0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 18:43:54 +0100 Subject: [PATCH 16/20] also preserve audio tensors --- convert_hf_to_gguf.py | 162 ++++++++++++++++++--------------- gguf-py/gguf/constants.py | 39 +++++++- gguf-py/gguf/tensor_mapping.py | 61 +++++++++++++ tools/mtmd/clip.cpp | 11 ++- 4 files changed, 195 insertions(+), 78 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4f79ceb77c7..cefaaa712d9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6042,9 +6042,72 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors -@ModelBase.register("Gemma3nForConditionalGeneration", "Gemma3nVisionModel") -class Gemma3nVisionModel(MmprojModel): - """Vision encoder converter for Gemma3n using MobileNetV5 architecture""" +class ConformerAudioModel(MmprojModel): + _batch_norm_tensors: list[dict[str, Tensor]] | None = None + + @staticmethod + def is_audio_tensor(name: str): + return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ConformerAudioModel.is_audio_tensor(name): + if ".conv" in name or "_conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # skip language model tensors + if name.startswith("lfm."): + return [] + + 
# for training only + if any(p in name for p in ["audio_loss_weight"]): + return [] + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return [] + + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + + if len(self._batch_norm_tensors[bid]) < 5: + return [] + + weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] + bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] + running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] + running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] + eps = 1e-5 # default value + + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + return [ + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), + ] + + # reshape conv weights + if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): + data_torch = data_torch[:, None, None] + if "conv.depthwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[1] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) + if "conv.pointwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[2] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) + + return [(self.map_tensor_name(name), data_torch)] + + +@ModelBase.register("Gemma3nForConditionalGeneration") +class Gemma3nVisionAudioModel(ConformerAudioModel): + has_audio_encoder = True + has_vision_encoder = True # Double indexed mapping for MobileNetV5 blocks 
(not supported by tensor_mapping.py) # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py @@ -6073,19 +6136,12 @@ class Gemma3nVisionModel(MmprojModel): "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", } - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - # force n_layers to 0 in __init__() - # we have to do this because self.hparams_vision is not yet accessible for modification inside __init__() - if "n_layers" in list(keys): - return 0 - return super().find_hparam(keys, optional) - def __init__(self, *args, **kwargs): # Parent init will call find_hparam which now returns 0 for empty keys super().__init__(*args, **kwargs) assert self.hparams_vision is not None - self.hparams_vision["n_layers"] = 0 - self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size", 2048) * 4 + self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4 self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) # MobileNetV5 does not use image_mean/std @@ -6100,11 +6156,25 @@ def __init__(self, *args, **kwargs): image_size = self.hparams_vision["image_size"] self.hparams_vision["patch_size"] = image_size // image_seq_length + # remap audio hparams + assert self.hparams_audio is not None + self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"] + self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"] + self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144) + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) + + # vision 
params self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + # audio params + assert self.hparams_audio is not None + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + def tensor_force_quant(self, name, new_name, bid, n_dims): # Force quantization settings for specific tensor types if "input_projection" in name or "input_proj" in name: @@ -6127,7 +6197,9 @@ def custom_map(self, name: str) -> str: raise ValueError(f"Unknown name: {name}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused + if (ConformerAudioModel.is_audio_tensor(name)): + name = name.replace("model.audio_tower.conformer.", "conformer.layers.") + return super().modify_tensors(data_torch, name, bid) # Gemma3n uses # - model.embed_vision.* for projection layers @@ -6146,7 +6218,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] - yield (new_name, data_torch) + return [(new_name, data_torch)] @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") @@ -10088,7 +10160,7 @@ def set_gguf_parameters(self): self._add_feed_forward_length() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self._is_vision_tensor(name) or self._is_audio_tensor(name): + if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name): # skip multimodal tensors return [] @@ -10104,9 +10176,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter def _is_vision_tensor(self, name: str) -> bool: return "vision_tower" in name or "multi_modal_projector" in name - def _is_audio_tensor(self, name: str): - return any(p in 
name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) - @ModelBase.register("Lfm2Model") class LFM2ColBertModel(LFM2Model): @@ -10234,13 +10303,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Lfm2AudioForConditionalGeneration") -class LFM2AudioModel(MmprojModel): +class LFM2AudioModel(ConformerAudioModel): has_vision_encoder = False has_audio_encoder = True model_name = "Lfm2AudioEncoder" - _batch_norm_tensors: list[dict[str, Tensor]] | None = None - def get_audio_config(self) -> dict[str, Any] | None: return self.global_config.get("encoder") @@ -10254,59 +10321,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip language model tensors - if name.startswith("lfm."): - return [] - - # for training only - if any(p in name for p in ["audio_loss_weight"]): - return [] - - # for audio output - if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return [] - - # fold running_mean, running_var and eps into weight and bias for batch_norm - if "batch_norm" in name: - if self._batch_norm_tensors is None: - self._batch_norm_tensors = [{} for _ in range(self.block_count)] - assert bid is not None - self._batch_norm_tensors[bid][name] = data_torch - - if len(self._batch_norm_tensors[bid]) < 5: - return [] - - weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] - bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] - 
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] - running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] - eps = 1e-5 # default value - - a = weight / torch.sqrt(running_var + eps) - b = bias - running_mean * a - return [ - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), - ] - - # reshape conv weights - if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): - data_torch = data_torch[:, None, None] - if "conv.depthwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[1] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) - if "conv.pointwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[2] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) - - return [(self.map_tensor_name(name), data_torch)] - @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 44ca7d553cc..d0605d4d939 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -712,26 +712,37 @@ class MODEL_TENSOR(IntEnum): A_ENC_EMBD_NORM = auto() A_ENC_EMBD_TO_LOGITS = auto() A_ENC_CONV1D = auto() + A_ENC_CONV1D_NORM = auto() # gemma3n A_PRE_NORM = auto() A_POST_NORM = auto() + A_ENC_LAYER_PRE_NORM = auto() # gemma3n A_ENC_ATTN_Q = auto() A_ENC_ATTN_K = auto() A_ENC_ATTN_V = auto() + A_ENC_PER_DIM_SCALE = auto() # gemma3n A_ENC_INPUT_NORM = auto() A_ENC_OUTPUT = auto() A_ENC_OUTPUT_NORM = auto() A_ENC_FFN_UP = auto() A_ENC_FFN_NORM = auto() + A_ENC_FFN_POST_NORM = auto() # gemma3n + A_ENC_FFN_SCALE = auto() # gemma3n A_ENC_FFN_GATE = auto() A_ENC_FFN_DOWN = auto() A_ENC_FFN_UP_1 = auto() A_ENC_FFN_NORM_1 = auto() + A_ENC_FFN_POST_NORM_1 = auto() # gemma3n + 
A_ENC_FFN_SCALE_1 = auto() # gemma3n A_ENC_FFN_GATE_1 = auto() A_ENC_FFN_DOWN_1 = auto() A_MMPROJ = auto() A_MMPROJ_FC = auto() A_MM_NORM_PRE = auto() A_MM_NORM_MID = auto() + A_MM_EMBEDDING = auto() # gemma3n + A_MM_HARD_EMB_NORM = auto() # gemma3n + A_MM_SOFT_EMB_NORM = auto() # gemma3n + A_MM_INP_PROJ = auto() # gemma3n # nextn/mtp NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto() @@ -1081,9 +1092,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", - MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n - MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n - MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n + MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n + MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n @@ -1119,19 +1130,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm", MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", + MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm", MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", MODEL_TENSOR.A_POST_NORM: "a.post_ln", + MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: "a.blk.{bid}.layer_pre_norm", MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q", MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k", MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v", + MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1", MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out", MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2", MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm", + 
MODEL_TENSOR.A_ENC_FFN_POST_NORM: "a.blk.{bid}.ffn_post_norm", + MODEL_TENSOR.A_ENC_FFN_SCALE: "a.blk.{bid}.ffn_scale", MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up", MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate", MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down", MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1", + MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: "a.blk.{bid}.ffn_post_norm_1", + MODEL_TENSOR.A_ENC_FFN_SCALE_1: "a.blk.{bid}.ffn_scale_1", MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1", MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1", MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1", @@ -1139,6 +1157,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc", MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre", MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid", + MODEL_TENSOR.A_MM_INP_PROJ: "mm.a.input_projection", # gemma3n + MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n + MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n + MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n # lfm2 audio MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv", MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos", @@ -1225,19 +1247,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_EMBD_NORM, MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS, MODEL_TENSOR.A_ENC_CONV1D, + MODEL_TENSOR.A_ENC_CONV1D_NORM, MODEL_TENSOR.A_PRE_NORM, MODEL_TENSOR.A_POST_NORM, + MODEL_TENSOR.A_ENC_LAYER_PRE_NORM, MODEL_TENSOR.A_ENC_ATTN_Q, MODEL_TENSOR.A_ENC_ATTN_K, MODEL_TENSOR.A_ENC_ATTN_V, + MODEL_TENSOR.A_ENC_PER_DIM_SCALE, MODEL_TENSOR.A_ENC_INPUT_NORM, MODEL_TENSOR.A_ENC_OUTPUT, MODEL_TENSOR.A_ENC_OUTPUT_NORM, MODEL_TENSOR.A_ENC_FFN_NORM, + MODEL_TENSOR.A_ENC_FFN_POST_NORM, + MODEL_TENSOR.A_ENC_FFN_SCALE, MODEL_TENSOR.A_ENC_FFN_UP, MODEL_TENSOR.A_ENC_FFN_GATE, MODEL_TENSOR.A_ENC_FFN_DOWN, MODEL_TENSOR.A_ENC_FFN_NORM_1, + MODEL_TENSOR.A_ENC_FFN_POST_NORM_1, + MODEL_TENSOR.A_ENC_FFN_SCALE_1, MODEL_TENSOR.A_ENC_FFN_UP_1, 
MODEL_TENSOR.A_ENC_FFN_GATE_1, MODEL_TENSOR.A_ENC_FFN_DOWN_1, @@ -1254,6 +1283,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_CONV_NORM, MODEL_TENSOR.A_ENC_CONV_PW1, MODEL_TENSOR.A_ENC_CONV_PW2, + MODEL_TENSOR.A_MM_INP_PROJ, + MODEL_TENSOR.A_MM_SOFT_EMB_NORM, + MODEL_TENSOR.A_MM_EMBEDDING, + MODEL_TENSOR.A_MM_HARD_EMB_NORM, ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 396beb6221d..003d986941f 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1609,6 +1609,11 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV1D: ( "audio_tower.conv{bid}", # ultravox "conformer.pre_encode.conv.{bid}", # lfm2 + "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n + ), + + MODEL_TENSOR.A_ENC_CONV1D_NORM: ( + "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n ), MODEL_TENSOR.A_PRE_NORM: (), @@ -1621,40 +1626,64 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_ATTN_Q: ( "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_q", # lfm2 + "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n ), MODEL_TENSOR.A_ENC_ATTN_K: ( "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_k", # lfm2 + "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n ), MODEL_TENSOR.A_ENC_ATTN_V: ( "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_v", # lfm2 + "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n + ), + + MODEL_TENSOR.A_ENC_PER_DIM_SCALE: ( + "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n + ), + + MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: ( + "conformer.layers.{bid}.norm", # gemma3n ), MODEL_TENSOR.A_ENC_INPUT_NORM: ( "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox "conformer.layers.{bid}.norm_self_att", # lfm2 + "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n ), 
MODEL_TENSOR.A_ENC_OUTPUT: ( "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox "conformer.layers.{bid}.self_attn.linear_out", # lfm2 + "conformer.layers.{bid}.attention.post", # gemma3n ), MODEL_TENSOR.A_ENC_OUTPUT_NORM: ( "audio_tower.layers.{bid}.final_layer_norm", # ultravox "conformer.layers.{bid}.norm_out", # lfm2 + "conformer.layers.{bid}.attention.post_norm", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_NORM: ( "conformer.layers.{bid}.norm_feed_forward1", # lfm2 + "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_POST_NORM: ( + "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_SCALE: ( + "conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_UP: ( "audio_tower.layers.{bid}.fc1", # ultravox "conformer.layers.{bid}.feed_forward1.linear1", # lfm2 + "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_GATE: (), @@ -1662,22 +1691,35 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_FFN_DOWN: ( "audio_tower.layers.{bid}.fc2", # ultravox "conformer.layers.{bid}.feed_forward1.linear2", # lfm2 + "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_UP_1: ( "conformer.layers.{bid}.feed_forward2.linear1", # lfm2 + "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_DOWN_1: ( "conformer.layers.{bid}.feed_forward2.linear2", # lfm2 + "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n ), MODEL_TENSOR.A_ENC_FFN_NORM_1: ( "conformer.layers.{bid}.norm_feed_forward2", # lfm2 + "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: ( + "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n + ), + + MODEL_TENSOR.A_ENC_FFN_SCALE_1: ( + "conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n ), MODEL_TENSOR.A_ENC_LINEAR_POS: ( 
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2 + "conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n ), MODEL_TENSOR.A_ENC_POS_BIAS_U: ( @@ -1690,6 +1732,7 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_OUT: ( "conformer.pre_encode.out", # lfm2 + "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n ), # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors @@ -1715,22 +1758,40 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV_DW: ( "conformer.layers.{bid}.conv.depthwise_conv", # lfm2 + "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_NORM: ( "conformer.layers.{bid}.conv.batch_norm", # lfm2 + "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_PW1: ( "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2 + "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_PW2: ( "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2 + "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n ), MODEL_TENSOR.A_ENC_NORM_CONV: ( "conformer.layers.{bid}.norm_conv", # lfm2 + "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n + ), + + MODEL_TENSOR.A_MM_EMBEDDING: ( + "model.embed_audio.embedding", # gemma3n + ), + MODEL_TENSOR.A_MM_HARD_EMB_NORM: ( + "model.embed_audio.hard_embedding_norm", # gemma3n + ), + MODEL_TENSOR.A_MM_INP_PROJ: ( + "model.embed_audio.embedding_projection", # gemma3n + ), + MODEL_TENSOR.A_MM_SOFT_EMB_NORM: ( + "model.embed_audio.soft_embedding_norm", # gemma3n ), # NextN/MTP tensors for GLM4_MOE diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f1b74d866f2..be383f01a1d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1346,6 +1346,10 @@ struct clip_model_loader { model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); + if (model.proj_type == PROJECTOR_TYPE_GEMMA3N) { + hparams.n_layer = 0; // gemma3n does not use 
normal layer structure + } + // layers model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { @@ -2108,6 +2112,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params try { clip_model_loader loader(fname); + bool skip_audio = false; if (loader.has_vision) { ctx_vision = new clip_ctx(ctx_params); @@ -2116,11 +2121,15 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params if (ctx_params.warmup) { loader.warmup(*ctx_vision); } + + // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors + // we can remove this check when we implement audio support for Gemma 3N + skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3N; // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); } - if (loader.has_audio) { + if (loader.has_audio && !skip_audio) { ctx_audio = new clip_ctx(ctx_params); loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); loader.load_tensors(*ctx_audio); From 6a68b35e7e73dfeb7d177b647231a85f6dd4242a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 18:44:45 +0100 Subject: [PATCH 17/20] trailing space --- tools/mtmd/clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index be383f01a1d..81a8ff04c1f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2121,7 +2121,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params if (ctx_params.warmup) { loader.warmup(*ctx_vision); } - + // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors // we can remove this check when we implement audio support for Gemma 3N skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3N; From e842b9314a76bb4d7c346c607f08338438f48bb6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 22:16:15 +0100 Subject: [PATCH 18/20] split arch A and V --- convert_hf_to_gguf.py | 30 
+++++++++++++++++------------- gguf-py/gguf/constants.py | 13 ++++++++----- gguf-py/gguf/gguf_writer.py | 6 ++++++ tools/mtmd/clip-impl.h | 6 ++++-- tools/mtmd/clip.cpp | 18 +++++++++--------- tools/mtmd/models/mobilenetv5.cpp | 18 ++++++++---------- tools/mtmd/mtmd.cpp | 14 +++++++++----- 7 files changed, 61 insertions(+), 44 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cefaaa712d9..ead180523c8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6056,18 +6056,6 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip language model tensors - if name.startswith("lfm."): - return [] - - # for training only - if any(p in name for p in ["audio_loss_weight"]): - return [] - - # for audio output - if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return [] - # fold running_mean, running_var and eps into weight and bias for batch_norm if "batch_norm" in name: if self._batch_norm_tensors is None: @@ -6165,13 +6153,14 @@ def __init__(self, *args, **kwargs): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3N) # vision params + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV) self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) # audio params assert self.hparams_audio is not None + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA) self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) @@ -10321,6 +10310,21 @@ def set_gguf_parameters(self): self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) 
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + def modify_tensors(self, data_torch, name, bid): + # skip language model tensors + if name.startswith("lfm."): + return [] + + # for training only + if any(p in name for p in ["audio_loss_weight"]): + return [] + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return [] + + return super().modify_tensors(data_torch, name, bid) + @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index d0605d4d939..3e549883577 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -276,12 +276,13 @@ class IMatrix: DATASETS = "imatrix.datasets" class Clip: - PROJECTOR_TYPE = "clip.projector_type" - HAS_VISION_ENCODER = "clip.has_vision_encoder" - HAS_AUDIO_ENCODER = "clip.has_audio_encoder" - HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_AUDIO_ENCODER = "clip.has_audio_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" class ClipVision: + PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models IMAGE_SIZE = "clip.vision.image_size" PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size" PATCH_SIZE = "clip.vision.patch_size" @@ -307,6 +308,7 @@ class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" class ClipAudio: + PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models NUM_MEL_BINS = "clip.audio.num_mel_bins" EMBEDDING_LENGTH = "clip.audio.embedding_length" FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length" @@ -3557,7 +3559,8 @@ def get_type(val: Any) -> GGUFValueType: class VisionProjectorType: GEMMA3 = "gemma3" - GEMMA3N = "gemma3n" + GEMMA3NV = "gemma3nv" + GEMMA3NA = "gemma3na" IDEFICS3 = "idefics3" PIXTRAL = "pixtral" LLAMA4 = "llama4" diff --git 
a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a7506aa7934..7fbb78866bc 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1086,6 +1086,9 @@ def add_clip_has_audio_encoder(self, value: bool) -> None: def add_clip_projector_type(self, value: str) -> None: self.add_string(Keys.Clip.PROJECTOR_TYPE, value) + def add_clip_vision_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) + def add_vision_projection_dim(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) @@ -1168,6 +1171,9 @@ def add_vision_window_size(self, value: int) -> None: # audio models + def add_clip_audio_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipAudio.PROJECTOR_TYPE, value) + def add_audio_projection_dim(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c6ace9d81bb..dd693623a26 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -212,7 +212,8 @@ enum projector_type { PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN3VL, PROJECTOR_TYPE_GEMMA3, - PROJECTOR_TYPE_GEMMA3N, + PROJECTOR_TYPE_GEMMA3NV, + PROJECTOR_TYPE_GEMMA3NA, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_QWEN25VL, @@ -245,7 +246,8 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, - { PROJECTOR_TYPE_GEMMA3N, "gemma3n"}, + { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, + { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 81a8ff04c1f..9e941638da7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -788,7 +788,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const 
clip_image_f32 { builder = std::make_unique(ctx, img); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { builder = std::make_unique(ctx, img); } break; @@ -1151,7 +1151,7 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16) // Similar configuration to Gemma3 @@ -1346,7 +1346,7 @@ struct clip_model_loader { model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); - if (model.proj_type == PROJECTOR_TYPE_GEMMA3N) { + if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) { hparams.n_layer = 0; // gemma3n does not use normal layer structure } @@ -1564,7 +1564,7 @@ struct clip_model_loader { model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false); @@ -2124,7 +2124,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors // we can remove this check when we implement audio support for Gemma 3N - skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3N; + skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV; // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); } @@ -2967,7 +2967,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { clip_image_u8 resized_image; int sz = params.image_size; @@ -3239,7 +3239,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int scale_factor = 
ctx->model.hparams.n_merge; n_patches /= (scale_factor * scale_factor); } break; - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: { // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution // regardless of input size (see architecture description) @@ -3637,7 +3637,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("patches", patches); } break; case PROJECTOR_TYPE_GEMMA3: - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_QWEN2A: @@ -3765,7 +3765,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { // main path + deepstack paths return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); case PROJECTOR_TYPE_GEMMA3: - case PROJECTOR_TYPE_GEMMA3N: + case PROJECTOR_TYPE_GEMMA3NV: return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: return ctx->model.projection->ne[1]; diff --git a/tools/mtmd/models/mobilenetv5.cpp b/tools/mtmd/models/mobilenetv5.cpp index 78db081ea32..593afa1ddce 100644 --- a/tools/mtmd/models/mobilenetv5.cpp +++ b/tools/mtmd/models/mobilenetv5.cpp @@ -322,9 +322,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // int scale_h = high_res_h / feat_h; // Safety check for non-integer scaling if strictly replicating - if (high_res_w % feat_w != 0) { - LOG_WRN("%s: non-integer scaling detected\n", __func__); - } + GGML_ASSERT(high_res_w % feat_w == 0); // Upsample (Nearest Neighbor) // 2 is the scale factor @@ -375,12 +373,10 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { if (current_w > target_out_res) { int s = current_w / target_out_res; - if (current_w % target_out_res == 0) { - // Avg Pool: Kernel=s, Stride=s - cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); - } else { - LOG_ERR("%s: irregular downsampling stride required\n", __func__); - } + GGML_ASSERT(current_w % target_out_res == 0); + + // Avg Pool: Kernel=s, Stride=s + cur = 
ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); } @@ -395,9 +391,11 @@ ggml_cgraph * clip_graph_mobilenetv5::build() { // Input: 'cur' is [Width, Height, Channels, Batch] int W = cur->ne[0]; int H = cur->ne[1]; - int C = cur->ne[2]; // Should be 2048 + int C = cur->ne[2]; int B = cur->ne[3]; + GGML_ASSERT(C == hparams.n_embd); + // 1. Permute and Flatten to [Channels, Tokens, Batch] // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B] diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 23cc8ffd30d..b68de74296e 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -266,7 +266,7 @@ struct mtmd_context { } // set boi/eoi - if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3N) { + if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) { // ... (image embeddings) ... img_beg = ""; img_end = ""; @@ -862,11 +862,15 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { } bool mtmd_decode_use_non_causal(mtmd_context * ctx) { - if (ctx->ctx_v && - (clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3 || clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3N)) { - return true; + switch (ctx->proj_type_v()) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_YOUTUVL: + return true; + default: + return false; } - return false; } bool mtmd_decode_use_mrope(mtmd_context * ctx) { From 8f6dbbe4c1b3443455918d4be4ff27c7e955e1f9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 22:20:17 +0100 Subject: [PATCH 19/20] rm unused gemma3 func --- tools/mtmd/clip.cpp | 34 +++++++++++++++++++++------------- tools/mtmd/clip.h | 3 ++- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9e941638da7..97c83de5fb3 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3796,6 +3796,7 
@@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } int clip_is_minicpmv(const struct clip_ctx * ctx) { + // TODO: remove this function if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { return ctx->model.hparams.minicpmv_version; } @@ -3803,24 +3804,26 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) { } bool clip_is_glm(const struct clip_ctx * ctx) { + // TODO: remove this function return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; } bool clip_is_mrope(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL - || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL - || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL - || ctx->proj_type() == PROJECTOR_TYPE_GLM4V; + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + return true; + default: + return false; + } } bool clip_is_llava(const struct clip_ctx * ctx) { return ctx->model.hparams.has_llava_projector; } -bool clip_is_gemma3(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; -} - bool clip_has_vision_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_VISION; } @@ -3830,11 +3833,16 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { } bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { - return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX - || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A - || ctx->proj_type() == PROJECTOR_TYPE_GLMA - || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL - || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO; + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return true; + default: + return false; + } } bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { diff --git a/tools/mtmd/clip.h 
b/tools/mtmd/clip.h index 68a0d6e857e..79df0136ba7 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -106,7 +106,8 @@ int clip_is_minicpmv(const struct clip_ctx * ctx); bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_mrope(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); -bool clip_is_gemma3(const struct clip_ctx * ctx); +// note for contributor: this clip_is_(model) pattern is deprecated +// do NOT add new functions like this bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); From 60c23c9a532bca6d123596710fe67c2e98892b87 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 9 Jan 2026 22:27:42 +0100 Subject: [PATCH 20/20] fix alignment --- gguf-py/gguf/constants.py | 82 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3e549883577..b240e8e4a6b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -680,13 +680,13 @@ class MODEL_TENSOR(IntEnum): V_MM_SOFT_EMB_NORM = auto() # gemma3 V_MM_EMBEDDING = auto() # gemma3n V_MM_HARD_EMB_NORM = auto() # gemma3n - V_ENC_CONV_STEM = auto() # gemma3n - V_ENC_CONV_STEM_NORM = auto() # gemma3n - V_ENC_MSFA_EXP = auto() # gemma3n - V_ENC_MSFA_EXP_NORM = auto() # gemma3n - V_ENC_MSFA_PROJ = auto() # gemma3n - V_ENC_MSFA_PROJ_NORM = auto() # gemma3n - V_ENC_MSFA_NORM = auto() # gemma3n + V_ENC_CONV_STEM = auto() # gemma3n + V_ENC_CONV_STEM_NORM = auto() # gemma3n + V_ENC_MSFA_EXP = auto() # gemma3n + V_ENC_MSFA_EXP_NORM = auto() # gemma3n + V_ENC_MSFA_PROJ = auto() # gemma3n + V_ENC_MSFA_PROJ_NORM = auto() # gemma3n + V_ENC_MSFA_NORM = auto() # gemma3n V_RESMPL_POS_EMBD_K = auto() # minicpmv V_RESMPL_ATTN_Q = auto() # minicpmv V_RESMPL_ATTN_K = auto() # minicpmv @@ -710,41 +710,41 @@ class MODEL_TENSOR(IntEnum): V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm # audio (mtmd) - 
A_ENC_EMBD_POS = auto() - A_ENC_EMBD_NORM = auto() - A_ENC_EMBD_TO_LOGITS = auto() - A_ENC_CONV1D = auto() - A_ENC_CONV1D_NORM = auto() # gemma3n - A_PRE_NORM = auto() - A_POST_NORM = auto() - A_ENC_LAYER_PRE_NORM = auto() # gemma3n - A_ENC_ATTN_Q = auto() - A_ENC_ATTN_K = auto() - A_ENC_ATTN_V = auto() - A_ENC_PER_DIM_SCALE = auto() # gemma3n - A_ENC_INPUT_NORM = auto() - A_ENC_OUTPUT = auto() - A_ENC_OUTPUT_NORM = auto() - A_ENC_FFN_UP = auto() - A_ENC_FFN_NORM = auto() - A_ENC_FFN_POST_NORM = auto() # gemma3n - A_ENC_FFN_SCALE = auto() # gemma3n - A_ENC_FFN_GATE = auto() - A_ENC_FFN_DOWN = auto() - A_ENC_FFN_UP_1 = auto() - A_ENC_FFN_NORM_1 = auto() + A_ENC_EMBD_POS = auto() + A_ENC_EMBD_NORM = auto() + A_ENC_EMBD_TO_LOGITS = auto() # lfm2 + A_ENC_CONV1D = auto() + A_ENC_CONV1D_NORM = auto() # gemma3n + A_PRE_NORM = auto() + A_POST_NORM = auto() + A_ENC_LAYER_PRE_NORM = auto() # gemma3n + A_ENC_ATTN_Q = auto() + A_ENC_ATTN_K = auto() + A_ENC_ATTN_V = auto() + A_ENC_PER_DIM_SCALE = auto() # gemma3n + A_ENC_INPUT_NORM = auto() + A_ENC_OUTPUT = auto() + A_ENC_OUTPUT_NORM = auto() + A_ENC_FFN_UP = auto() + A_ENC_FFN_NORM = auto() + A_ENC_FFN_POST_NORM = auto() # gemma3n + A_ENC_FFN_SCALE = auto() # gemma3n + A_ENC_FFN_GATE = auto() + A_ENC_FFN_DOWN = auto() + A_ENC_FFN_UP_1 = auto() # lfm2, gemma3n + A_ENC_FFN_NORM_1 = auto() # lfm2, gemma3n (pre-norm) A_ENC_FFN_POST_NORM_1 = auto() # gemma3n - A_ENC_FFN_SCALE_1 = auto() # gemma3n - A_ENC_FFN_GATE_1 = auto() - A_ENC_FFN_DOWN_1 = auto() - A_MMPROJ = auto() - A_MMPROJ_FC = auto() - A_MM_NORM_PRE = auto() - A_MM_NORM_MID = auto() - A_MM_EMBEDDING = auto() # gemma3n - A_MM_HARD_EMB_NORM = auto() # gemma3n - A_MM_SOFT_EMB_NORM = auto() # gemma3n - A_MM_INP_PROJ = auto() # gemma3n + A_ENC_FFN_SCALE_1 = auto() # gemma3n + A_ENC_FFN_GATE_1 = auto() # lfm2, gemma3n + A_ENC_FFN_DOWN_1 = auto() # lfm2, gemma3n + A_MMPROJ = auto() + A_MMPROJ_FC = auto() + A_MM_NORM_PRE = auto() + A_MM_NORM_MID = auto() + A_MM_EMBEDDING = auto() 
# gemma3n + A_MM_HARD_EMB_NORM = auto() # gemma3n + A_MM_SOFT_EMB_NORM = auto() # gemma3n + A_MM_INP_PROJ = auto() # gemma3n # nextn/mtp NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto()