From 4ac118393475a8ab3a645c3bbc652d1886bcf0f1 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Fri, 20 Mar 2026 11:15:03 +0800
Subject: [PATCH 1/3] [regression] qkv_scale is empty due to offload in autoscheme.

Signed-off-by: Xin He
---
 auto_round/experimental/utils.py          |  7 +++-
 test/test_cpu/schemes/test_auto_scheme.py | 43 +++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py
index e90f9c0d5..b38f33d16 100644
--- a/auto_round/experimental/utils.py
+++ b/auto_round/experimental/utils.py
@@ -35,7 +35,12 @@ def update_parameter_data(module: torch.nn.Module, new_val: torch.Tensor, name:
     if hasattr(module, name):
         param = getattr(module, name)
         if isinstance(param, torch.nn.Parameter):
-            param.data.copy_(new_val)
+            if param.shape == new_val.shape:
+                param.data.copy_(new_val)
+            else:
+                # Re-create the parameter when shapes differ (e.g. after offload
+                # cleared it to an empty tensor).
+                module.register_parameter(name, torch.nn.Parameter(new_val.clone(), requires_grad=param.requires_grad))
         else:
             module.register_parameter(name, torch.nn.Parameter(new_val))
     else:
diff --git a/test/test_cpu/schemes/test_auto_scheme.py b/test/test_cpu/schemes/test_auto_scheme.py
index 9bd362bf3..804e58a2d 100644
--- a/test/test_cpu/schemes/test_auto_scheme.py
+++ b/test/test_cpu/schemes/test_auto_scheme.py
@@ -54,3 +54,46 @@ def test_layer_config(self, tiny_opt_model_path):
         avg_bits, _ = compute_avg_bits_for_model(model)
         print(avg_bits)
         assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3
+
+    def test_autoscheme_mxfp_with_static_kv(self, tiny_opt_model_path):
+        """MXFP4+MXFP8 AutoScheme with static_kv_dtype='fp8' should yield
+        non-zero k_scale and v_scale on every attention layer."""
+        model_name = tiny_opt_model_path
+
+        scheme = AutoScheme(
+            avg_bits=5.0,
+            options=("MXFP4", "MXFP8"),
+            nsamples=2,
+            seqlen=8,
+            ignore_scale_zp_bits=True,
+        )
+        ar = AutoRound(
+            tiny_opt_model_path,
+            scheme=scheme,
+            static_kv_dtype="fp8",
+            iters=0,
+            nsamples=2,
+            seqlen=8,
+        )
+        quantized_model, _ = ar.quantize_and_save(
+            format="fake",
+            output_dir=self.save_dir,
+        )
+
+        # After quantize_and_save, the model's attention modules should have
+        # k_scale and v_scale registered as parameters with non-zero values.
+        attn_modules = quantized_model.model.decoder.layers[0].self_attn
+        assert len(attn_modules) > 0, "No attention modules found in quantized model"
+        for name, attn in attn_modules:
+            assert hasattr(attn, "k_scale"), f"{name} missing k_scale after quantization"
+            assert hasattr(attn, "v_scale"), f"{name} missing v_scale after quantization"
+            k_val = attn.k_scale.item()
+            v_val = attn.v_scale.item()
+            assert k_val != 0.0, (
+                f"{name} k_scale is 0.0 — scale was not collected during "
+                f"calibration with AutoScheme + static_kv_dtype"
+            )
+            assert v_val != 0.0, (
+                f"{name} v_scale is 0.0 — scale was not collected during "
+                f"calibration with AutoScheme + static_kv_dtype"
+            )

From 1b262dd6b7ec6b7e5681160f6db2407a32d99760 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Fri, 20 Mar 2026 11:35:29 +0800
Subject: [PATCH 2/3] fix bug

Signed-off-by: Xin He
---
 test/test_cpu/schemes/test_auto_scheme.py | 26 ++++++++++-------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/test/test_cpu/schemes/test_auto_scheme.py b/test/test_cpu/schemes/test_auto_scheme.py
index 804e58a2d..fd97e35bd 100644
--- a/test/test_cpu/schemes/test_auto_scheme.py
+++ b/test/test_cpu/schemes/test_auto_scheme.py
@@ -82,18 +82,14 @@ def test_autoscheme_mxfp_with_static_kv(self, tiny_opt_model_path):
 
         # After quantize_and_save, the model's attention modules should have
         # k_scale and v_scale registered as parameters with non-zero values.
-        attn_modules = quantized_model.model.decoder.layers[0].self_attn
-        assert len(attn_modules) > 0, "No attention modules found in quantized model"
-        for name, attn in attn_modules:
-            assert hasattr(attn, "k_scale"), f"{name} missing k_scale after quantization"
-            assert hasattr(attn, "v_scale"), f"{name} missing v_scale after quantization"
-            k_val = attn.k_scale.item()
-            v_val = attn.v_scale.item()
-            assert k_val != 0.0, (
-                f"{name} k_scale is 0.0 — scale was not collected during "
-                f"calibration with AutoScheme + static_kv_dtype"
-            )
-            assert v_val != 0.0, (
-                f"{name} v_scale is 0.0 — scale was not collected during "
-                f"calibration with AutoScheme + static_kv_dtype"
-            )
+        attn = quantized_model.model.decoder.layers[0].self_attn
+        assert hasattr(attn, "k_scale"), "missing k_scale after quantization"
+        assert hasattr(attn, "v_scale"), " missing v_scale after quantization"
+        k_val = attn.k_scale.item()
+        v_val = attn.v_scale.item()
+        assert k_val != 0.0, (
+            "k_scale is 0.0 — scale was not collected during " "calibration with AutoScheme + static_kv_dtype"
+        )
+        assert v_val != 0.0, (
+            "v_scale is 0.0 — scale was not collected during " "calibration with AutoScheme + static_kv_dtype"
+        )

From b23e634d49d4306e40dc88430715c124437fa72f Mon Sep 17 00:00:00 2001
From: Xin He
Date: Fri, 20 Mar 2026 13:28:38 +0800
Subject: [PATCH 3/3] update ut

Signed-off-by: Xin He
---
 test/test_cpu/schemes/test_auto_scheme.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/test/test_cpu/schemes/test_auto_scheme.py b/test/test_cpu/schemes/test_auto_scheme.py
index fd97e35bd..fe7b18511 100644
--- a/test/test_cpu/schemes/test_auto_scheme.py
+++ b/test/test_cpu/schemes/test_auto_scheme.py
@@ -57,9 +57,7 @@ def test_layer_config(self, tiny_opt_model_path):
 
     def test_autoscheme_mxfp_with_static_kv(self, tiny_opt_model_path):
         """MXFP4+MXFP8 AutoScheme with static_kv_dtype='fp8' should yield
-        non-zero k_scale and v_scale on every attention layer."""
-        model_name = 
tiny_opt_model_path - + non-zero k_scale and v_scale on the first attention layer.""" scheme = AutoScheme( avg_bits=5.0, options=("MXFP4", "MXFP8"), @@ -74,6 +72,7 @@ def test_autoscheme_mxfp_with_static_kv(self, tiny_opt_model_path): iters=0, nsamples=2, seqlen=8, + disable_opt_rtn=True, ) quantized_model, _ = ar.quantize_and_save( format="fake", @@ -84,7 +83,7 @@ def test_autoscheme_mxfp_with_static_kv(self, tiny_opt_model_path): # k_scale and v_scale registered as parameters with non-zero values. attn = quantized_model.model.decoder.layers[0].self_attn assert hasattr(attn, "k_scale"), "missing k_scale after quantization" - assert hasattr(attn, "v_scale"), " missing v_scale after quantization" + assert hasattr(attn, "v_scale"), "missing v_scale after quantization" k_val = attn.k_scale.item() v_val = attn.v_scale.item() assert k_val != 0.0, (