From 853247e43dce60326f15e04026a384f11c9dcb52 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 22 May 2026 11:08:54 +0000 Subject: [PATCH 01/11] Support MoE pad fields in GPT-OSS configs Signed-off-by: root Co-authored-by: Cursor --- aiter/aot/flydsl/moe.py | 27 ++++++++++ .../model_configs/gptoss_fp4_tuned_fmoe.csv | 50 +++++++++---------- aiter/fused_moe.py | 24 ++++++++- aiter/jit/core.py | 12 ++++- op_tests/test_moe_2stage.py | 15 ++++-- 5 files changed, 97 insertions(+), 31 deletions(-) diff --git a/aiter/aot/flydsl/moe.py b/aiter/aot/flydsl/moe.py index c7e7e95adf..09c5661f52 100644 --- a/aiter/aot/flydsl/moe.py +++ b/aiter/aot/flydsl/moe.py @@ -71,6 +71,21 @@ def _row_swiglu_limit(row: dict[str, str]) -> float: return _parse_optional_float(row.get("swiglu_limit"), "swiglu_limit") or 0.0 +def _row_optional_int(row: dict[str, str], name: str, *aliases: str) -> int: + for key in (name, *aliases): + value = row.get(key) + if value is None: + continue + value = str(value).strip() + if value == "": + return 0 + try: + return int(value) + except ValueError as e: + raise ValueError(f"{key} must be an int, got {value!r}") from e + return 0 + + def parse_csv(csv_path: str): """Parse the CSV and return a list of unique compile jobs. @@ -93,6 +108,8 @@ def parse_csv(csv_path: str): experts = int(row["expert"]) topk = int(row["topk"]) doweight_stage1 = bool(int(row.get("doweight_stage1", "0"))) + hidden_pad = _row_optional_int(row, "hidden_pad", "hiddne_pad") + intermediate_pad = _row_optional_int(row, "intermediate_pad") cu_num = int(row.get("cu_num", "0")) block_m = int(row.get("block_m", "0") or "0") act_type = row.get("act_type", "") @@ -143,6 +160,8 @@ def parse_csv(csv_path: str): "experts": experts, "topk": topk, "doweight_stage1": doweight_stage1, + "hidden_pad": hidden_pad, + "intermediate_pad": intermediate_pad, "cu_num": cu_num, "act": act, "enable_bias": enable_bias, @@ -200,6 +219,8 @@ def _precompile_to_cache( enable_bias: bool = False, stage1_fuse_quant=None, swiglu_limit: float = 0.0, + hidden_pad: int = 0, + intermediate_pad: int = 0, **kwargs, ): """Trigger MLIR compilation by calling the runtime stage1/stage2 entry points @@ -554,6 +575,8 @@ def _make_a_user(a_dtype_user_shape): a_scale_one=a_scale_one, xcd_swizzle=xcd_swizzle, swiglu_limit=swiglu_limit, + model_dim_pad=hidden_pad, + inter_dim_pad=intermediate_pad, ) _run_compiled(exe, args) @@ -723,6 +746,8 @@ def _make_a_user(a_dtype_user_shape): b_nt=b_nt, xcd_swizzle=xcd_swizzle, enable_bias=enable_bias, + model_dim_pad=hidden_pad, + inter_dim_pad=intermediate_pad, ) _run_compiled(exe, args) @@ -747,6 +772,8 @@ def compile_one_config( shape_str = ( f"{kernel_name} " f"model_dim={model_dim} inter_dim={inter_dim} " + f"hidden_pad={kwargs.get('hidden_pad', 0)} " + f"intermediate_pad={kwargs.get('intermediate_pad', 0)} " f"E={experts} topk={topk}" ) result = { diff --git a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv index e1bc608258..ccb625baf2 100644 --- a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv @@ -1,25 +1,25 @@ -cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag -256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,182.57,11418.01, -256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,356.87,11166.78, -256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,703.24,11016.73, -256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,1052.15,8262.71, -256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,1508.93,5955.64, -256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,948.24,1890.61, -256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,1384.38,1408.26, -256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,1665.09,880.78, -256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,181.84,11379.53, -256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,346.54,10857.58, -256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,643.71,10110.43, -256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,956.4,7549.71, -256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,1246.25,4969.6, -256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,1520.68,3093.83, -256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,1769.69,1872.23, -256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,1943.37,1107.06, -256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,144.56,9070.37, -256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,266.72,8400.06, -256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,473.78,7518.51, -256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,699.93,5639.12, -256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,897.88,3726.57, -256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,1082.99,2379.61, -256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,1257.9,1535.52, -256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,1397.92,1023.87, +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,hidden_pad,intermediate_pad,tflops,bw,_tag +256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,192,192,182.57,11418.01, +256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,192,192,356.87,11166.78, +256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,192,192,703.24,11016.73, +256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,192,192,1052.15,8262.71, +256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,192,192,1508.93,5955.64, +256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,192,192,948.24,1890.61, +256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,192,192,1384.38,1408.26, +256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,192,192,1665.09,880.78, +256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,192,96,181.84,11379.53, +256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,192,96,346.54,10857.58, +256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,192,96,643.71,10110.43, +256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,192,96,956.4,7549.71, +256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,192,96,1246.25,4969.6, +256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,192,96,1520.68,3093.83, +256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,192,96,1769.69,1872.23, +256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,192,96,1943.37,1107.06, +256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,192,32,144.56,9070.37, +256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,192,32,266.72,8400.06, +256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,192,32,473.78,7518.51, +256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,192,32,699.93,5639.12, +256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,192,32,897.88,3726.57, +256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,192,32,1082.99,2379.61, +256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,192,32,1257.9,1535.52, +256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,192,32,1397.92,1023.87, diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 131a92d3c7..586b7e5ad4 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -880,12 +880,27 @@ def get_2stage_cfgs( "q_type", "use_g1u1", "doweight_stage1", + "hidden_pad", + "intermediate_pad", ] + def _normalize_pad_cols(df): + if "hidden_pad" not in df.columns: + if "hiddne_pad" in df.columns: + df["hidden_pad"] = df["hiddne_pad"] + else: + df["hidden_pad"] = 0 + df["hidden_pad"] = df["hidden_pad"].fillna(0).astype(int) + if "intermediate_pad" not in df.columns: + df["intermediate_pad"] = 0 + df["intermediate_pad"] = df["intermediate_pad"].fillna(0).astype(int) + return df + def get_cfg_2stages(tune_file): import pandas as pd df = pd.read_csv(tune_file) + df = _normalize_pad_cols(df) if "_tag" in df.columns: df = df[df["_tag"].fillna("") == ""] @@ -927,6 +942,7 @@ def get_flydsl_fallback_cfgs(tune_file): _flydsl_fallback_cache[tune_file] = {} return {} df = pd.read_csv(tune_file) + df = _normalize_pad_cols(df) if "_tag" not in df.columns: _flydsl_fallback_cache[tune_file] = {} return {} @@ -969,6 +985,8 @@ def get_flydsl_fallback_cfgs(tune_file): str(q_type), use_g1u1, doweight_stage1, + hidden_pad, + intermediate_pad, ) keys_disabled = ( cu_num, @@ -984,17 +1002,19 @@ def get_flydsl_fallback_cfgs(tune_file): str(q_type), use_g1u1, doweight_stage1, + hidden_pad, + intermediate_pad, ) def MainFunc(): with open(untune_file, "a") as f: if os.path.getsize(untune_file) == 0: f.write( - "token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1" + "token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,hidden_pad,intermediate_pad" ) q_dtype_ws = q_dtype_w if q_dtype_w != torch.uint32 else "torch.int4" f.write( - f"\n{token},{model_dim},{inter_dim},{expert},{topk},{activation},{dtype},{q_dtype_a},{q_dtype_ws},{q_type},{int(use_g1u1)},{int(doweight_stage1)}" + f"\n{token},{model_dim},{inter_dim},{expert},{topk},{activation},{dtype},{q_dtype_a},{q_dtype_ws},{q_type},{int(use_g1u1)},{int(doweight_stage1)},{hidden_pad},{intermediate_pad}" ) logger.info("\033[34m Start tuning fmoe") os.system( diff --git a/aiter/jit/core.py b/aiter/jit/core.py index d42155df10..26752718d4 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -215,7 +215,14 @@ def update_config_files(self, file_path: str, merge_name: str): f"when merging '{merge_name}'." ) - _FILL_DEFAULTS = {"xbf16": 0, "run_1stage": 0, "ksplit": 0} + _FILL_DEFAULTS = { + "xbf16": 0, + "run_1stage": 0, + "ksplit": 0, + "hidden_pad": 0, + "hiddne_pad": 0, + "intermediate_pad": 0, + } all_cols = list(source_pairs[0][1].columns) for _, df in source_pairs[1:]: for c in df.columns: @@ -252,6 +259,9 @@ def update_config_files(self, file_path: str, merge_name: str): keys.append("cu_num") if "gfx" in merge_df.columns and "gfx" not in keys: keys.append("gfx") + for pad_col in ("hidden_pad", "hiddne_pad", "intermediate_pad"): + if pad_col in merge_df.columns and pad_col not in keys: + keys.append(pad_col) dedup_keys = keys + ["_tag"] if has_tag else keys duplicated_mask = merge_df.duplicated(subset=dedup_keys, keep=False) if duplicated_mask.any(): diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py index 16c3f55ea7..b9639e2eb5 100644 --- a/op_tests/test_moe_2stage.py +++ b/op_tests/test_moe_2stage.py @@ -583,7 +583,16 @@ def _str2enum(s, enum_cls): def _row_to_kwargs(row): - # csv rows store already-effective dims, so pad defaults to 0. + def _row_int(name, *aliases): + for key in (name, *aliases): + if key not in row: + continue + value = row.get(key) + if pd.isna(value) or str(value).strip() == "": + return 0 + return int(value) + return 0 + q_type = _str2enum(row["q_type"], aiter.QuantType) aq_dtype = _str2dtype(row["q_dtype_a"]) wq_dtype = _str2dtype(row["q_dtype_w"]) @@ -605,8 +614,8 @@ def _row_to_kwargs(row): WQDType=wq_dtype, use_g1u1=dtypes.str2bool(str(row["use_g1u1"])), doweight_stage1=dtypes.str2bool(str(row["doweight_stage1"])), - hidden_pad=0, - intermediate_pad=0, + hidden_pad=_row_int("hidden_pad", "hiddne_pad"), + intermediate_pad=_row_int("intermediate_pad"), preshuffle=True, ) From 6ec4f7a872b0d56f8b92f459ba228b8bfd2f42a5 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 22 May 2026 12:03:24 +0000 Subject: [PATCH 02/11] fix config --- .../model_configs/gptoss_fp4_tuned_fmoe.csv | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv index ccb625baf2..62c97933a3 100644 --- a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv @@ -15,11 +15,11 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,192,96,1520.68,3093.83, 256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,192,96,1769.69,1872.23, 256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,192,96,1943.37,1107.06, -256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,192,32,144.56,9070.37, -256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,192,32,266.72,8400.06, -256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,192,32,473.78,7518.51, -256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,192,32,699.93,5639.12, -256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,192,32,897.88,3726.57, -256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,192,32,1082.99,2379.61, -256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,192,32,1257.9,1535.52, -256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,192,32,1397.92,1023.87, +256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,192,152,144.56,9070.37, +256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,192,152,266.72,8400.06, +256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,192,152,473.78,7518.51, +256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,192,152,699.93,5639.12, +256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,192,152,897.88,3726.57, +256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,192,152,1082.99,2379.61, +256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,192,152,1257.9,1535.52, +256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,192,152,1397.92,1023.87, From 722ac537b3aedcb204a998f39405904280966929 Mon Sep 17 00:00:00 2001 From: coderfeli Date: Sat, 23 May 2026 11:08:10 +0000 Subject: [PATCH 03/11] Fix MoE padded dimension test reference Co-authored-by: Cursor --- op_tests/test_moe_2stage.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py index b9639e2eb5..a5a170e05b 100644 --- a/op_tests/test_moe_2stage.py +++ b/op_tests/test_moe_2stage.py @@ -71,6 +71,11 @@ def test_fmoe( ): if get_gfx() not in ["gfx950"] and qType in [aiter.QuantType.per_1x32]: return + assert 0 <= hidden_pad < model_dim, f"invalid hidden_pad={hidden_pad} for model_dim={model_dim}" + assert ( + 0 <= intermediate_pad < inter_dim + ), f"invalid intermediate_pad={intermediate_pad} for inter_dim={inter_dim}" + torch_quant = aiter.get_torch_quant(qType) input = torch.randn((token, model_dim), dtype=dtype) if use_g1u1: @@ -81,15 +86,27 @@ def test_fmoe( w1[:, -intermediate_pad:, :] = 0 w1[:, inter_dim - intermediate_pad : inter_dim, :] = 0 exp_bias1 = torch.clamp(torch.randn((E, inter_dim * 2), dtype=dtype), -1.0, 1.0) + # Dense torch reference still evaluates padded lanes; keep padded + # bias zero so invalid lanes do not affect activation quantization. + if intermediate_pad != 0: + exp_bias1[:, -intermediate_pad:] = 0 + exp_bias1[:, inter_dim - intermediate_pad : inter_dim] = 0 else: w1 = torch.randn((E, inter_dim, model_dim), dtype=dtype) exp_bias1 = torch.clamp(torch.randn((E * inter_dim), dtype=dtype), -1.0, 1.0) + # Dense torch reference still evaluates padded lanes; keep padded + # bias zero so invalid lanes do not affect activation quantization. + if intermediate_pad != 0: + exp_bias1.view(E, inter_dim)[:, -intermediate_pad:] = 0 w2 = torch.randn((E, model_dim, inter_dim), dtype=dtype) if intermediate_pad != 0: w2[:, :, -intermediate_pad:] = 0 if hidden_pad != 0: w2[:, -hidden_pad:, :] = 0 exp_bias2 = torch.clamp(torch.randn((E, model_dim), dtype=dtype), -1.0, 1.0) + # The padded hidden tail is outside the logical output dimension. + if hidden_pad != 0: + exp_bias2[:, -hidden_pad:] = 0 if AITER_MOE_EXPERT_BALANCE: score = torch.zeros((token, E), dtype=dtype) start_col = 0 @@ -359,9 +376,12 @@ def weight_per_128x128_quant(weight, quant_dtype): num_iters=5, num_warmup=2, ) + valid_model_dim = model_dim - hidden_pad + out2_ref_check = out2_ref[:, :valid_model_dim] + out2_ck_check = out2_ck[:, :valid_model_dim] err = checkAllclose( - out2_ref, - out2_ck, + out2_ref_check, + out2_ck_check, msg=f"ck_moe_2stages:{us2:>8.2f} us, {token*model_dim*inter_dim*3*topk*2/us2/1000/1000:>8.2f} tflops......(quant:{AQDType})", ) @@ -371,7 +391,7 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor): sim = 2 * (x * y).sum() / denominator return 1 - sim - logits_diff = calc_diff(out2_ref, out2_ck) + logits_diff = calc_diff(out2_ref_check, out2_ck_check) if logits_diff > 1e-3: logging.warning( f"logits_diff: {logits_diff} is too large, please check the implementation" From 16bd0b4f1883d39afa17ca3dd0d7bebe6fcbd1cc Mon Sep 17 00:00:00 2001 From: coderfeli Date: Sat, 23 May 2026 11:31:04 +0000 Subject: [PATCH 04/11] Apply Black formatting to MoE pad test Co-authored-by: Cursor --- op_tests/test_moe_2stage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py index a5a170e05b..88481cccda 100644 --- a/op_tests/test_moe_2stage.py +++ b/op_tests/test_moe_2stage.py @@ -71,7 +71,9 @@ def test_fmoe( ): if get_gfx() not in ["gfx950"] and qType in [aiter.QuantType.per_1x32]: return - assert 0 <= hidden_pad < model_dim, f"invalid hidden_pad={hidden_pad} for model_dim={model_dim}" + assert ( + 0 <= hidden_pad < model_dim + ), f"invalid hidden_pad={hidden_pad} for model_dim={model_dim}" assert ( 0 <= intermediate_pad < inter_dim ), f"invalid intermediate_pad={intermediate_pad} for inter_dim={inter_dim}" From 73a43d25eb73f6943a3711ae0fee06cc5227a6f1 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 May 2026 07:54:30 +0000 Subject: [PATCH 05/11] feat(moe): add bias dimension to 2stage config lookup Include bias presence as a tuning/dispatch key for fused MoE 2stage so kernels are selected per (with/without bias). Unify pad column handling via PAD_COLUMNS, migrate legacy untuned CSVs to include the new bias column, and update tuned/untuned GPT-OSS FP4 configs and tests accordingly. Signed-off-by: root --- aiter/aot/flydsl/moe.py | 60 +++++++++----- .../model_configs/gptoss_fp4_tuned_fmoe.csv | 50 +++++------ .../model_configs/gptoss_fp4_untuned_fmoe.csv | 50 +++++------ aiter/fused_moe.py | 83 ++++++++++++++----- aiter/jit/core.py | 9 +- .../gemm_moe_tune.py | 83 +++++++++++++++++++ op_tests/test_moe_2stage.py | 62 +++++++++----- 7 files changed, 281 insertions(+), 116 deletions(-) diff --git a/aiter/aot/flydsl/moe.py b/aiter/aot/flydsl/moe.py index 09c5661f52..257b56e644 100644 --- a/aiter/aot/flydsl/moe.py +++ b/aiter/aot/flydsl/moe.py @@ -71,19 +71,32 @@ def _row_swiglu_limit(row: dict[str, str]) -> float: return _parse_optional_float(row.get("swiglu_limit"), "swiglu_limit") or 0.0 -def _row_optional_int(row: dict[str, str], name: str, *aliases: str) -> int: - for key in (name, *aliases): - value = row.get(key) - if value is None: - continue - value = str(value).strip() - if value == "": - return 0 - try: - return int(value) - except ValueError as e: - raise ValueError(f"{key} must be an int, got {value!r}") from e - return 0 +def _row_optional_int(row: dict[str, str], name: str) -> int: + value = row.get(name) + if value is None: + return 0 + value = str(value).strip() + if value == "": + return 0 + try: + return int(value) + except ValueError as e: + raise ValueError(f"{name} must be an int, got {value!r}") from e + + +def _row_optional_bool(row: dict[str, str], name: str) -> bool | None: + value = row.get(name) + if value is None: + return None + value = str(value).strip() + if value == "": + return None + value_lower = value.lower() + if value_lower in ("1", "true", "t", "yes", "y"): + return True + if value_lower in ("0", "false", "f", "no", "n"): + return False + raise ValueError(f"{name} must be a bool, got {value!r}") def parse_csv(csv_path: str): @@ -108,7 +121,7 @@ def parse_csv(csv_path: str): experts = int(row["expert"]) topk = int(row["topk"]) doweight_stage1 = bool(int(row.get("doweight_stage1", "0"))) - hidden_pad = _row_optional_int(row, "hidden_pad", "hiddne_pad") + hidden_pad = _row_optional_int(row, "hidden_pad") intermediate_pad = _row_optional_int(row, "intermediate_pad") cu_num = int(row.get("cu_num", "0")) block_m = int(row.get("block_m", "0") or "0") @@ -122,15 +135,18 @@ def parse_csv(csv_path: str): dtype = row.get("dtype", "") q_dtype_w = row.get("q_dtype_w", "") swiglu_limit = _row_swiglu_limit(row) - # Cover both runtime bias choices for fp4-weight MoE. Model configs - # share kernel families, and runtime bias selection can vary by - # activation dtype/model semantics. bias_supported = ( q_type.strip().split(".")[-1] == "per_1x32" and dtype in ("torch.bfloat16", "torch.float16") and "float4_e2m1fn_x2" in q_dtype_w ) - enable_bias_options = [False, True] if bias_supported else [False] + bias = _row_optional_bool(row, "bias") + if bias is not None: + enable_bias_options = [bias] + elif bias_supported: + enable_bias_options = [False, True] + else: + enable_bias_options = [False] # Detect stage1's fuse_quant from kernel suffix to align stage2's # a2_scale shape with what runtime actually passes. @@ -759,6 +775,8 @@ def compile_one_config( experts: int, topk: int, cu_num: int = 0, + hidden_pad: int = 0, + intermediate_pad: int = 0, **kwargs, ) -> dict: """Compile one MoE kernel configuration and save to cache. @@ -772,8 +790,8 @@ def compile_one_config( shape_str = ( f"{kernel_name} " f"model_dim={model_dim} inter_dim={inter_dim} " - f"hidden_pad={kwargs.get('hidden_pad', 0)} " - f"intermediate_pad={kwargs.get('intermediate_pad', 0)} " + f"hidden_pad={hidden_pad} " + f"intermediate_pad={intermediate_pad} " f"E={experts} topk={topk}" ) result = { @@ -796,6 +814,8 @@ def compile_one_config( experts=experts, topk=topk, cu_num=cu_num, + hidden_pad=hidden_pad, + intermediate_pad=intermediate_pad, **kwargs, ) elapsed = time.time() - t0 diff --git a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv index 62c97933a3..22a2058b40 100644 --- a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv @@ -1,25 +1,25 @@ -cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,hidden_pad,intermediate_pad,tflops,bw,_tag -256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,192,192,182.57,11418.01, -256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,192,192,356.87,11166.78, -256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,192,192,703.24,11016.73, -256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,192,192,1052.15,8262.71, -256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,192,192,1508.93,5955.64, -256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,192,192,948.24,1890.61, -256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,192,192,1384.38,1408.26, -256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,192,192,1665.09,880.78, -256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,192,96,181.84,11379.53, -256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,192,96,346.54,10857.58, -256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,192,96,643.71,10110.43, -256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,192,96,956.4,7549.71, -256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,192,96,1246.25,4969.6, -256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,192,96,1520.68,3093.83, -256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,192,96,1769.69,1872.23, -256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,192,96,1943.37,1107.06, -256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,192,152,144.56,9070.37, -256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,192,152,266.72,8400.06, -256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,192,152,473.78,7518.51, -256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,192,152,699.93,5639.12, -256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,192,152,897.88,3726.57, -256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,192,152,1082.99,2379.61, -256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,192,152,1257.9,1535.52, -256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,192,152,1397.92,1023.87, +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,bias,hidden_pad,intermediate_pad,tflops,bw,_tag +256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,True,192,192,182.57,11418.01, +256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,True,192,192,356.87,11166.78, +256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,True,192,192,703.24,11016.73, +256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,True,192,192,1052.15,8262.71, +256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,True,192,192,1508.93,5955.64, +256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,True,192,192,948.24,1890.61, +256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,True,192,192,1384.38,1408.26, +256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,True,192,192,1665.09,880.78, +256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,True,192,96,181.84,11379.53, +256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,True,192,96,346.54,10857.58, +256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,True,192,96,643.71,10110.43, +256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,True,192,96,956.4,7549.71, +256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,True,192,96,1246.25,4969.6, +256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,True,192,96,1520.68,3093.83, +256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,True,192,96,1769.69,1872.23, +256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,True,192,96,1943.37,1107.06, +256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,True,192,152,144.56,9070.37, +256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,True,192,152,266.72,8400.06, +256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,True,192,152,473.78,7518.51, +256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,True,192,152,699.93,5639.12, +256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,True,192,152,897.88,3726.57, +256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,True,192,152,1082.99,2379.61, +256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,True,192,152,1257.9,1535.52, +256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,True,192,152,1397.92,1023.87, diff --git a/aiter/configs/model_configs/gptoss_fp4_untuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp4_untuned_fmoe.csv index ab6652b026..a4dc27ad9d 100644 --- a/aiter/configs/model_configs/gptoss_fp4_untuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp4_untuned_fmoe.csv @@ -1,25 +1,25 @@ -token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 -256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,bias,hidden_pad,intermediate_pad +256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96 +256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 +512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 +1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 +2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 +4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 +8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 +16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 +32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152 diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 586b7e5ad4..12494b75d9 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -13,7 +13,14 @@ from aiter import ActivationType, QuantType, dtypes from aiter import get_hip_quant as get_quant from aiter import logger -from aiter.jit.core import AITER_CONFIGS, AITER_CSRC_DIR, PY, bd_dir, mp_lock +from aiter.jit.core import ( + AITER_CONFIGS, + AITER_CSRC_DIR, + PAD_COLUMNS, + PY, + bd_dir, + mp_lock, +) from aiter.jit.utils.chip_info import get_cu_num, get_gfx from aiter.jit.utils.torch_guard import torch_compile_guard from aiter.ops.flydsl.utils import is_flydsl_available @@ -344,6 +351,7 @@ def fused_moe_( intermediate_pad, isShuffled, gate_mode, + bias=(bias1 is not None or bias2 is not None), ) block_size_M = metadata.block_m if block_size_M is None else block_size_M @@ -864,6 +872,7 @@ def get_2stage_cfgs( intermediate_pad, is_shuffled=True, gate_mode=GateMode.SEPARATED.value, + bias=False, ): gate_mode = GateMode(gate_mode) _INDEX_COLS = [ @@ -880,27 +889,38 @@ def get_2stage_cfgs( "q_type", "use_g1u1", "doweight_stage1", - "hidden_pad", - "intermediate_pad", + "bias", + *PAD_COLUMNS, ] - def _normalize_pad_cols(df): - if "hidden_pad" not in df.columns: - if "hiddne_pad" in df.columns: - df["hidden_pad"] = df["hiddne_pad"] - else: - df["hidden_pad"] = 0 - df["hidden_pad"] = df["hidden_pad"].fillna(0).astype(int) - if "intermediate_pad" not in df.columns: - df["intermediate_pad"] = 0 - df["intermediate_pad"] = df["intermediate_pad"].fillna(0).astype(int) + _BIAS_TRUE = {"true", "1", "yes", "y", "t"} + + def _parse_bias_cell(v): + if v is None: + return False + if isinstance(v, bool): + return v + if isinstance(v, (int, float)): + # NaN -> False + return False if v != v else bool(v) + return str(v).strip().lower() in _BIAS_TRUE + + def _normalize_lookup_cols(df): + for col in PAD_COLUMNS: + if col not in df.columns: + df[col] = 0 + df[col] = df[col].fillna(0).astype(int) + if "bias" not in df.columns: + df["bias"] = False + else: + df["bias"] = df["bias"].map(_parse_bias_cell).astype(bool) return df def get_cfg_2stages(tune_file): import pandas as pd df = pd.read_csv(tune_file) - df = _normalize_pad_cols(df) + df = _normalize_lookup_cols(df) if "_tag" in df.columns: df = df[df["_tag"].fillna("") == ""] @@ -942,7 +962,7 @@ def get_flydsl_fallback_cfgs(tune_file): _flydsl_fallback_cache[tune_file] = {} return {} df = pd.read_csv(tune_file) - df = _normalize_pad_cols(df) + df = _normalize_lookup_cols(df) if "_tag" not in df.columns: _flydsl_fallback_cache[tune_file] = {} return {} @@ -971,6 +991,7 @@ def get_flydsl_fallback_cfgs(tune_file): if cfg_2stages is None: cfg_2stages = get_cfg_2stages(tune_file) cu_num = get_cu_num() + bias_key = bool(bias) keys = ( cu_num, token, @@ -985,6 +1006,7 @@ def get_flydsl_fallback_cfgs(tune_file): str(q_type), use_g1u1, doweight_stage1, + bias_key, hidden_pad, intermediate_pad, ) @@ -1002,19 +1024,41 @@ def get_flydsl_fallback_cfgs(tune_file): str(q_type), use_g1u1, doweight_stage1, + bias_key, hidden_pad, intermediate_pad, ) def MainFunc(): + header = "token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,bias,hidden_pad,intermediate_pad" + # Migrate legacy untuned CSV (no `bias` column) so appended rows stay aligned. + if os.path.exists(untune_file) and os.path.getsize(untune_file) > 0: + with open(untune_file, "r") as f: + lines = f.read().splitlines() + if lines and "bias" not in lines[0].split(","): + old_cols = lines[0].split(",") + try: + insert_at = old_cols.index("doweight_stage1") + 1 + except ValueError: + insert_at = len(old_cols) - len(PAD_COLUMNS) + new_lines = [ + ",".join(old_cols[:insert_at] + ["bias"] + old_cols[insert_at:]) + ] + for line in lines[1:]: + if not line.strip(): + new_lines.append(line) + continue + parts = line.split(",") + parts = parts[:insert_at] + ["False"] + parts[insert_at:] + new_lines.append(",".join(parts)) + with open(untune_file, "w") as f: + f.write("\n".join(new_lines)) with open(untune_file, "a") as f: if os.path.getsize(untune_file) == 0: - f.write( - "token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,hidden_pad,intermediate_pad" - ) + f.write(header) q_dtype_ws = q_dtype_w if q_dtype_w != torch.uint32 else "torch.int4" f.write( - f"\n{token},{model_dim},{inter_dim},{expert},{topk},{activation},{dtype},{q_dtype_a},{q_dtype_ws},{q_type},{int(use_g1u1)},{int(doweight_stage1)},{hidden_pad},{intermediate_pad}" + f"\n{token},{model_dim},{inter_dim},{expert},{topk},{activation},{dtype},{q_dtype_a},{q_dtype_ws},{q_type},{int(use_g1u1)},{int(doweight_stage1)},{bool(bias)},{hidden_pad},{intermediate_pad}" ) logger.info("\033[34m Start tuning fmoe") os.system( @@ -1555,6 +1599,7 @@ def fused_moe_2stages( intermediate_pad, is_shuffled, gate_mode, + bias=(bias1 is not None or bias2 is not None), ) if ( quant_type == QuantType.per_1x32 diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 26752718d4..3bc43e7413 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -78,6 +78,9 @@ def mp_lock( AITER_LOG_TUNED_CONFIG = int(os.getenv("AITER_LOG_TUNED_CONFIG", 0)) +PAD_COLUMNS = ("hidden_pad", "intermediate_pad") + + # config_env start here AITER_CONFIG_GEMM_A4W4 = os.getenv( "AITER_CONFIG_GEMM_A4W4", @@ -219,9 +222,7 @@ def update_config_files(self, file_path: str, merge_name: str): "xbf16": 0, "run_1stage": 0, "ksplit": 0, - "hidden_pad": 0, - "hiddne_pad": 0, - "intermediate_pad": 0, + **{c: 0 for c in PAD_COLUMNS}, } all_cols = list(source_pairs[0][1].columns) for _, df in source_pairs[1:]: @@ -259,7 +260,7 @@ def update_config_files(self, file_path: str, merge_name: str): keys.append("cu_num") if "gfx" in merge_df.columns and "gfx" not in keys: keys.append("gfx") - for pad_col in ("hidden_pad", "hiddne_pad", "intermediate_pad"): + for pad_col in PAD_COLUMNS: if pad_col in merge_df.columns and pad_col not in keys: keys.append(pad_col) dedup_keys = keys + ["_tag"] if has_tag else keys diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py index f2a40f9c92..dc3014d9fa 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py @@ -116,6 +116,32 @@ def cosine_diff_compare(ref, res, msg="", printLog=True): return cos_diff if cos_diff >= COS_DIFF_THRESHOLD else 0.0 +def _untuned_to_bool(value): + """Parse a bool from an untuned CSV cell. Accepts True/False/0/1/empty.""" + if value is None or (isinstance(value, float) and value != value): # NaN + return False + if isinstance(value, (bool, int)): + return bool(value) + s = str(value).strip().lower() + if s in ("", "false", "0", "no", "n", "f"): + return False + if s in ("true", "1", "yes", "y", "t"): + return True + raise ValueError(f"cannot parse bool from {value!r}") + + +def _untuned_to_int(value): + """Parse an int from an untuned CSV cell. Empty/NaN -> 0.""" + if value is None or (isinstance(value, float) and value != value): # NaN + return 0 + if isinstance(value, (int, bool)): + return int(value) + s = str(value).strip() + if s == "": + return 0 + return int(float(s)) + + class FmoeTuner(TunerCommon): ARG_DEFAULTS = { **TunerCommon.ARG_DEFAULTS, @@ -1719,6 +1745,9 @@ def calculate(self, results, bpes=(1, 1, 2)): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = key if us == self.INVALID_TIME or us == self.INF_TIME: return 0, 0 @@ -1829,6 +1858,9 @@ def gen_1stage_asm_task(self, key): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = info ## asm moe 1 stage tuning get_gfx() @@ -2004,6 +2036,9 @@ def gen_2stages_asm1_task(self, key, blockMs): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = info kernels_list_csv = f"{get_asm_dir()}/fmoe_2stages/fmoe_stage1_bf16_pertoken{{quantDtype}}{{extraInfo}}_g1u1.csv" extraInfo = "" @@ -2115,6 +2150,9 @@ def gen_2stages_task(self, key, blockMs): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = info _is_a8w4 = ( @@ -2306,6 +2344,9 @@ def _gen_2stages_task_cktile(self, info, blockMs): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = info _gen_data_args_s1 = ( @@ -2424,6 +2465,9 @@ def gen_flydsl_2stages_task(self, info, blockMs): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = info if q_type != QuantType.per_1x32 or q_dtype_w != dtypes.fp4x2: @@ -2644,6 +2688,9 @@ def gen_flydsl_i4_2stages_task(self, info, blockMs): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = info if not (q_type == QuantType.per_1x32 and q_dtype_w == dtypes.i4x2): @@ -3073,12 +3120,18 @@ def tune( q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = line dtype = eval(dtype) q_dtype_a = eval(q_dtype_a) q_dtype_w = eval(q_dtype_w) q_type = eval(q_type) q_type = QuantType.per_1x128 if q_type == QuantType.per_128x128 else q_type + bias = _untuned_to_bool(bias) + hidden_pad = _untuned_to_int(hidden_pad) + intermediate_pad = _untuned_to_int(intermediate_pad) print("\nStart tuning", line) if get_gfx() not in ["gfx950"] and q_type in [aiter.QuantType.per_1x32]: print(f"{q_type} is not supported on {get_gfx()}") @@ -3101,6 +3154,9 @@ def tune( q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) tasks.extend(self.gen_2stages_asm1_task(info, blockMs)) tasks_ck.extend(self.gen_2stages_task(info, blockMs)) @@ -3243,6 +3299,9 @@ def post_process(self, results, args, topk=-1, fast_mode=False): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, ) = key import re @@ -3270,6 +3329,9 @@ def post_process(self, results, args, topk=-1, fast_mode=False): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, block_m, row_ksplit, us, @@ -3643,6 +3705,22 @@ def _act_to_fp8(x): else: return pd.DataFrame() + # Optional untuned columns: backfilled with these defaults so older untuned + # CSVs (without bias / hidden_pad / intermediate_pad) still load cleanly. + OPTIONAL_UNTUNED_DEFAULTS = { + "bias": False, + "hidden_pad": 0, + "intermediate_pad": 0, + } + + def _backfill_optional_untuned_cols(self, df): + for col, default in self.OPTIONAL_UNTUNED_DEFAULTS.items(): + if col not in df.columns: + df[col] = default + else: + df[col] = df[col].fillna(default) + return df + def pre_process(self, args): if args.all: self.get_retune_gemm_list(args) @@ -3665,6 +3743,8 @@ def pre_process(self, args): self.tunedf[untunedf_cols].apply(tuple, axis=1) ) self.untunedf = self.untunedf[~mask] + if self.untunedf is not None: + self.untunedf = self._backfill_optional_untuned_cols(self.untunedf) if __name__ == "__main__": @@ -3682,6 +3762,9 @@ def pre_process(self, args): "q_type", "use_g1u1", "doweight_stage1", + "bias", + "hidden_pad", + "intermediate_pad", ] resultList = [ "block_m", diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py index b9639e2eb5..bab81a8f63 100644 --- a/op_tests/test_moe_2stage.py +++ b/op_tests/test_moe_2stage.py @@ -64,6 +64,7 @@ def test_fmoe( doweight_stage1=False, hidden_pad=0, intermediate_pad=0, + bias=False, preshuffle=True, strict_accuracy=True, check_aot_cache=True, @@ -80,16 +81,28 @@ def test_fmoe( if intermediate_pad != 0: w1[:, -intermediate_pad:, :] = 0 w1[:, inter_dim - intermediate_pad : inter_dim, :] = 0 - exp_bias1 = torch.clamp(torch.randn((E, inter_dim * 2), dtype=dtype), -1.0, 1.0) + exp_bias1 = ( + torch.clamp(torch.randn((E, inter_dim * 2), dtype=dtype), -1.0, 1.0) + if bias + else None + ) else: w1 = torch.randn((E, inter_dim, model_dim), dtype=dtype) - exp_bias1 = torch.clamp(torch.randn((E * inter_dim), dtype=dtype), -1.0, 1.0) + exp_bias1 = ( + torch.clamp(torch.randn((E * inter_dim), dtype=dtype), -1.0, 1.0) + if bias + else None + ) w2 = torch.randn((E, model_dim, inter_dim), dtype=dtype) if intermediate_pad != 0: w2[:, :, -intermediate_pad:] = 0 if hidden_pad != 0: w2[:, -hidden_pad:, :] = 0 - exp_bias2 = torch.clamp(torch.randn((E, model_dim), dtype=dtype), -1.0, 1.0) + exp_bias2 = ( + torch.clamp(torch.randn((E, model_dim), dtype=dtype), -1.0, 1.0) + if bias + else None + ) if AITER_MOE_EXPERT_BALANCE: score = torch.zeros((token, E), dtype=dtype) start_col = 0 @@ -179,22 +192,24 @@ def weight_per_128x128_quant(weight, quant_dtype): else: a1_qt, a1_scale = torch_quant(input, quant_dtype=AQDType) - # bias dtype convert - if ( + # bias dtype convert: presence is governed by the `bias` flag (from csv). + # Only the a16w4 quant path (per_1x32, fp4 weight, bf16/fp16/fp8 input) has + # a kernel that accepts bias today; other paths (a4w4, a16wi4, ...) drop it + # to None to match historical test behavior. TODO: extend once the + # corresponding kernels expose a bias-aware variant. + if exp_bias1 is None: + exp_bias1_aiter = None + exp_bias2_aiter = None + elif ( qType == aiter.QuantType.per_1x32 and (AQDType in [dtypes.bf16, dtypes.fp16, dtypes.fp8]) and (WQDType == dtypes.fp4x2) - ): # a16w4 + ): # a16w4: kernel expects fp32 bias exp_bias1_aiter = exp_bias1.to(dtypes.fp32) exp_bias2_aiter = exp_bias2.to(dtypes.fp32) - elif ( - qType == aiter.QuantType.per_1x32 and WQDType == dtypes.i4x2 - ): # a16wi4: no bias - exp_bias1_aiter = exp_bias1 = None - exp_bias2_aiter = exp_bias2 = None else: - exp_bias1_aiter = exp_bias1 = None - exp_bias2_aiter = exp_bias2 = None + exp_bias1 = exp_bias1_aiter = None + exp_bias2 = exp_bias2_aiter = None # pre-shuffle w1_scale_aiter = w1_scale @@ -279,6 +294,7 @@ def weight_per_128x128_quant(weight, quant_dtype): getattr(w1_qt_aiter, "is_shuffled", False) or getattr(w2_qt_aiter, "is_shuffled", False), gateMode, + bias=exp_bias1_aiter is not None, ) if metadata.fuse_quant == "fp4": # Fused Swiglu MXFP4 quantizes the f32 activation directly. @@ -583,15 +599,13 @@ def _str2enum(s, enum_cls): def _row_to_kwargs(row): - def _row_int(name, *aliases): - for key in (name, *aliases): - if key not in row: - continue - value = row.get(key) - if pd.isna(value) or str(value).strip() == "": - return 0 - return int(value) - return 0 + def _row_int(name): + if name not in row: + return 0 + value = row.get(name) + if pd.isna(value) or str(value).strip() == "": + return 0 + return int(value) q_type = _str2enum(row["q_type"], aiter.QuantType) aq_dtype = _str2dtype(row["q_dtype_a"]) @@ -614,8 +628,9 @@ def _row_int(name, *aliases): WQDType=wq_dtype, use_g1u1=dtypes.str2bool(str(row["use_g1u1"])), doweight_stage1=dtypes.str2bool(str(row["doweight_stage1"])), - hidden_pad=_row_int("hidden_pad", "hiddne_pad"), + hidden_pad=_row_int("hidden_pad"), intermediate_pad=_row_int("intermediate_pad"), + bias=dtypes.str2bool(str(row.get("bias", "False"))), preshuffle=True, ) @@ -769,6 +784,7 @@ def _kw( aiter.ActivationType.Swiglu, hidden_pad=hidden_pad, intermediate_pad=intermediate_pad, + bias=True, ), extras elif triple == _PER1X32_FP4_FP4: for preshuffle in args.preshuffle: From b27c4cf9c261111671ec96e2a60678a832bf5b70 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 May 2026 08:27:01 +0000 Subject: [PATCH 06/11] fix(moe): propagate bias/pad columns through tuner and runtime cfg lookup Plumb bias, hidden_pad and intermediate_pad end-to-end so MoE configs sourced from untuned CSVs survive tuning, AOT precompile and runtime lookup without manual CSV edits or post-hoc backfills. - gemm_moe_tune: keep the three columns explicit when unpacking untuned rows and emitting tuned rows (including the no-candidate fallback); add them to the stage1/stage2 merge keys so the joined row carries them instead of producing _x/_y duplicates with NaN. - fused_moe: drop the PAD_COLUMNS shim, treat bias/pad as first-class lookup keys, and write all three to the online-tune untuned header. - aot/flydsl/moe: trust the bias column from the tuner output instead of the quant-type heuristic so only the requested variant gets precompiled. - jit/core: inline the pad column names now that the shim is gone. Signed-off-by: root --- aiter/aot/flydsl/moe.py | 16 +------- aiter/fused_moe.py | 33 +++------------- aiter/jit/core.py | 8 ++-- .../gemm_moe_tune.py | 38 +++++-------------- 4 files changed, 19 insertions(+), 76 deletions(-) diff --git a/aiter/aot/flydsl/moe.py b/aiter/aot/flydsl/moe.py index 257b56e644..8bd96fc984 100644 --- a/aiter/aot/flydsl/moe.py +++ b/aiter/aot/flydsl/moe.py @@ -131,22 +131,8 @@ def parse_csv(csv_path: str): if act_type.strip().split(".")[-1].lower() == "swiglu" else "silu" ) - q_type = row.get("q_type", "") - dtype = row.get("dtype", "") - q_dtype_w = row.get("q_dtype_w", "") swiglu_limit = _row_swiglu_limit(row) - bias_supported = ( - q_type.strip().split(".")[-1] == "per_1x32" - and dtype in ("torch.bfloat16", "torch.float16") - and "float4_e2m1fn_x2" in q_dtype_w - ) - bias = _row_optional_bool(row, "bias") - if bias is not None: - enable_bias_options = [bias] - elif bias_supported: - enable_bias_options = [False, True] - else: - enable_bias_options = [False] + enable_bias_options = [bool(_row_optional_bool(row, "bias"))] # Detect stage1's fuse_quant from kernel suffix to align stage2's # a2_scale shape with what runtime actually passes. diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 12494b75d9..26ce800fd5 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -13,14 +13,7 @@ from aiter import ActivationType, QuantType, dtypes from aiter import get_hip_quant as get_quant from aiter import logger -from aiter.jit.core import ( - AITER_CONFIGS, - AITER_CSRC_DIR, - PAD_COLUMNS, - PY, - bd_dir, - mp_lock, -) +from aiter.jit.core import AITER_CONFIGS, AITER_CSRC_DIR, PY, bd_dir, mp_lock from aiter.jit.utils.chip_info import get_cu_num, get_gfx from aiter.jit.utils.torch_guard import torch_compile_guard from aiter.ops.flydsl.utils import is_flydsl_available @@ -890,30 +883,16 @@ def get_2stage_cfgs( "use_g1u1", "doweight_stage1", "bias", - *PAD_COLUMNS, + "hidden_pad", + "intermediate_pad", ] - _BIAS_TRUE = {"true", "1", "yes", "y", "t"} - - def _parse_bias_cell(v): - if v is None: - return False - if isinstance(v, bool): - return v - if isinstance(v, (int, float)): - # NaN -> False - return False if v != v else bool(v) - return str(v).strip().lower() in _BIAS_TRUE - def _normalize_lookup_cols(df): - for col in PAD_COLUMNS: + for col in ("hidden_pad", "intermediate_pad"): if col not in df.columns: df[col] = 0 df[col] = df[col].fillna(0).astype(int) - if "bias" not in df.columns: - df["bias"] = False - else: - df["bias"] = df["bias"].map(_parse_bias_cell).astype(bool) + df["bias"] = df["bias"].eq("True") if "bias" in df.columns else False return df def get_cfg_2stages(tune_file): @@ -1040,7 +1019,7 @@ def MainFunc(): try: insert_at = old_cols.index("doweight_stage1") + 1 except ValueError: - insert_at = len(old_cols) - len(PAD_COLUMNS) + insert_at = len(old_cols) - 2 new_lines = [ ",".join(old_cols[:insert_at] + ["bias"] + old_cols[insert_at:]) ] diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 3bc43e7413..e9377be61e 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -78,9 +78,6 @@ def mp_lock( AITER_LOG_TUNED_CONFIG = int(os.getenv("AITER_LOG_TUNED_CONFIG", 0)) -PAD_COLUMNS = ("hidden_pad", "intermediate_pad") - - # config_env start here AITER_CONFIG_GEMM_A4W4 = os.getenv( "AITER_CONFIG_GEMM_A4W4", @@ -222,7 +219,8 @@ def update_config_files(self, file_path: str, merge_name: str): "xbf16": 0, "run_1stage": 0, "ksplit": 0, - **{c: 0 for c in PAD_COLUMNS}, + "hidden_pad": 0, + "intermediate_pad": 0, } all_cols = list(source_pairs[0][1].columns) for _, df in source_pairs[1:]: @@ -260,7 +258,7 @@ def update_config_files(self, file_path: str, merge_name: str): keys.append("cu_num") if "gfx" in merge_df.columns and "gfx" not in keys: keys.append("gfx") - for pad_col in PAD_COLUMNS: + for pad_col in ("hidden_pad", "intermediate_pad"): if pad_col in merge_df.columns and pad_col not in keys: keys.append(pad_col) dedup_keys = keys + ["_tag"] if has_tag else keys diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py index dc3014d9fa..9173240f11 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py @@ -116,32 +116,6 @@ def cosine_diff_compare(ref, res, msg="", printLog=True): return cos_diff if cos_diff >= COS_DIFF_THRESHOLD else 0.0 -def _untuned_to_bool(value): - """Parse a bool from an untuned CSV cell. Accepts True/False/0/1/empty.""" - if value is None or (isinstance(value, float) and value != value): # NaN - return False - if isinstance(value, (bool, int)): - return bool(value) - s = str(value).strip().lower() - if s in ("", "false", "0", "no", "n", "f"): - return False - if s in ("true", "1", "yes", "y", "t"): - return True - raise ValueError(f"cannot parse bool from {value!r}") - - -def _untuned_to_int(value): - """Parse an int from an untuned CSV cell. Empty/NaN -> 0.""" - if value is None or (isinstance(value, float) and value != value): # NaN - return 0 - if isinstance(value, (int, bool)): - return int(value) - s = str(value).strip() - if s == "": - return 0 - return int(float(s)) - - class FmoeTuner(TunerCommon): ARG_DEFAULTS = { **TunerCommon.ARG_DEFAULTS, @@ -3129,9 +3103,9 @@ def tune( q_dtype_w = eval(q_dtype_w) q_type = eval(q_type) q_type = QuantType.per_1x128 if q_type == QuantType.per_128x128 else q_type - bias = _untuned_to_bool(bias) - hidden_pad = _untuned_to_int(hidden_pad) - intermediate_pad = _untuned_to_int(intermediate_pad) + bias = bool(bias) + hidden_pad = int(hidden_pad) + intermediate_pad = int(intermediate_pad) print("\nStart tuning", line) if get_gfx() not in ["gfx950"] and q_type in [aiter.QuantType.per_1x32]: print(f"{q_type} is not supported on {get_gfx()}") @@ -3444,6 +3418,9 @@ def post_process(self, results, args, topk=-1, fast_mode=False): "q_type", "use_g1u1", "doweight_stage1", + "bias", + "hidden_pad", + "intermediate_pad", "block_m", ], how="inner", @@ -3471,6 +3448,9 @@ def post_process(self, results, args, topk=-1, fast_mode=False): q_type, use_g1u1, doweight_stage1, + bias, + hidden_pad, + intermediate_pad, 0, 0, self.INVALID_TIME, From 2aa9576442be163abf1e8a1b162777f6d5a494a6 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 May 2026 08:42:32 +0000 Subject: [PATCH 07/11] refactor(moe): simplify bias/pad column handling in csv merge - aot/flydsl/moe: inline pad/bias parsing with .get() defaults, matching the surrounding block_m/cu_num style and dropping the two over-engineered helpers. - jit/core: collapse _FILL_DEFAULTS to the only non-zero entry (bias=False); add bias to the merge dedup-key fallback so legacy untuned CSVs without the column still treat bias=True/False rows as distinct. Signed-off-by: root --- aiter/aot/flydsl/moe.py | 34 +++------------------------------- aiter/jit/core.py | 9 ++++----- 2 files changed, 7 insertions(+), 36 deletions(-) diff --git a/aiter/aot/flydsl/moe.py b/aiter/aot/flydsl/moe.py index 8bd96fc984..178ccc0d68 100644 --- a/aiter/aot/flydsl/moe.py +++ b/aiter/aot/flydsl/moe.py @@ -71,34 +71,6 @@ def _row_swiglu_limit(row: dict[str, str]) -> float: return _parse_optional_float(row.get("swiglu_limit"), "swiglu_limit") or 0.0 -def _row_optional_int(row: dict[str, str], name: str) -> int: - value = row.get(name) - if value is None: - return 0 - value = str(value).strip() - if value == "": - return 0 - try: - return int(value) - except ValueError as e: - raise ValueError(f"{name} must be an int, got {value!r}") from e - - -def _row_optional_bool(row: dict[str, str], name: str) -> bool | None: - value = row.get(name) - if value is None: - return None - value = str(value).strip() - if value == "": - return None - value_lower = value.lower() - if value_lower in ("1", "true", "t", "yes", "y"): - return True - if value_lower in ("0", "false", "f", "no", "n"): - return False - raise ValueError(f"{name} must be a bool, got {value!r}") - - def parse_csv(csv_path: str): """Parse the CSV and return a list of unique compile jobs. @@ -121,8 +93,8 @@ def parse_csv(csv_path: str): experts = int(row["expert"]) topk = int(row["topk"]) doweight_stage1 = bool(int(row.get("doweight_stage1", "0"))) - hidden_pad = _row_optional_int(row, "hidden_pad") - intermediate_pad = _row_optional_int(row, "intermediate_pad") + hidden_pad = int(row.get("hidden_pad", "0") or "0") + intermediate_pad = int(row.get("intermediate_pad", "0") or "0") cu_num = int(row.get("cu_num", "0")) block_m = int(row.get("block_m", "0") or "0") act_type = row.get("act_type", "") @@ -132,7 +104,7 @@ def parse_csv(csv_path: str): else "silu" ) swiglu_limit = _row_swiglu_limit(row) - enable_bias_options = [bool(_row_optional_bool(row, "bias"))] + enable_bias_options = [str(row.get("bias", "")).strip() == "True"] # Detect stage1's fuse_quant from kernel suffix to align stage2's # a2_scale shape with what runtime actually passes. diff --git a/aiter/jit/core.py b/aiter/jit/core.py index e9377be61e..0637139ec8 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -219,8 +219,7 @@ def update_config_files(self, file_path: str, merge_name: str): "xbf16": 0, "run_1stage": 0, "ksplit": 0, - "hidden_pad": 0, - "intermediate_pad": 0, + "bias": False, } all_cols = list(source_pairs[0][1].columns) for _, df in source_pairs[1:]: @@ -258,9 +257,9 @@ def update_config_files(self, file_path: str, merge_name: str): keys.append("cu_num") if "gfx" in merge_df.columns and "gfx" not in keys: keys.append("gfx") - for pad_col in ("hidden_pad", "intermediate_pad"): - if pad_col in merge_df.columns and pad_col not in keys: - keys.append(pad_col) + for col in ("bias", "hidden_pad", "intermediate_pad"): + if col in merge_df.columns and col not in keys: + keys.append(col) dedup_keys = keys + ["_tag"] if has_tag else keys duplicated_mask = merge_df.duplicated(subset=dedup_keys, keep=False) if duplicated_mask.any(): From 3122cdf3cd2ba9e5a4e48c28a1891f209a579470 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 May 2026 09:23:09 +0000 Subject: [PATCH 08/11] fix(moe): correct bias normalization and a4w4 bias handling pandas auto-infers True/False string columns as bool dtype; .eq("True") then compares bool to str and always returns False, causing all tuned rows to lookup as bias=False and miss the primary index. Normalize via astype(str) to cover both inferred-bool and string cases. In the 2stage test, the csv `bias` flag is the source of truth: cast to fp32 when set, instead of dropping to None for non-a16w4 quant paths. This keeps runtime enable_bias aligned with the AOT-precompiled variant and prevents AOT cache miss for a4w4 (gpt-oss) configs. Signed-off-by: root --- aiter/fused_moe.py | 5 ++++- op_tests/test_moe_2stage.py | 17 ++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 26ce800fd5..ecf29c5e4f 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -892,7 +892,10 @@ def _normalize_lookup_cols(df): if col not in df.columns: df[col] = 0 df[col] = df[col].fillna(0).astype(int) - df["bias"] = df["bias"].eq("True") if "bias" in df.columns else False + if "bias" in df.columns: + df["bias"] = df["bias"].astype(str).str.strip().eq("True") + else: + df["bias"] = False return df def get_cfg_2stages(tune_file): diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py index bb74af969d..8fa7bdb4fd 100644 --- a/op_tests/test_moe_2stage.py +++ b/op_tests/test_moe_2stage.py @@ -210,24 +210,15 @@ def weight_per_128x128_quant(weight, quant_dtype): else: a1_qt, a1_scale = torch_quant(input, quant_dtype=AQDType) - # bias dtype convert: presence is governed by the `bias` flag (from csv). - # Only the a16w4 quant path (per_1x32, fp4 weight, bf16/fp16/fp8 input) has - # a kernel that accepts bias today; other paths (a4w4, a16wi4, ...) drop it - # to None to match historical test behavior. TODO: extend once the - # corresponding kernels expose a bias-aware variant. + # bias dtype convert: `bias` flag (from csv) is the source of truth. When + # set, cast to fp32 (kernel ABI). When csv has no bias column, exp_bias1 + # is already None (default False) and this is a no-op. if exp_bias1 is None: exp_bias1_aiter = None exp_bias2_aiter = None - elif ( - qType == aiter.QuantType.per_1x32 - and (AQDType in [dtypes.bf16, dtypes.fp16, dtypes.fp8]) - and (WQDType == dtypes.fp4x2) - ): # a16w4: kernel expects fp32 bias + else: exp_bias1_aiter = exp_bias1.to(dtypes.fp32) exp_bias2_aiter = exp_bias2.to(dtypes.fp32) - else: - exp_bias1 = exp_bias1_aiter = None - exp_bias2 = exp_bias2_aiter = None # pre-shuffle w1_scale_aiter = w1_scale From 6d79e65d28162b6eeab049c71f053a6ec07d7fbb Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 May 2026 12:09:07 +0000 Subject: [PATCH 09/11] update --- .../model_configs/gptoss_fp4_tuned_fmoe.csv | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv index 22a2058b40..3f32cec7eb 100644 --- a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv @@ -1,25 +1,25 @@ -cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,bias,hidden_pad,intermediate_pad,tflops,bw,_tag -256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,True,192,192,182.57,11418.01, -256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,True,192,192,356.87,11166.78, -256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,True,192,192,703.24,11016.73, -256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,True,192,192,1052.15,8262.71, -256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,True,192,192,1508.93,5955.64, -256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,True,192,192,948.24,1890.61, -256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,True,192,192,1384.38,1408.26, -256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,True,192,192,1665.09,880.78, -256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,True,192,96,181.84,11379.53, -256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,True,192,96,346.54,10857.58, -256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,True,192,96,643.71,10110.43, -256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,True,192,96,956.4,7549.71, -256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,True,192,96,1246.25,4969.6, -256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,True,192,96,1520.68,3093.83, -256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,True,192,96,1769.69,1872.23, -256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,True,192,96,1943.37,1107.06, -256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,True,192,152,144.56,9070.37, -256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,True,192,152,266.72,8400.06, -256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,True,192,152,473.78,7518.51, -256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,True,192,152,699.93,5639.12, -256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,True,192,152,897.88,3726.57, -256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,True,192,152,1082.99,2379.61, -256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,True,192,152,1257.9,1535.52, -256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,True,192,152,1397.92,1023.87, +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,bias,hidden_pad,intermediate_pad,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag +256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,398.1113,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,532.1537,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist,0.6%,930.265,0,0,1329.68,973.88, +256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,194.4934,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.8%,107.0083,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,301.5017,0,0,192.31,12027.26, +256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,201.4751,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,112.9308,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2,5.0%,314.4059,0,0,368.84,11541.12, +256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,210.677,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,116.4949,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,327.1719,0,0,708.89,11105.22, +256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,257.677,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_fp4,0.8%,152.703,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,410.38,0,0,1130.31,8876.54, +256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,128,0,401.2668,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,207.587,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,608.8538,0,0,1523.7,6013.97, +256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,616.8445,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,450.2309,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,3.1%,1067.0754,0,0,1738.8,3466.84, +256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,1074.0444,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,782.384,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,3.1%,1856.4284,0,0,1998.92,2033.41, +256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,2004.8282,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,1463.8338,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,3.1%,3468.662,0,0,2139.64,1131.81, +256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,32,0,100.2431,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,58.3019,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,3.7%,158.545,0,0,182.86,11443.43, +256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,32,0,101.6388,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,60.53,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,3.7%,162.1688,0,0,357.54,11202.27, +256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,105.8889,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,67.0223,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,3.6%,172.9112,0,0,670.66,10533.59, +256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,149.9964,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,86.1496,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,236.146,0,0,982.14,7752.89, +256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,228.0148,flydsl_moe1_afp4_wfp4_bf16_t128x64x256_w2_bnt0,0.0%,142.5471,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,3.6%,370.5619,0,0,1251.77,4991.58, +256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,342.1093,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,267.9577,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.067,0,0,1520.67,3093.82, +256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,605.1533999999999,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,450.0292,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1055.1826,0,0,1758.39,1860.28, +256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,1068.8735,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,842.2079,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1911.0814,0,0,1941.75,1106.14, +256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,32,0,36.5525,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.8%,27.8074,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_persist,1.5%,64.3599,0,0,150.15,9421.07, +256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,38.4215,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_fp4,0.8%,30.9836,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2,1.6%,69.4051,0,0,278.47,8770.23, +256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,44.9936,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4,0.0%,37.9515,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2_sbm64,1.6%,82.9451,0,0,466.03,7395.46, +256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,60.532900000000005,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.2928,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,111.8257,0,0,691.34,5569.87, +256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,82.8044,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,91.2517,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.6%,174.0561,0,0,888.33,3686.91, +256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,122.4926,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,162.8753,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,285.3679,0,0,1083.65,2381.06, +256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,211.1064,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,288.3477,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,499.4541,0,0,1238.3,1511.6, From c4b787a7589d0f8965508fcdaf41340d1eb81b2b Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 May 2026 14:31:36 +0000 Subject: [PATCH 10/11] update --- aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv index 3f32cec7eb..fc3670f4bf 100644 --- a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv @@ -1,5 +1,4 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,bias,hidden_pad,intermediate_pad,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag -256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,398.1113,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,532.1537,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist,0.6%,930.265,0,0,1329.68,973.88, 256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,194.4934,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.8%,107.0083,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,301.5017,0,0,192.31,12027.26, 256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,201.4751,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,112.9308,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2,5.0%,314.4059,0,0,368.84,11541.12, 256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,210.677,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,116.4949,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,327.1719,0,0,708.89,11105.22, @@ -23,3 +22,4 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,82.8044,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,91.2517,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.6%,174.0561,0,0,888.33,3686.91, 256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,122.4926,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,162.8753,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,285.3679,0,0,1083.65,2381.06, 256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,211.1064,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,288.3477,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,499.4541,0,0,1238.3,1511.6, +256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,398.1113,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,532.1537,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist,0.6%,930.265,0,0,1329.68,973.88, \ No newline at end of file From 44405f8b788feab40a003a0da18494223c9a3a26 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 25 May 2026 16:03:22 +0000 Subject: [PATCH 11/11] update --- .../model_configs/gptoss_fp4_tuned_fmoe.csv | 48 +++++++++---------- .../gptoss_fp8fp4_tuned_fmoe.csv | 30 ++++++------ .../gptoss_fp8fp4_untuned_fmoe.csv | 16 +++---- 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv index fc3670f4bf..cc26a91d7e 100644 --- a/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv @@ -1,25 +1,25 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,bias,hidden_pad,intermediate_pad,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag -256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,194.4934,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.8%,107.0083,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,301.5017,0,0,192.31,12027.26, -256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,201.4751,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,112.9308,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2,5.0%,314.4059,0,0,368.84,11541.12, -256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,210.677,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,116.4949,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,327.1719,0,0,708.89,11105.22, -256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,257.677,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_fp4,0.8%,152.703,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,410.38,0,0,1130.31,8876.54, -256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,128,0,401.2668,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,207.587,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,608.8538,0,0,1523.7,6013.97, -256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,616.8445,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,450.2309,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,3.1%,1067.0754,0,0,1738.8,3466.84, -256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,1074.0444,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,782.384,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,3.1%,1856.4284,0,0,1998.92,2033.41, -256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,2004.8282,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,1463.8338,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,3.1%,3468.662,0,0,2139.64,1131.81, -256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,32,0,100.2431,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,58.3019,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,3.7%,158.545,0,0,182.86,11443.43, -256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,32,0,101.6388,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,60.53,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,3.7%,162.1688,0,0,357.54,11202.27, -256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,105.8889,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,67.0223,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,3.6%,172.9112,0,0,670.66,10533.59, -256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,149.9964,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,86.1496,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,236.146,0,0,982.14,7752.89, -256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,228.0148,flydsl_moe1_afp4_wfp4_bf16_t128x64x256_w2_bnt0,0.0%,142.5471,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,3.6%,370.5619,0,0,1251.77,4991.58, -256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,342.1093,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,267.9577,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.067,0,0,1520.67,3093.82, -256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,605.1533999999999,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,450.0292,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1055.1826,0,0,1758.39,1860.28, -256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,1068.8735,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,842.2079,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1911.0814,0,0,1941.75,1106.14, -256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,32,0,36.5525,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.8%,27.8074,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_persist,1.5%,64.3599,0,0,150.15,9421.07, -256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,38.4215,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_fp4,0.8%,30.9836,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2,1.6%,69.4051,0,0,278.47,8770.23, -256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,44.9936,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4,0.0%,37.9515,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2_sbm64,1.6%,82.9451,0,0,466.03,7395.46, -256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,60.532900000000005,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.2928,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,111.8257,0,0,691.34,5569.87, -256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,82.8044,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,91.2517,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.6%,174.0561,0,0,888.33,3686.91, -256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,122.4926,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,162.8753,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,285.3679,0,0,1083.65,2381.06, -256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,211.1064,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,288.3477,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,499.4541,0,0,1238.3,1511.6, -256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,398.1113,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,532.1537,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_persist,0.6%,930.265,0,0,1329.68,973.88, \ No newline at end of file +256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,182.57,11418.01, +256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,356.87,11166.78, +256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,703.24,11016.73, +256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,1052.15,8262.71, +256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,1508.93,5955.64, +256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,948.24,1890.61, +256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,1384.38,1408.26, +256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,1665.09,880.78, +256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,181.84,11379.53, +256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,346.54,10857.58, +256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,643.71,10110.43, +256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,956.4,7549.71, +256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,1246.25,4969.6, +256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,1520.68,3093.83, +256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,1769.69,1872.23, +256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,96,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,1943.37,1107.06, +256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,144.56,9070.37, +256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,266.72,8400.06, +256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,473.78,7518.51, +256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,699.93,5639.12, +256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,897.88,3726.57, +256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,1082.99,2379.61, +256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,1257.9,1535.52, +256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,152,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,1397.92,1023.87, diff --git a/aiter/configs/model_configs/gptoss_fp8fp4_tuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp8fp4_tuned_fmoe.csv index 132998440c..90910b9755 100644 --- a/aiter/configs/model_configs/gptoss_fp8fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp8fp4_tuned_fmoe.csv @@ -1,15 +1,15 @@ -cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag -256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,214.3544,flydsl_moe1_afp8_wfp4_bf16_t32x128x256_w2_gui_fp8,0.0%,111.631,flydsl_moe2_afp8_wfp4_bf16_t32x256x256_atomic_bnt2_persist,0.0%,325.9854,0,355.73,11131.16, -256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,235.2077,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_gui_fp8,0.0%,125.5088,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_bnt2,0.0%,360.7165,0,642.97,10072.5, -256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,312.5584,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_w2_bnt0_gui_fp8,0.0%,172.1029,flydsl_moe2_afp8_wfp4_bf16_t64x128x256_atomic_persist,0.0%,484.6613,0,957.07,7516.08, -256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,442.3352,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w2_bnt0_gui_fp8,0.0%,256.1523,flydsl_moe2_afp8_wfp4_bf16_t64x128x256_atomic_persist_sbm128,0.0%,698.4875,0,1328.17,5242.22, -256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,714.6281,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_bnt0_gui_fp8,0.0%,413.5452,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_persist_sbm128,0.0%,1128.1733,0,1644.63,3279.08, -256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1356.5778,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w2_bnt0_gui_fp8,0.0%,731.3886,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_sbm128,0.0%,2087.9664,0,1777.26,1807.92, -256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,2474.7814,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_w2_bnt0_gui_fp8,0.0%,1348.5732,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_xcd4_persist,0.0%,3823.3546,0,1941.15,1026.81, -256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,527.0715,cktile_a8w4_bm32,0.0,117.3402,cktile_a8w4_bm32,0.0,644.4117,0,0.0,0.0,flydsl_fallback -256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,286.4048,cktile_a8w4_bm32,0.0,142.6674,cktile_a8w4_bm32,0.0,429.0722,0,0.0,0.0,flydsl_fallback -256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,446.6267,cktile_a8w4_bm32,0.0,181.4069,cktile_a8w4_bm32,0.0,628.0336,0,0.0,0.0,flydsl_fallback -256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,786.0193,cktile_a8w4_bm32,0.0,275.9191,cktile_a8w4_bm32,0.0,1061.9384,0,0.0,0.0,flydsl_fallback -256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,1478.4233,cktile_a8w4_bm32,0.0,480.9397,cktile_a8w4_bm32,0.0,1959.363,0,0.0,0.0,flydsl_fallback -256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,2752.7649,cktile_a8w4_bm32,0.0,908.23,cktile_a8w4_bm32,0.0,3660.9949,0,0.0,0.0,flydsl_fallback -256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,5411.816400000001,cktile_a8w4_bm32,0.0,1750.1288,cktile_a8w4_bm32,0.0,7161.9452,0,0.0,0.0,flydsl_fallback +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,bias,hidden_pad,intermediate_pad,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag +256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,214.3544,flydsl_moe1_afp8_wfp4_bf16_t32x128x256_w2_gui_fp8,0.0%,111.631,flydsl_moe2_afp8_wfp4_bf16_t32x256x256_atomic_bnt2_persist,0.0%,325.9854,0,355.73,11131.16, +256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,235.2077,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_gui_fp8,0.0%,125.5088,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_bnt2,0.0%,360.7165,0,642.97,10072.5, +256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,312.5584,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_w2_bnt0_gui_fp8,0.0%,172.1029,flydsl_moe2_afp8_wfp4_bf16_t64x128x256_atomic_persist,0.0%,484.6613,0,957.07,7516.08, +256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,128,0,442.3352,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w2_bnt0_gui_fp8,0.0%,256.1523,flydsl_moe2_afp8_wfp4_bf16_t64x128x256_atomic_persist_sbm128,0.0%,698.4875,0,1328.17,5242.22, +256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,128,0,714.6281,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_bnt0_gui_fp8,0.0%,413.5452,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_persist_sbm128,0.0%,1128.1733,0,1644.63,3279.08, +256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,128,0,1356.5778,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w2_bnt0_gui_fp8,0.0%,731.3886,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_sbm128,0.0%,2087.9664,0,1777.26,1807.92, +256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,64,0,2474.7814,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_w2_bnt0_gui_fp8,0.0%,1348.5732,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_xcd4_persist,0.0%,3823.3546,0,1941.15,1026.81, +256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,527.0715,cktile_a8w4_bm32,0.0,117.3402,cktile_a8w4_bm32,0.0,644.4117,0,0.0,0.0,flydsl_fallback +256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,286.4048,cktile_a8w4_bm32,0.0,142.6674,cktile_a8w4_bm32,0.0,429.0722,0,0.0,0.0,flydsl_fallback +256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,446.6267,cktile_a8w4_bm32,0.0,181.4069,cktile_a8w4_bm32,0.0,628.0336,0,0.0,0.0,flydsl_fallback +256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,786.0193,cktile_a8w4_bm32,0.0,275.9191,cktile_a8w4_bm32,0.0,1061.9384,0,0.0,0.0,flydsl_fallback +256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,1478.4233,cktile_a8w4_bm32,0.0,480.9397,cktile_a8w4_bm32,0.0,1959.363,0,0.0,0.0,flydsl_fallback +256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,2752.7649,cktile_a8w4_bm32,0.0,908.23,cktile_a8w4_bm32,0.0,3660.9949,0,0.0,0.0,flydsl_fallback +256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192,32,0,5411.816400000001,cktile_a8w4_bm32,0.0,1750.1288,cktile_a8w4_bm32,0.0,7161.9452,0,0.0,0.0,flydsl_fallback diff --git a/aiter/configs/model_configs/gptoss_fp8fp4_untuned_fmoe.csv b/aiter/configs/model_configs/gptoss_fp8fp4_untuned_fmoe.csv index 3bff7e7710..5f285ffeb0 100644 --- a/aiter/configs/model_configs/gptoss_fp8fp4_untuned_fmoe.csv +++ b/aiter/configs/model_configs/gptoss_fp8fp4_untuned_fmoe.csv @@ -1,8 +1,8 @@ -token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 -512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 -32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 \ No newline at end of file +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,bias,hidden_pad,intermediate_pad +512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192 +32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,True,192,192