From 8d7f05d1073ec2feb83425cc062cdca7b588dd72 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 20 May 2026 05:59:22 +0000 Subject: [PATCH] flydsl enable --- .../a8w8_blockscale_tuned_fmoe_ds_v3.csv | 798 ++--- .../a8w8_blockscale_untuned_fmoe_ds_v3.csv | 48 + aiter/fused_moe.py | 7 +- .../flydsl/kernels/moe_blockscale_2stage.py | 2990 +++++++++++++++++ aiter/ops/flydsl/moe_kernels.py | 391 ++- .../gemm_moe_tune.py | 245 ++ 6 files changed, 4093 insertions(+), 386 deletions(-) create mode 100644 aiter/ops/flydsl/kernels/moe_blockscale_2stage.py diff --git a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv index 94ae33ea28..98c29c23ab 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_tuned_fmoe_ds_v3.csv @@ -1,382 +1,416 @@ -cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag -80,2048,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0, -80,4096,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0, 
-80,8192,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0, -80,16384,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0, -80,32768,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0, -80,2048,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98, -80,4096,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98, 
-80,8192,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98, -80,16384,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98, -80,32768,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98, -80,2048,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23, -80,4096,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23, 
-80,8192,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23, -80,16384,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23, -80,32768,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23, -80,2048,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77, -80,4096,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77, 
-80,8192,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77, -80,16384,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77, -80,32768,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77, -80,2048,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39, -80,4096,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39, 
-80,8192,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39, -80,16384,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39, -80,32768,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39, -80,2048,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36, -80,4096,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36, 
-80,8192,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36, -80,16384,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36, -80,32768,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36, -80,2048,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52, -80,4096,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52, 
-80,8192,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52, -80,16384,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52, -80,32768,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52, -80,2048,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71, -80,4096,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71, 
-80,8192,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71, -80,16384,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71, -80,32768,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71, -80,1,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,14.7181,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,8.8578,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,23.5759,0,1.6,12809.78, -80,2,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,16.1962,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,10.9481,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,27.1443,0,2.78,11126.26, 
-80,4,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,22.6085,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,21.8513,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,44.4598,0,3.4,6793.53, -80,8,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,26.7393,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,30.591,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,57.3303,0,5.27,5269.26, -80,16,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,44.9474,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,44.429,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,89.3764,0,6.76,3381.05, -80,32,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,130.6754,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,130.6754,1,9.24,2314.0, -80,40,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,132.1174,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,132.1174,1,11.43,2289.49, -80,64,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,135.0415,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,135.0415,1,17.89,2242.1, 
-80,128,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,136.8626,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,136.8626,1,35.3,2218.01, -80,256,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,144.1889,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,144.1889,1,67.02,2116.22, -80,512,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,211.5621,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,211.5621,1,91.36,1457.17, -80,1024,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,290.2352,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,290.2352,1,133.18,1083.85, -80,2048,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,553.26,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,553.26,1,139.73,591.32, -80,4096,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,553.26,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,553.26,1,139.73,591.32, -80,8192,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,553.26,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,553.26,1,139.73,591.32, 
-80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81, -80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81, -80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81, -80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81, 
-80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01, -80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01, -80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01, -80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01, 
-80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64, -80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64, -80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64, 
-80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64, -80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95, -80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95, 
-80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95, -80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95, -80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18, 
-80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18, -80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18, -80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18, -80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62, 
-80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62, -80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62, -80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62, -80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88, 
-80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88, -80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88, -80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88, -80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86, -80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86, 
-80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86, -80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86, -80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86, -80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27, -80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27, 
-80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27, -80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27, -80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27, -80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97, -80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97, 
-80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97, -80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97, -80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97, -80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33, 
-80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33, -80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33, -80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33, -80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33, 
-80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43, -80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43, -80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43, -80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43, -80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43, 
-80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9, -80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9, -80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9, -80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9, -80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9, 
-80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39, -80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39, -80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39, -80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39, -80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39, 
-80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29, -80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29, -80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29, -80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29, -80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29, 
-80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63, -80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63, -80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63, -80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63, -80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63, 
-80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55, -80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55, -80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55, -80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55, -80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55, 
-80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35, -80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35, -80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35, -80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35, 
-80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35, -80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58, -80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58, -80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58, 
-80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58, -80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58, -80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8, -80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8, 
-80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8, -80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8, -80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8, -80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21, 
-80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21, -80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21, -80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21, -80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21, -80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67, 
-80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67, -80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67, -80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67, -80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67, -80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48, 
-80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48, -80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48, -80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48, -80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48, -80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57, 
-80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57, -80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57, -80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89, 
-80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89, -80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89, -80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06, 
-80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06, -80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06, 
-80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54, -80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54, 
-80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54, -80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99, -80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99, 
-80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99, -80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59, -80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59, 
-80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59, -80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,39.3207,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,12.9446,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,52.2653,0,1.69,26964.5, -80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,40.8689,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,18.1489,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,59.0178,0,2.98,23879.73, -80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,46.2328,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.7%,28.8818,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,75.1146,0,4.69,18762.96, -80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,64.2093,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,1.8%,45.801,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,110.0103,0,6.41,12812.06, 
-80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,116.5659,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,74.8227,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,191.3886,0,7.36,7365.28, -80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,192.7713,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,6.2%,126.9169,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,319.6882,0,8.82,4410.47, -80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,412.7488,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,412.7488,1,13.66,3417.73, -80,128,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,285.3733,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,6.3%,202.3184,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,487.6917,0,23.12,2895.35, -80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,295.5849,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,214.8909,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,510.4758,0,44.17,2771.51, 
-80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,718.9196,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,718.9196,1,125.46,1990.91, -80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1195.0627,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1195.0627,1,150.94,1216.11, -80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1195.0627,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1195.0627,1,150.94,1216.11, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1195.0627,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1195.0627,1,150.94,1216.11, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,723.0851,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,723.0851,1,124.74,1979.44, -80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,723.0851,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,723.0851,1,124.74,1979.44, -80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92, 
-80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92, -80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92, -80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92, -80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92, -80,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,39.59,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,13.014,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,52.604,0,1.88,26895.53, 
-80,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,41.4577,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,19.353,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,60.8107,0,3.26,23266.2, -80,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,48.5685,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,32.2906,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,80.8591,0,4.9,17498.06, -80,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,72.5315,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,52.4907,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,125.0222,0,6.34,11317.7, -80,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,122.8339,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,81.6731,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,204.507,0,7.75,6919.74, -80,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,201.5918,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,7.5%,129.3064,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,330.8982,0,9.58,4277.69, 
-80,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,430.9915,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,430.9915,1,14.71,3285.84, -80,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,285.6053,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,3.7%,204.5325,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,490.1378,0,25.88,2892.13, -80,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,302.4733,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,2.5%,221.0756,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,523.5489,0,48.45,2712.82, -80,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,826.1396,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,826.1396,1,122.82,1739.19, -80,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1278.9473,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1278.9473,1,158.68,1140.65, -80,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1278.9473,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1278.9473,1,158.68,1140.65, 
-80,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1278.9473,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1278.9473,1,158.68,1140.65, -80,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35, -80,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35, -80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29, -80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29, -80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29, 
-80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29, -80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7, -80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7, -80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7, 
-80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7, -80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6, -80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6, 
-80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6, -80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6, -80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08, 
-80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08, -80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08, -80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08, 
-80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66, -80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66, -80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66, -80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66, 
-80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27, -80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27, -80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27, -80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27, 
-80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17, -80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17, -80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17, -80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17, 
-80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7, -80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7, -80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7, -80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7, -80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7, 
-80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25, -80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25, -80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25, 
-80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7, -80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7, -80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7, 
-80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4, -80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4, 
-80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4, -80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23, 
-80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23, -80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23, -80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18, 
-80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18, -80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18, -80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77, 
-80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77, -80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77, -80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99, 
-80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99, -80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99, -80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75, -80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75, -80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75, 
-80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75, -80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75, -80,2048,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78, -80,4096,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78, -80,8192,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78, -80,16384,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78, 
-80,32768,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78, -80,2048,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26, -80,4096,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26, -80,8192,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26, -80,16384,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26, -80,32768,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26, -256,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35, 
-256,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35, -256,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35, -256,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35, -256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89, 
-256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89, -256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89, -256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89, -256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89, 
-256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,80.9637,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,59.914,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,140.8777,0,17.15,4290.05, -256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,84.98,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,66.4795,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,151.4595,0,31.9,3992.92, -256,128,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,88.6975,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,86.5349,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,175.2324,0,55.15,3455.71, -256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,88.5757,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,141.8788,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,230.4545,0,83.87,2634.47, 
-256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,93.59,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,247.6998,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,341.2898,0,113.26,1788.13, -256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38, -256,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38, -256,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38, 
-256,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38, -256,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38, -256,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38, -256,1,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0, -256,2,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0, -256,4,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0, 
-256,8,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0, -256,16,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0, -256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,77.4907,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,77.4907,1,10.23,18259.78, -256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,77.4907,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,77.4907,1,10.23,18259.78, -256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,77.4907,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,77.4907,1,10.23,18259.78, -256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,77.4907,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,77.4907,1,10.23,18259.78, -256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.1678,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,41.5098,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.6776,0,12.51,12510.3, 
-256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,158.4834,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,158.4834,1,17.78,8896.67, -256,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,212.9873,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,212.9873,1,26.47,6623.22, -256,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,241.6039,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,241.6039,1,46.66,5844.44, -256,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,249.5786,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,249.5786,1,90.35,5668.72, -256,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,260.9691,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,260.9691,1,172.81,5442.39, -256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61, -256,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61, -256,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61, 
-256,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61, -256,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61, -256,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61, -256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,278.8859,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,141.0514,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,0.0,419,26.85,6715.17, -256,1,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,45.8865,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,10.9563,moe_ck2stages_gemm2_128x16x128x128_1x2_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,56.8428,0,3.49,49779.46, -256,2,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,62.7228,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,62.7228,1,6.32,45113.19, -256,4,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,78.6512,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,78.6512,1,10.08,35977.43, 
-256,8,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,128.1741,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,128.1741,1,12.37,22077.43, -256,16,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,211.2661,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,211.2661,1,15.01,13395.08, -256,32,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,218.1954,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,110.1563,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.2%,328.3517,0,19.31,8619.63, -256,64,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,281.8906,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,136.8115,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.0%,418.7021,0,30.29,6761.27, -256,128,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,310.6027,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,161.3951,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,17.0%,471.9978,0,53.74,6000.74, 
-256,256,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,343.698,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,172.001,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.9%,515.699,0,98.38,5497.56, -256,512,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,358.796,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,181.7047,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.8%,540.5007,0,187.73,5255.48, -256,1024,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,673.6803,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,673.6803,1,301.24,4232.87, -256,2048,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,873.5909,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,873.5909,1,464.6,3289.44, -256,4096,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1294.01,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1294.01,1,627.31,2254.75, -256,8192,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2305.846,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,2305.846,1,704.08,1303.53, 
+cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,xbf16,_tag +80,2048,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0,0, +80,4096,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0,0, +80,8192,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0,0, +80,16384,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0,0, 
+80,32768,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0,0, +80,2048,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98,0, +80,4096,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98,0, +80,8192,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98,0, +80,16384,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98,0, 
+80,32768,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98,0, +80,2048,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23,0, +80,4096,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23,0, +80,8192,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23,0, +80,16384,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23,0, 
+80,32768,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23,0, +80,2048,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77,0, +80,4096,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77,0, +80,8192,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77,0, +80,16384,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77,0, 
+80,32768,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.5184,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,122.9113,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,264.4297,0,73.09,1173.77,0, +80,2048,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39,0, +80,4096,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39,0, +80,8192,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39,0, +80,16384,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39,0, 
+80,32768,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39,0, +80,2048,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36,0, +80,4096,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36,0, +80,8192,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36,0, +80,16384,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36,0, 
+80,32768,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36,0, +80,2048,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52,0, +80,4096,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52,0, +80,8192,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52,0, +80,16384,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52,0, 
+80,32768,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52,0, +80,2048,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71,0, +80,4096,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71,0, +80,8192,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71,0, +80,16384,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71,0, 
+80,32768,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71,0, +80,1,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,14.7181,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,8.8578,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,23.5759,0,1.6,12809.78,0, +80,2,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,16.1962,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,10.9481,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,27.1443,0,2.78,11126.26,0, +80,4,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,22.6085,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,21.8513,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,44.4598,0,3.4,6793.53,0, +80,8,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,26.7393,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,30.591,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,57.3303,0,5.27,5269.26,0, 
+80,16,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,44.9474,moe_ck2stages_gemm1_256x16x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,44.429,moe_ck2stages_gemm2_64x16x64x64_1x1_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.1%,89.3764,0,6.76,3381.05,0, +80,32,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,130.6754,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,130.6754,1,9.24,2314.0,0, +80,40,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,132.1174,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,132.1174,1,11.43,2289.49,0, +80,64,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,135.0415,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,135.0415,1,17.89,2242.1,0, +80,128,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,136.8626,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,136.8626,1,35.3,2218.01,0, +80,256,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,144.1889,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,144.1889,1,67.02,2116.22,0, +80,512,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,211.5621,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,211.5621,1,91.36,1457.17,0, 
+80,1024,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,290.2352,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,290.2352,1,133.18,1083.85,0, +80,2048,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,553.26,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,553.26,1,139.73,591.32,0, +80,4096,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,553.26,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,553.26,1,139.73,591.32,0, +80,8192,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,553.26,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0.0%,553.26,1,139.73,591.32,0, +80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81,0, +80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81,0, 
+80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81,0, +80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81,0, +80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81,0, +80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01,0, +80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01,0, 
+80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01,0, +80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01,0, +80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01,0, +80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64,0, 
+80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64,0, +80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64,0, +80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64,0, +80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64,0, 
+80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95,0, +80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95,0, +80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95,0, +80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95,0, 
+80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95,0, +80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18,0, +80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18,0, +80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18,0, 
+80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18,0, +80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18,0, +80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62,0, +80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62,0, +80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62,0, 
+80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62,0, +80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62,0, +80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88,0, +80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88,0, +80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88,0, 
+80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88,0, +80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88,0, +80,2048,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86,0, +80,4096,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86,0, +80,8192,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86,0, 
+80,16384,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86,0, +80,32768,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86,0, +80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27,0, +80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27,0, +80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27,0, 
+80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27,0, +80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27,0, +80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97,0, +80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97,0, +80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97,0, 
+80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97,0, +80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97,0, +80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33,0, +80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33,0, 
+80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33,0, +80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33,0, +80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33,0, +80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43,0, 
+80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43,0, +80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43,0, +80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43,0, +80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43,0, +80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9,0, 
+80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9,0, +80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9,0, +80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9,0, +80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9,0, +80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39,0, 
+80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39,0, +80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39,0, +80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39,0, +80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39,0, +80,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29,0, 
+80,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29,0, +80,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29,0, +80,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29,0, +80,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29,0, +80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63,0, 
+80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63,0, +80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63,0, +80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63,0, +80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63,0, +80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55,0, 
+80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55,0, +80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55,0, +80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55,0, +80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55,0, +80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35,0, 
+80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35,0, +80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35,0, +80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35,0, +80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,556.9903,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,555.6941,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1112.6844,0,81.06,1286.35,0, 
+80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58,0, +80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58,0, +80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58,0, +80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58,0, 
+80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58,0, +80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8,0, +80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8,0, +80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8,0, 
+80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8,0, +80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8,0, +80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21,0, +80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21,0, 
+80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21,0, +80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21,0, +80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21,0, +80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67,0, +80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67,0, 
+80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67,0, +80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67,0, +80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67,0, +80,2048,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48,0, +80,4096,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48,0, 
+80,8192,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48,0, +80,16384,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48,0, +80,32768,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48,0, +80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57,0, +80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57,0, 
+80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57,0, +80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57,0, +80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89,0, +80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89,0, 
+80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89,0, +80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89,0, +80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06,0, 
+80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06,0, +80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06,0, +80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06,0, 
+80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54,0, +80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54,0, +80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54,0, 
+80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54,0, +80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99,0, +80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99,0, +80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99,0, 
+80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99,0, +80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59,0, +80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59,0, +80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59,0, 
+80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59,0, +80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,39.3207,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,12.9446,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,52.2653,0,1.69,26964.5,0, +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,40.8689,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,18.1489,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,59.0178,0,2.98,23879.73,0, +80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,46.2328,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.7%,28.8818,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,75.1146,0,4.69,18762.96,0, +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,64.2093,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,1.8%,45.801,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,110.0103,0,6.41,12812.06,0, 
+80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,116.5659,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,74.8227,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,191.3886,0,7.36,7365.28,0, +80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,192.7713,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,6.2%,126.9169,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,319.6882,0,8.82,4410.47,0, +80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,412.7488,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,412.7488,1,13.66,3417.73,0, +80,128,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,285.3733,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,6.3%,202.3184,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,487.6917,0,23.12,2895.35,0, +80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,295.5849,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,214.8909,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,510.4758,0,44.17,2771.51,0, 
+80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,718.9196,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,718.9196,1,125.46,1990.91,0, +80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1195.0627,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1195.0627,1,150.94,1216.11,0, +80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1195.0627,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1195.0627,1,150.94,1216.11,0, +80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1195.0627,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1195.0627,1,150.94,1216.11,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,723.0851,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,723.0851,1,124.74,1979.44,0, +80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,723.0851,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,723.0851,1,124.74,1979.44,0, +80,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92,0, 
+80,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92,0, +80,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92,0, +80,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92,0, +80,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92,0, +80,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,39.59,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,13.014,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,52.604,0,1.88,26895.53,0, 
+80,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,41.4577,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,19.353,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.2%,60.8107,0,3.26,23266.2,0, +80,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,48.5685,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,32.2906,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,80.8591,0,4.9,17498.06,0, +80,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,72.5315,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,52.4907,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,125.0222,0,6.34,11317.7,0, +80,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,122.8339,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,81.6731,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,204.507,0,7.75,6919.74,0, +80,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,201.5918,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,7.5%,129.3064,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,330.8982,0,9.58,4277.69,0, 
+80,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,430.9915,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,430.9915,1,14.71,3285.84,0, +80,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,285.6053,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,3.7%,204.5325,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,490.1378,0,25.88,2892.13,0, +80,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,302.4733,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,2.5%,221.0756,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,523.5489,0,48.45,2712.82,0, +80,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,826.1396,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,826.1396,1,122.82,1739.19,0, +80,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1278.9473,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1278.9473,1,158.68,1140.65,0, +80,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1278.9473,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1278.9473,1,158.68,1140.65,0, 
+80,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,1278.9473,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,1278.9473,1,158.68,1140.65,0, +80,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35,0, +80,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35,0, +80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29,0, +80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29,0, +80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29,0, 
+80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29,0, +80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7,0, +80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7,0, +80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7,0, 
+80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7,0, +80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6,0, +80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6,0, 
+80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6,0, +80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6,0, +80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08,0, 
+80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08,0, +80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08,0, +80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08,0, 
+80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66,0, +80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66,0, +80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66,0, +80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66,0, 
+80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27,0, +80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27,0, +80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27,0, +80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27,0, 
+80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17,0, +80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17,0, +80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17,0, +80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17,0, 
+80,2048,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7,0, +80,4096,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7,0, +80,8192,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7,0, +80,16384,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7,0, +80,32768,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7,0, 
+80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25,0, +80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25,0, +80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25,0, 
+80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7,0, +80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7,0, +80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7,0, 
+80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4,0, +80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4,0, 
+80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1027.6111,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,841.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1869.5469,0,96.49,1519.4,0, +80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23,0, 
+80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23,0, +80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23,0, +80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18,0, 
+80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18,0, +80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18,0, +80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77,0, 
+80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77,0, +80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77,0, +80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99,0, 
+80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99,0, +80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99,0, +80,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75,0, +80,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75,0, +80,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75,0, 
+80,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75,0, +80,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75,0, +80,2048,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78,0, +80,4096,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78,0, +80,8192,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78,0, +80,16384,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78,0, 
+80,32768,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78,0, +80,2048,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26,0, +80,4096,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26,0, +80,8192,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26,0, +80,16384,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26,0, +80,32768,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,3226.5114,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512E,5.3%,0.0,Null,0.0%,3226.5114,1,279.54,457.26,0, +256,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35,0, 
+256,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35,0, +256,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35,0, +256,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35,0, +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89,0, 
+256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89,0, +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89,0, +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89,0, +256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89,0, 
+256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,80.9637,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,59.914,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,140.8777,0,17.15,4290.05,0, +256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,84.98,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,66.4795,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,151.4595,0,31.9,3992.92,0, +256,128,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,88.6975,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,86.5349,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,175.2324,0,55.15,3455.71,0, +256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,88.5757,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,141.8788,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,230.4545,0,83.87,2634.47,0, 
+256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,93.59,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,247.6998,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,341.2898,0,113.26,1788.13,0, +256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38,0, +256,2048,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38,0, +256,4096,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38,0, 
+256,8192,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38,0, +256,16384,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38,0, +256,32768,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,126.8404,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,473.289,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,600.1294,0,128.82,1027.38,0, +256,1,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0,0, +256,2,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0,0, +256,4,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0,0, 
+256,8,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0,0, +256,16,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0,0, +256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,23.7323,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs_kb7,0.0%,12.7284,flydsl_moe2_afp8_wfp8_bf16_t32x128x256_qbs_atomic_w3,0.6%,36.4607,0,0.0,2.72,38803, +256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,32.6176,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs_kb7_w4,0.4%,14.0722,flydsl_moe2_afp8_wfp8_bf16_t16x128x256_qbs_reduce,0.3%,46.6898,0,0.0,4.24,30302, +256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,38.9565,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs_kb7_w3,0.4%,19.1858,flydsl_moe2_afp8_wfp8_bf16_t16x256x256_qbs_reduce_w3,0.2%,58.1423,0,0.0,6.82,24334, +256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,75.4008,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,75.4008,1,1.0,10.51,18765, +256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,113.0484,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,113.0484,1,1.0,14.02,12517, +256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,171.0952,_ZN5aiter47fmoe_bf16_blockscaleBf16_g1u1_vs_ps_silu_32x256E,0.0%,0.0,,0.0%,171.0952,1,1.0,18.53,8273, 
+256,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,221.0328,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x256E,0.0%,0.0,,0.0%,221.0328,1,1.0,28.69,6407, +256,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,245.7126,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x256E,0.0%,0.0,,0.0%,245.7126,1,1.0,51.62,5769, +256,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,185.6977,flydsl_moe1_afp8_wfp8_bf16_t16x256x256_qbs_kb7,0.4%,94.5169,flydsl_moe2_afp8_wfp8_bf16_t16x128x256_qbs_atomic,0.5%,280.2146,0,0.0,90.53,5068, +256,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,193.0576,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs_kb7_w3,0.3%,108.6767,flydsl_moe2_afp8_wfp8_bf16_t32x128x256_qbs_atomic,0.5%,301.7343,0,0.0,168.14,4725, +256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,223.7484,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs_kb4,0.4%,148.6545,flydsl_moe2_afp8_wfp8_bf16_t64x128x128_qbs_atomic_w3,0.5%,372.4029,0,0.0,272.47,3858, +256,2048,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,463.279,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,463.279,1,0.0,438.05,3148, +256,4096,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,690.7719,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,690.7719,1,0.0,587.57,2175, 
+256,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1315.7966,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1315.7966,1,0.0,616.93,1209, +256,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2361.4769,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,2361.4769,1,0.0,687.49,748, +256,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,4675.1629,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,4675.1629,1,0.0,694.52,453, +256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,278.8859,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,141.0514,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,0.0,419,26.85,6715.17,0, +256,1,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,31.2962,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs_kb7_w4,0.0%,14.2286,flydsl_moe2_afp8_wfp8_bf16_t16x128x256_qbs_reduce,0.3%,45.5248,0,0.0,4.35,62155, +256,2,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,40.3403,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs_kb7,0.4%,18.6088,flydsl_moe2_afp8_wfp8_bf16_t16x128x256_qbs_reduce_w3,0.3%,58.9491,0,0.0,6.72,48001, +256,4,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,76.1647,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,76.1647,1,1.0,10.41,37151, 
+256,8,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,127.4625,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,127.4625,1,1.0,12.44,22200, +256,16,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,213.9559,_ZN5aiter47fmoe_bf16_blockscaleBf16_g1u1_vs_ps_silu_32x256E,0.0%,0.0,,0.0%,213.9559,1,1.0,14.82,13226, +256,32,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,222.2612,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs,0.4%,113.3301,flydsl_moe2_afp8_wfp8_bf16_t16x128x256_qbs_reduce_w3,0.3%,335.5913,0,0.0,18.9,8433, +256,64,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,282.9431,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs,0.4%,143.6623,flydsl_moe2_afp8_wfp8_bf16_t32x256x128_qbs_reduce,0.3%,426.6054,0,0.0,29.73,6636, +256,128,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,312.7627,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs,0.3%,164.1691,flydsl_moe2_afp8_wfp8_bf16_t16x256x128_qbs_reduce_w3,0.3%,476.9318,0,0.0,53.19,5938, +256,256,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,342.7804,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs_kb7,0.4%,173.0648,flydsl_moe2_afp8_wfp8_bf16_t16x256x128_qbs_atomic_w3,0.7%,515.8452,0,0.0,98.35,5496, +256,512,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,362.0746,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs_kb7_w3,0.4%,184.2013,flydsl_moe2_afp8_wfp8_bf16_t32x256x128_qbs_atomic,0.7%,546.2759,0,0.0,185.75,5199, 
+256,1024,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,409.8416,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,221.4306,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_atomic,0.7%,631.2722,0,0.0,321.47,4517, +256,2048,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,900.7707,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,900.7707,1,0.0,450.59,3190, +256,4096,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,757.2850999999999,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,566.2038,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_atomic,0.6%,1323.4889,0,0.0,613.34,2204, +256,8192,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1154.1463,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,1013.2763,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_atomic,0.6%,2167.4226,0,0.0,749.05,1386, +256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,24.748,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs_kb7,0.0%,12.3124,flydsl_moe2_afp8_wfp8_bf16_t32x128x256_qbs_atomic_w3,0.5%,37.0604,0,0.0,2.38,38027, +256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,29.2287,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs_kb7,0.3%,13.9821,flydsl_moe2_afp8_wfp8_bf16_t32x128x256_qbs_reduce,0.3%,43.2108,0,0.0,4.08,32615, +256,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,36.4604,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs_kb7,0.4%,18.6138,flydsl_moe2_afp8_wfp8_bf16_t32x256x256_qbs_reduce,0.2%,55.0742,0,0.0,6.4,25590, 
+256,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.9075,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,71.9075,1,1.0,9.8,19600, +256,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,106.8293,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,106.8293,1,1.0,13.19,13195, +256,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,158.3854,_ZN5aiter47fmoe_bf16_blockscaleBf16_g1u1_vs_ps_silu_32x256E,0.0%,0.0,,0.0%,158.3854,1,1.0,17.8,8902, +256,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,215.1422,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x256E,0.0%,0.0,,0.0%,215.1422,1,1.0,26.2,6556, +256,128,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,242.0011,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x256E,0.0%,0.0,,0.0%,242.0011,1,1.0,46.59,5834, +256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,252.2485,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x256E,0.0%,0.0,,0.0%,252.2485,1,1.0,89.39,5608, +256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,265.50480000000005,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,,0.0%,265.5048,1,0.0,169.85,5349, +256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,294.4554,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,294.4554,1,0.0,306.31,4860, 
+256,2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,472.7997,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,472.7997,1,0.0,381.53,3073, +256,4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,688.9341000000001,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,688.9341,1,0.0,523.67,2173, +256,8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1149.1964,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1149.1964,1,0.0,627.88,1379, +256,16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2130.5055,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,2130.5055,1,0.0,677.36,826, +256,32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,4176.8963,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,4176.8963,1,0.0,691.0,506, +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,29.8047,flydsl_moe1_afp8_wfp8_bf16_t32x128x256_qbs_kb7,0.0%,13.8895,flydsl_moe2_afp8_wfp8_bf16_t32x256x128_qbs_reduce_w3,0.4%,43.6942,0,0.0,4.03,64507, +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,39.4842,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs_kb7_w3,0.4%,18.166,flydsl_moe2_afp8_wfp8_bf16_t16x128x256_qbs_reduce_w3,0.4%,57.6502,0,0.0,6.11,48891, 
+256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.308,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,71.308,1,1.0,9.88,39527, +256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,121.0862,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,121.0862,1,1.0,11.64,23278, +256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,203.3098,_ZN5aiter47fmoe_bf16_blockscaleBf16_g1u1_vs_ps_silu_32x256E,0.0%,0.0,,0.0%,203.3098,1,1.0,13.86,13865, +256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,311.7021,_ZN5aiter48fmoe_bf16_blockscaleBf16_g1u1_vs_pf2_silu_16x128E,0.0%,0.0,,0.0%,311.7021,1,1.0,18.09,9044, +256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,281.3667,flydsl_moe1_afp8_wfp8_bf16_t16x256x128_qbs,0.4%,140.6107,flydsl_moe2_afp8_wfp8_bf16_t16x256x256_qbs_reduce,0.3%,421.9774,0,0.0,26.72,6682, +256,128,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,315.9108,flydsl_moe1_afp8_wfp8_bf16_t32x128x256_qbs,0.4%,163.0024,flydsl_moe2_afp8_wfp8_bf16_t32x128x256_qbs_atomic,0.6%,478.9132,0,0.0,47.08,5891, +256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,319.1568,flydsl_moe1_afp8_wfp8_bf16_t16x128x256_qbs,0.4%,165.2337,flydsl_moe2_afp8_wfp8_bf16_t16x256x256_qbs_atomic,0.6%,484.3905,0,0.0,93.1,5830, 
+256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,331.7527,flydsl_moe1_afp8_wfp8_bf16_t32x128x128_qbs,0.4%,182.5476,flydsl_moe2_afp8_wfp8_bf16_t32x128x256_qbs_atomic_w3,0.6%,514.3003,0,0.0,175.37,5501, +256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,559.2398999999999,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,559.2399,1,0.0,322.56,5079, +256,2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,778.0739,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,778.0739,1,0.0,463.68,3679, +256,4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1226.1146,_ZN5aiter46fmoe_bf16_blockscaleFp8_g1u1_vs_ps_silu_64x256E,0.0%,0.0,,0.0%,1226.1146,1,0.0,588.49,2370, +256,8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1049.4375,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,922.8841,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_atomic,0.6%,1972.3216,0,0.0,731.68,1518, +256,16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,1857.8158,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,1769.614,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_reduce,0.3%,3627.4298,0,0.0,795.66,874, +256,32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,3487.464,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,3416.5307,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_reduce,0.3%,6903.9947,0,0.0,836.1,510, 
+256,16384,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,2048.6477,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,2018.3878,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_atomic,0.6%,4067.0355,0,0.0,798.37,782, +256,32768,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,3829.0652,flydsl_moe1_afp8_wfp8_bf16_t64x128x128_qbs,0.4%,3922.4713,flydsl_moe2_afp8_wfp8_bf16_t64x256x128_qbs_reduce,0.3%,7751.5365,0,0.0,837.77,455, diff --git a/aiter/configs/model_configs/a8w8_blockscale_untuned_fmoe_ds_v3.csv b/aiter/configs/model_configs/a8w8_blockscale_untuned_fmoe_ds_v3.csv index 11068aa4f5..0b662e08ea 100644 --- a/aiter/configs/model_configs/a8w8_blockscale_untuned_fmoe_ds_v3.csv +++ b/aiter/configs/model_configs/a8w8_blockscale_untuned_fmoe_ds_v3.csv @@ -1,4 +1,36 @@ token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +128,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 
+256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +2048,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +4096,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +8192,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +16384,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +32768,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +128,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 
+256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +2048,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +4096,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +8192,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +16384,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +32768,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 @@ -15,3 +47,19 @@ token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type, 8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +1,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 
+2,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +4,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +8,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +16,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +32,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +64,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +128,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +256,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +512,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +1024,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +2048,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +4096,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +8192,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +16384,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 +32768,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0 diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index f763b37ab3..04f1e25241 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -752,6 +752,7 @@ def _flydsl_stage1_wrapper( a_scale_one=_a_scale_one, 
xcd_swizzle=parsed.get("xcd_swizzle", 0), swiglu_limit=swiglu_limit, + quant_type=parsed.get("quant_type", ""), ) @@ -798,6 +799,7 @@ def _flydsl_stage2_wrapper( persist=parsed.get("persist", None), bias=bias2, xcd_swizzle=parsed.get("xcd_swizzle", 0), + quant_type=parsed.get("quant_type", ""), ) @@ -1531,7 +1533,10 @@ def fused_moe_2stages( num_rows=num_local_tokens, ) elif hidden_states.dtype != q_dtype_a: - if quant_type == QuantType.per_1x128 and metadata.stage1.func is asm_stage1: + if quant_type == QuantType.per_1x128 and metadata.stage1.func in ( + asm_stage1, + _flydsl_stage1_wrapper, + ): quant_func = functools.partial(quant_func, transpose_scale=True) a1, a1_scale = quant_func( hidden_states, diff --git a/aiter/ops/flydsl/kernels/moe_blockscale_2stage.py b/aiter/ops/flydsl/kernels/moe_blockscale_2stage.py new file mode 100644 index 0000000000..52ce3f0d86 --- /dev/null +++ b/aiter/ops/flydsl/kernels/moe_blockscale_2stage.py @@ -0,0 +1,2990 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 FlyDSL Project Contributors + +"""MoE Blockscale GEMM stage1/stage2 (FlyDSL MFMA FP8). + +Per-block scaling (ScaleBlockM=1, ScaleBlockN=128, ScaleBlockK=128). +FP8-only, g1u1 (gate+up with SiLU). + +Based on moe_gemm_2stage.py with blockscale compute_tile pattern +from blockscale_preshuffle_gemm.py. 
+""" + +import functools +import logging +import os +from contextlib import contextmanager + +import flydsl.compiler as flyc +import flydsl.expr as fx +from flydsl._mlir import ir +from flydsl._mlir.dialects import llvm, scf +from flydsl._mlir.dialects import math as math_dialect +from flydsl.compiler.kernel_function import CompilationContext +from flydsl.expr import arith, buffer_ops, const_expr, gpu, range_constexpr, rocdl, vector +from flydsl.expr.arith import ArithValue +from flydsl.expr.typing import T +from flydsl.runtime.device import get_rocm_arch as get_hip_arch +from flydsl.utils.smem_allocator import SmemAllocator, SmemPtr +from .mfma_epilogues import c_shuffle_epilog, default_epilog, mfma_epilog +from .mfma_preshuffle_pipeline import ( + buffer_copy_gmem16_dwordx4, + crd2idx, + lds_store_4b_xor16, + lds_store_8b_xor16, + lds_store_16b_xor16, + load_b_pack_k32, + make_preshuffle_b_layout, + swizzle_xor16, + tile_chunk_coord_i32, +) + + +@contextmanager +def _if_then(if_op): + """Compat helper for SCF IfOp then-region across old/new Python APIs.""" + with ir.InsertionPoint(if_op.then_block): + try: + yield if_op.then_block + finally: + blk = if_op.then_block + if (not blk.operations) or not isinstance(blk.operations[-1], scf.YieldOp): + scf.YieldOp([]) + + +@contextmanager +def _if_else(if_op): + """Compat helper for SCF IfOp else-region across old/new Python APIs.""" + if getattr(if_op, "else_block", None) is None: + raise RuntimeError("IfOp has no else block") + with ir.InsertionPoint(if_op.else_block): + try: + yield if_op.else_block + finally: + blk = if_op.else_block + if (not blk.operations) or not isinstance(blk.operations[-1], scf.YieldOp): + scf.YieldOp([]) + + +@functools.lru_cache(maxsize=1024) +def compile_moe_blockscale_gemm1( + *, + model_dim: int, + inter_dim: int, + experts: int, + topk: int, + tile_m: int, + tile_n: int, + tile_k: int, + doweight_stage1: bool, + scale_block_k: int = 128, + out_dtype: str = "f16", + use_cshuffle_epilog: 
bool | None = None, + waves_per_eu: int | None = None, + k_batch: int = 1, +): + """Compile stage1 kernel (`moe_gemm1`) and return the compiled executable. + + Blockscale is FP8-only (ScaleBlockM=1, ScaleBlockN=128, ScaleBlockK=128). + + k_batch: Split-K factor. When >1, K is partitioned across k_batch CTAs that + atomically accumulate gate/up partials into a `[tokens, topk, 2*inter_dim]` + f32 buffer (no in-kernel silu); caller must pre-zero this buffer and call + `silu_and_mul` externally to reduce gate/up -> `[tokens, topk, inter_dim]`. + """ + + gpu_arch = get_hip_arch() + _is_gfx950 = str(gpu_arch).startswith("gfx95") + allocator = SmemAllocator(None, arch=gpu_arch) + _state = {} + + in_dtype = "fp8" # blockscale is FP8-only + is_f16 = in_dtype == "fp16" + elem_bytes = 2 if is_f16 else 1 + if out_dtype not in ("f16", "bf16"): + raise ValueError(f"out_dtype must be 'f16' or 'bf16', got {out_dtype!r}") + # NOTE: don't materialize MLIR types outside an active MLIR Context. + out_mlir = lambda: (lambda ty: ty() if callable(ty) else ty)(T.f16 if out_dtype == "f16" else T.bf16) + tile_k_bytes = int(tile_k) * int(elem_bytes) + # K64-byte micro-step: always 64 bytes per `ku`. For fp16 this is 32 elements. + if (tile_k_bytes % 64) != 0: + raise ValueError( + f"tile_k_bytes must be divisible by 64, got tile_k_bytes={tile_k_bytes} " + f"(tile_k={tile_k}, elem_bytes={elem_bytes})" + ) + is_int4 = in_dtype == "int4" + # INT4 here means W4A8: X is int8, W is packed int4 and unpacked to int8 in-kernel. + is_int8 = (in_dtype == "int8") or is_int4 + x_is_token_slot = in_dtype == "int8smooth" + # "int8smooth" still uses int8 MFMA, but X/scale_x are provided per (token,slot). 
+ is_int8 = is_int8 or x_is_token_slot + + # Blockscale compile-time constants (K=model_dim for stage1) + if model_dim % scale_block_k != 0: + raise ValueError(f"model_dim ({model_dim}) must be divisible by scale_block_k ({scale_block_k})") + if (2 * inter_dim) % 128 != 0: + raise ValueError(f"2*inter_dim ({2 * inter_dim}) must be divisible by 128 (ScaleBlockN)") + sb_per_tile_s1 = tile_k // scale_block_k # scale blocks per tile (in K dim) + ku_per_sb_s1 = scale_block_k // 64 # K64-steps per scale block = 2 + nblk_k_w1 = model_dim // scale_block_k # K-blocks in W1 (=scale_k) + (2 * inter_dim) // 128 # N-blocks in W1 (ScaleBlockN=128) + + # Split-K validation (mirrors moe_gemm_2stage.py). + _is_splitk = k_batch > 1 + if _is_splitk: + if model_dim % k_batch != 0: + raise ValueError(f"model_dim={model_dim} not divisible by k_batch={k_batch}") + _k_per_batch = model_dim // k_batch + if _k_per_batch % tile_k != 0: + raise ValueError( + f"K_per_batch={_k_per_batch} not divisible by tile_k={tile_k} " + f"(model_dim={model_dim}, k_batch={k_batch})" + ) + # The ping-pong K-loop requires an even number of K tiles (>=4). + _k_tiles = _k_per_batch // tile_k + if not (_k_tiles >= 4 and _k_tiles % 2 == 0): + raise ValueError( + f"K_per_batch/tile_k={_k_tiles} must be even and >=4 for the ping-pong pipeline. " + f"Try a different k_batch (model_dim={model_dim}, tile_k={tile_k})." + ) + # Split-K requires the same scale-block alignment per batch. + if _k_per_batch % scale_block_k != 0: + raise ValueError( + f"K_per_batch={_k_per_batch} must be divisible by scale_block_k={scale_block_k}" + ) + else: + _k_per_batch = model_dim + + mfma_i32_k32 = None + if is_int8: + mfma_i32_k32 = getattr(rocdl, "mfma_i32_16x16x32i8", None) or getattr(rocdl, "mfma_i32_16x16x32_i8", None) + if mfma_i32_k32 is None: + raise AttributeError( + "INT8 K32 MFMA op not found: expected `rocdl.mfma_i32_16x16x32i8` (or `rocdl.mfma_i32_16x16x32_i8`)." 
+ ) + + ir.ShapedType.get_dynamic_size() + # W is packed int4 for W4A8: 2 values per byte. + (experts * (2 * inter_dim) * model_dim) // 2 if is_int4 else (experts * (2 * inter_dim) * model_dim) + + total_threads = 256 + bytes_x_per_tile = int(tile_m) * int(tile_k) * int(elem_bytes) + if bytes_x_per_tile % total_threads != 0: + raise ValueError( + "tile_m*tile_k*elem_bytes must be divisible by " + f"{total_threads}: tile_m={tile_m}, tile_k={tile_k}, elem_bytes={elem_bytes}" + ) + bytes_per_thread_x = bytes_x_per_tile // total_threads + # Keep MoE stage1 X gmem->LDS pipeline consistent with the optimized GEMM kernel: + # split into <=16B pieces and use `fly.copy(load-only)` for buffer_load_dwordx4. + # (Compute the split lens inside the kernel so the code matches GEMM structure.) + + # LDS128 mode (same idea as test_preshuffle_gemm.py): + # - LDS stride == tile_k (no extra padding) + XOR16 swizzle + # - Use ds_{read,write}_b128 (16B) and extract 8B halves for MFMA steps + _ck_lds128 = os.environ.get("FLYDSL_CK_LDS128", "1") in ("1", "true", "True", "YES", "yes") + pad_k = 0 if _ck_lds128 else 8 + lds_stride = tile_k + pad_k + if use_cshuffle_epilog is None: + use_cshuffle_epilog = os.environ.get("FLYDSL_MOE_STAGE1_CSHUFFLE", "1") in ("1", "true", "True", "YES", "yes") + use_cshuffle_epilog = bool(use_cshuffle_epilog) + # Split-K uses bf16 atomic CShuffle regardless of out_dtype (post-kernel silu_and_mul). + # Non-splitk cshuffle currently writes f16 to LDS, so silently fall back to the direct + # epilogue when out_dtype != f16 (rather than raising — caller code may default to bf16). + if _is_splitk: + use_cshuffle_epilog = True + elif out_dtype != "f16" and use_cshuffle_epilog: + use_cshuffle_epilog = False + + epilog_tag = "cshuffle" if use_cshuffle_epilog else "direct" + # IMPORTANT: module name participates in FlyDSL's compile cache key. + # Keep an explicit ABI tag so signature changes can't accidentally reuse an old binary. 
+ _wpe_tag = f"_wpe{waves_per_eu}" if waves_per_eu is not None else "" + _split_k_tag = f"_splitk{k_batch}" if _is_splitk else "" + module_name = ( + f"mfma_moe1_bs_{in_dtype}_{out_dtype}_{epilog_tag}" + f"_t{tile_m}x{tile_n}x{tile_k}{_wpe_tag}{_split_k_tag}" + f"_abi9" # split-K support + ).replace("-", "_") + + # ── LDS sizing (pure Python; no MLIR Context needed) ───────────────────── + _use_cshuffle_epilog = bool(use_cshuffle_epilog) + # Split-K writes bf16 partials into LDS before atomic-fadd; non-splitk writes f16. + # bf16 path lets us call silu_and_mul(post_out_bf16, post_input_bf16) directly + # post-kernel, matching the moe_gemm_2stage splitk convention. + lds_x_bytes = 2 * int(tile_m) * int(lds_stride) * int(elem_bytes) + lds_out_bytes = 2 * int(tile_m) * int(tile_n) if _use_cshuffle_epilog else 0 + lds_total_bytes = max(lds_x_bytes, lds_out_bytes) + lds_total_elems = lds_total_bytes if elem_bytes == 1 else (lds_total_bytes // 2) + + lds_alloc_bytes = int(lds_total_elems) * int(elem_bytes) + lds_alloc_offset = allocator._align(allocator.ptr, 16) + allocator.ptr = lds_alloc_offset + lds_alloc_bytes + + @flyc.kernel(name=module_name) + def moe_blockscale_gemm1( + arg_out: fx.Tensor, + arg_x: fx.Tensor, + arg_w: fx.Tensor, + arg_scale_x: fx.Tensor, + arg_scale_w: fx.Tensor, + arg_sorted_token_ids: fx.Tensor, + arg_expert_ids: fx.Tensor, + arg_sorted_weights: fx.Tensor, + arg_max_token_ids: fx.Tensor, + i32_tokens_in: fx.Int32, + i32_inter_in: fx.Int32, + i32_k_in: fx.Int32, + i32_size_expert_ids_in: fx.Int32, + ): + tokens_in = arith.index_cast(T.index, i32_tokens_in) + inter_in = arith.index_cast(T.index, i32_inter_in) + k_in = arith.index_cast(T.index, i32_k_in) + size_expert_ids_in = arith.index_cast(T.index, i32_size_expert_ids_in) + tokens_i32_v = i32_tokens_in + k_i32_v = i32_k_in + x_elem = T.f16 if is_f16 else (T.i8 if is_int8 else T.f8) + # For int4, weights are stored as packed bytes (i8) and unpacked to i8 packs. 
+ w_elem = T.f16 if is_f16 else (T.i8 if is_int8 else T.f8) + vec16_elems = 16 if elem_bytes == 1 else 8 + vec8_elems = 8 if elem_bytes == 1 else 4 + vec8_x = T.vec(vec8_elems, x_elem) + vec16_x = T.vec(vec16_elems, x_elem) + + def silu(x): + # device fast path: + # emu = exp(-x) ~= exp2(log2e * (-x)) -> v_exp_f32 + # sig = rcp(1 + emu) -> v_rcp_f32 + # y = x * sig + # + # Using llvm.amdgcn intrinsics prevents lowering to the div_scale/div_fixup + # sequences that introduce extra compares/cndmasks. + t = x * (-1.4426950408889634) # -log2(e) + emu = rocdl.exp2(T.f32, t) + den = 1.0 + emu + sig = rocdl.rcp(T.f32, den) + return x * sig + + acc_init = arith.constant_vector(0, T.i32x4) if is_int8 else arith.constant_vector(0.0, T.f32x4) + + # Layouts + fx.make_layout((tokens_i32_v, k_i32_v), stride=(k_i32_v, 1)) + + # B preshuffle layout: match GEMM test helper exactly. + c_n_total = arith.index(experts * (2 * inter_dim)) + kpack_bytes = 8 if is_int4 else 16 + b_layout = make_preshuffle_b_layout( + arith, c_n=c_n_total, c_k=k_in, kpack_bytes=kpack_bytes, elem_bytes=elem_bytes + ) + layout_b = b_layout.layout_b + (k_in * arith.index(int(elem_bytes))) // fx.Index(64) + + shape_lds = fx.make_shape(tile_m, tile_k) + stride_lds = fx.make_stride(lds_stride, 1) + layout_lds = fx.make_layout(shape_lds, stride_lds) + + tx = gpu.thread_id("x") + # Align with Aiter launch mapping (NSwizzle==false): + # - blockIdx.x -> N dimension (tile along inter_dim) + # - blockIdx.y -> expert-block id / M dimension (tile along sorted M) + # - blockIdx.z -> K-batch id (split-K) + by = gpu.block_id("x") # tile along inter_dim + bx = gpu.block_id("y") # tile along sorted M + + # Split-K: bz selects which K slice this CTA processes. 
+ if const_expr(_is_splitk): + bz = gpu.block_id("z") + k_base_idx = bz * arith.index(_k_per_batch) + else: + k_base_idx = arith.index(0) + + # Block validity: compute as early as possible so invalid blocks skip all buffer-resource + # setup, LDS pointer math, and gmem prefetch work. + bx_m = bx * fx.Index(tile_m) + maxids_rsrc = buffer_ops.create_buffer_resource( + arg_max_token_ids, max_size=False, num_records_bytes=fx.Index(4) + ) + max_token_id_i32 = buffer_ops.buffer_load(maxids_rsrc, fx.Index(0), vec_width=1, dtype=T.i32) + bx_m_i32 = arith.index_cast(T.i32, bx_m) + blk_valid = arith.cmpi(arith.CmpIPredicate.ult, bx_m_i32, max_token_id_i32) + # Common constants/atoms (hoisted): keep IR small like GEMM. + # XOR16 swizzle parameter (in bytes; constant, power-of-two in our configs). + k_blocks16 = arith.index(tile_k_bytes // 16) + layout_tx_wave_lane = fx.make_layout((4, 64), stride=(64, 1)) + layout_lane16 = fx.make_layout((4, 16), stride=(16, 1)) + + # Everything below is gated by `blk_valid` to avoid doing buffer-resource setup and + # gmem work for padding blocks. + _if_blk = scf.IfOp(blk_valid) + with _if_then(_if_blk): + base_ptr = allocator.get_base() + lds_x_ptr = SmemPtr( + base_ptr, + lds_alloc_offset, + (T.f16 if is_f16 else (T.i8 if is_int8 else T.f8)), + shape=(lds_total_elems,), + ) + lds_x = lds_x_ptr.get() + # Alias LDS bytes as f16 or bf16 (matches out_dtype) for CShuffle epilogue. + # Both are 2-byte elements; same LDS size. Split-K partials use the same + # dtype as out so the post-kernel silu_and_mul reduces in-place. + _lds_out_elem_type = T.bf16 if out_dtype == "bf16" else T.f16 + lds_out = ( + SmemPtr(base_ptr, lds_x_ptr.byte_offset, _lds_out_elem_type, shape=(tile_m * tile_n,)).get() + if _use_cshuffle_epilog + else None + ) + + # Buffer resources: for dynamic memrefs, provide `num_records_bytes` explicitly so + # hardware OOB behavior is stable (otherwise it falls back to a large max size). 
+ c_topk = fx.Index(topk) + + # X: [tokens, k] bytes = tokens*k*elem_bytes + x_rows = tokens_in * (c_topk if x_is_token_slot else fx.Index(1)) + x_nbytes_idx = x_rows * k_in * arith.index(int(elem_bytes)) + x_rsrc = buffer_ops.create_buffer_resource( + arg_x, max_size=False, num_records_bytes=arith.index_cast(T.i64, x_nbytes_idx) + ) + + w_rsrc = buffer_ops.create_buffer_resource(arg_w, max_size=False) + + # OUT: + # - non-splitk: [tokens, topk, inter] f16/bf16 (post-silu, post-doweight) + # - splitk: [tokens, topk, 2*inter] bf16 (gate+up partials, pre-silu; caller does silu_and_mul) + out_elem_bytes = 2 # 2 bytes for both f16/bf16 (and bf16 splitk partials) + if const_expr(_is_splitk): + out_nbytes_idx = tokens_in * c_topk * inter_in * fx.Index(2) * fx.Index(out_elem_bytes) + else: + out_nbytes_idx = tokens_in * c_topk * inter_in * fx.Index(out_elem_bytes) + out_rsrc = buffer_ops.create_buffer_resource( + arg_out, max_size=False, num_records_bytes=arith.index_cast(T.i64, out_nbytes_idx) + ) + + # fp16 path ignores scales completely (implicit scale=1.0). 
+ x_load_bytes = 16 + + sx_rsrc = -1 + sw_rsrc = -1 + if const_expr(not is_f16): + # scale_x: [nblk_k_w1, tokens] f32 transposed -> total = nblk_k_w1 * tokens + sx_nbytes_idx = arith.index(nblk_k_w1) * tokens_in * fx.Index(4) + sx_rsrc = buffer_ops.create_buffer_resource( + arg_scale_x, max_size=False, num_records_bytes=arith.index_cast(T.i64, sx_nbytes_idx) + ) + sw_rsrc = buffer_ops.create_buffer_resource(arg_scale_w, max_size=False) + + sorted_rsrc = buffer_ops.create_buffer_resource(arg_sorted_token_ids, max_size=False) + sorted_w_rsrc = buffer_ops.create_buffer_resource(arg_sorted_weights, max_size=False) + + # expert ids: [blocks] i32 -> bytes = size_expert_ids_in*4 + expert_rsrc = buffer_ops.create_buffer_resource( + arg_expert_ids, + max_size=False, + num_records_bytes=arith.index_cast(T.i64, size_expert_ids_in * fx.Index(4)), + ) + + # Expert id for this M tile (keep address math in `index`) + expert_i32 = buffer_ops.buffer_load(expert_rsrc, bx, vec_width=1, dtype=T.i32) + expert_idx = arith.index_cast(T.index, expert_i32) + inter2_idx = arith.index(2 * inter_dim) + expert_off_idx = expert_idx * inter2_idx # index + + # ---- X gmem->reg prefetch (match preshuffle GEMM mapping) ---- + # Prefer 16B buffer-load (dwordx4). If the per-thread byte count isn't divisible by + # 16, fall back to 8B (dwordx2) or 4B (dword) loads. For fp16 we require 16B. + x_load_bytes = 16 + if const_expr(is_f16): + if const_expr(bytes_per_thread_x % 16 != 0): + raise ValueError(f"[fp16] bytes_per_thread_x ({bytes_per_thread_x}) must be divisible by 16") + x_load_bytes = 16 + else: + if const_expr(bytes_per_thread_x % 16 == 0): + x_load_bytes = 16 + elif const_expr(bytes_per_thread_x % 8 == 0): + x_load_bytes = 8 + elif const_expr(bytes_per_thread_x % 4 == 0): + x_load_bytes = 4 + else: + raise ValueError( + f"bytes_per_thread_x ({bytes_per_thread_x}) must be divisible by 4 to use the dword-indexed load mapping." 
+ ) + num_x_loads = bytes_per_thread_x // x_load_bytes + chunk_i32 = x_load_bytes // 4 # dwords per chunk (1/2/4) + + c_k_div4 = (k_in * arith.index(int(elem_bytes))) // fx.Index(4) + c_k_div4_i32 = arith.index_cast(T.i32, c_k_div4) + fx.make_layout((tokens_i32_v, c_k_div4_i32), stride=(c_k_div4_i32, 1)) + tile_k_dwords = (int(tile_k) * int(elem_bytes)) // 4 + layout_x_tile_div4 = fx.make_layout((tile_m, tile_k_dwords), stride=(tile_k_dwords, 1)) + c_chunk_i32 = fx.Index(chunk_i32) + tx_i32_base = tx * c_chunk_i32 + mask24 = fx.Int32(0xFFFFFF) + # Keep i32 constants available for epilogue index math. + tokens_i32 = arith.index_cast(T.i32, tokens_in) + topk_i32 = fx.Int32(topk) + + def x_tile_chunk_coord_i32(i: int): + return tile_chunk_coord_i32( + arith, + tx_i32_base=tx_i32_base, + i=i, + total_threads=total_threads, + layout_tile_div4=layout_x_tile_div4, + chunk_i32=chunk_i32, + ) + + # decode token once (per thread's M-slice) and build a base row offset. + x_row_base_div4 = [] + x_col_local_i32 = [] + x_row_local = [] + for i in range_constexpr(num_x_loads): + row_local, col_local_i32 = x_tile_chunk_coord_i32(i) + x_row_local.append(row_local) + x_col_local_i32.append(col_local_i32) + + sorted_row_i = bx_m + row_local + # NOTE: rows beyond `num_valid_ids` can contain garbage (within the allocated + # buffer). That's OK as long as we never use an out-of-range token id to index X. + fused_i = buffer_ops.buffer_load(sorted_rsrc, sorted_row_i, vec_width=1, dtype=T.i32) + t_raw = fused_i & mask24 + # NOTE: aiter moe_sorting uses sentinel token_id == tokens for padding. + # Do NOT rely on buffer OOB semantics for X loads; explicitly mask to a safe row. + t_valid_i32 = arith.cmpi(arith.CmpIPredicate.ult, t_raw, tokens_i32) + if const_expr(x_is_token_slot): + s_raw = fused_i >> 24 + # X is indexed by token-slot in **slot-major** order: + # row_ts = slot * tokens + token + # This matches CK's moe_smoothquant output layout. 
+ row_ts_i32 = s_raw * tokens_i32 + t_raw + row_ts_idx = arith.index_cast(T.index, row_ts_i32) + # Apply bounds check to token-slot index + row_ts_safe = t_valid_i32.select(row_ts_idx, fx.Index(0)) + x_row_base_div4.append(row_ts_safe * c_k_div4) + else: + t_idx = arith.index_cast(T.index, t_raw) + t_safe = t_valid_i32.select(t_idx, fx.Index(0)) + x_row_base_div4.append(t_safe * c_k_div4) + + T.vec(1, T.i32) + T.vec(2, T.i32) + vec4_x = T.vec(4, x_elem) + + def load_x(idx_i32, x_load_bytes_v): + """Load `x_load_bytes` bytes from X (gmem) into regs. + + For 16B, keep the fast dwordx4 path. For 8B/4B, use byte offsets. + """ + if const_expr(x_load_bytes_v == 16): + idx_elem = idx_i32 if elem_bytes == 1 else (idx_i32 * fx.Index(2)) + return buffer_copy_gmem16_dwordx4( + buffer_ops, + vector, + elem_type=x_elem, + idx_i32=idx_elem, + rsrc=x_rsrc, + vec_elems=vec16_elems, + elem_bytes=elem_bytes, + ) + if const_expr(x_load_bytes_v == 8): + return buffer_ops.buffer_load(x_rsrc, idx_i32, vec_width=2, dtype=T.i32) + return buffer_ops.buffer_load(x_rsrc, idx_i32, vec_width=1, dtype=T.i32) + + def load_x_tile(base_k, x_load_bytes_v): + """Prefetch the per-thread X tile portion (gmem -> regs) for a given K base (in elements).""" + base_k_div4 = (base_k * arith.index(int(elem_bytes))) // fx.Index(4) + parts = [] + for i in range_constexpr(num_x_loads): + idx_i32 = x_row_base_div4[i] + base_k_div4 + x_col_local_i32[i] + x_vec = load_x(idx_i32, x_load_bytes_v) + if const_expr(x_load_bytes_v == 16): + parts.append(vector.bitcast(T.i32x4, x_vec)) + elif const_expr(x_load_bytes_v == 8): + parts.append(x_vec) + else: + parts.append(x_vec) + return parts + + # tx -> wave/lane (GEMM-style decomposition). 
+ coord_wl = fx.idx2crd(tx, layout_tx_wave_lane) + wave_id = fx.get(coord_wl, 0) + lane_id = fx.get(coord_wl, 1) + coord_l16 = fx.idx2crd(lane_id, layout_lane16) + lane_div_16 = fx.get(coord_l16, 0) + lane_mod_16 = fx.get(coord_l16, 1) + + # Match GEMM naming/pattern: row in LDS is lane_mod_16, and col base is lane_div_16*16. + row_a_lds = lane_mod_16 + a_kpack_elems = 16 // elem_bytes + col_offset_base = lane_div_16 * arith.index(int(a_kpack_elems)) + col_offset_base_bytes = ( + col_offset_base if elem_bytes == 1 else (col_offset_base * arith.index(int(elem_bytes))) + ) + + # Dynamic N tiling within block (same as existing kernels) + by_n = by * fx.Index(tile_n) + num_waves = 4 + n_per_wave = tile_n // num_waves + num_acc_n = n_per_wave // 16 + c_n_per_wave = fx.Index(n_per_wave) + wave_mod_4 = wave_id % fx.Index(4) + n_tile_base = wave_mod_4 * c_n_per_wave + + # Precompute n_blk/n_intra for gate and up rows (GEMM-style: idx2crd/get) + n_intra_gate = [] + n_blk_gate = [] + n_intra_up = [] + n_blk_up = [] + col_g_list = [] + inter_idx = fx.Index(inter_dim) + # layout for (row -> (blk,intra)) where intra is 0..15 + c_n0 = c_n_total // fx.Index(16) + c_n0_i32 = arith.index_cast(T.i32, c_n0) + layout_n_blk_intra = fx.make_layout((c_n0_i32, 16), stride=(16, 1)) + for ni in range_constexpr(num_acc_n): + offset = arith.index(ni * 16) + col_g = by_n + n_tile_base + col_g = col_g + offset + col_g = col_g + lane_mod_16 + col_g_list.append(col_g) + + row_gate = expert_off_idx + col_g + row_up = row_gate + inter_idx + + coord_gate = fx.idx2crd(row_gate, layout_n_blk_intra) + n_blk_gate.append(fx.get(coord_gate, 0)) + n_intra_gate.append(fx.get(coord_gate, 1)) + + coord_up = fx.idx2crd(row_up, layout_n_blk_intra) + n_blk_up.append(fx.get(coord_up, 0)) + n_intra_up.append(fx.get(coord_up, 1)) + + m_repeat = tile_m // 16 + k_unroll = tile_k_bytes // 64 # K64-byte micro-step (2x MFMA) + + # --- B Load Logic (K64) - shared layout with preshuffle GEMM --- + def load_b_pack(base_k, 
ki_step, ni, blk_list, intra_list): + return load_b_pack_k32( + buffer_ops, + arith, + vector, + arg_b=arg_w, + b_rsrc=w_rsrc, + layout_b=layout_b, + base_k=base_k, + ki_step=ki_step, + n_blk=blk_list[ni], + n_intra=intra_list[ni], + lane_div_16=lane_div_16, # 0..3 + elem_type=w_elem, + kpack_bytes=kpack_bytes, + elem_bytes=elem_bytes, + unpack_int4=is_int4, + ) + + def load_b_tile(base_k, blk_list, intra_list): + """Prefetch the entire per-thread B tile (gmem -> regs) for a given K base. + + Returns a list of length `k_unroll`, where each entry is a tuple: + (packs_half0[ni], packs_half1[ni]) for the K64 micro-step. + """ + b_tile = [] + for ku in range_constexpr(k_unroll): + packs0 = [] + packs1 = [] + for ni in range_constexpr(num_acc_n): + ki0 = (ku * 2) + 0 + ki1 = (ku * 2) + 1 + b0 = load_b_pack(base_k, ki0, ni, blk_list, intra_list) + b1 = load_b_pack(base_k, ki1, ni, blk_list, intra_list) + packs0.append(b0) + packs1.append(b1) + b_tile.append((packs0, packs1)) + return b_tile + + acc_gate = [arith.constant_vector(0.0, T.f32x4)] * (num_acc_n * m_repeat) + acc_up = [arith.constant_vector(0.0, T.f32x4)] * (num_acc_n * m_repeat) + + # ---- Pipeline helpers: store X tile to LDS with ping-pong base ---- + def store_x_tile_to_lds(vec_x_in_parts, lds_base, x_load_bytes_v): + for i in range_constexpr(num_x_loads): + row_local = x_row_local[i] + col_local_i32 = x_col_local_i32[i] + if const_expr(x_load_bytes_v == 16): + lds_store_16b_xor16( + arith, + vector, + lds_memref=lds_x, + vec16_ty=vec16_x, + layout_lds=layout_lds, + row_local=row_local, + col_local_i32=col_local_i32, + tx_c4=fx.Index(4), + k_blocks16=k_blocks16, + lds_base=lds_base, + vec_part_i32x4=vec_x_in_parts[i], + elem_bytes=elem_bytes, + ) + elif const_expr(x_load_bytes_v == 8): + lds_store_8b_xor16( + arith, + vector, + lds_memref=lds_x, + vec8_ty=vec8_x, + layout_lds=layout_lds, + row_local=row_local, + col_local_i32=col_local_i32, + tx_c4=fx.Index(4), + k_blocks16=k_blocks16, + lds_base=lds_base, 
+ vec_part_i32x2=vec_x_in_parts[i], + ) + else: + lds_store_4b_xor16( + arith, + vector, + lds_memref=lds_x, + vec4_ty=vec4_x, + layout_lds=layout_lds, + row_local=row_local, + col_local_i32=col_local_i32, + tx_c4=fx.Index(4), + k_blocks16=k_blocks16, + lds_base=lds_base, + vec_part_i32x1=vec_x_in_parts[i], + ) + + # --- A LDS load helper for K64 (load 16B once, extract 2x i64 halves) --- + def lds_load_packs_k64(curr_row_a_lds, col_base_bytes, lds_base): + col_base_swz_bytes = swizzle_xor16(curr_row_a_lds, col_base_bytes, k_blocks16) + col_base_swz = ( + col_base_swz_bytes if elem_bytes == 1 else (col_base_swz_bytes // arith.index(int(elem_bytes))) + ) + idx_a16 = crd2idx((curr_row_a_lds, col_base_swz), layout_lds) + idx_a16 = idx_a16 + lds_base + loaded_a16 = vector.load_op(vec16_x, lds_x, [idx_a16]) + a_i64x2 = vector.bitcast(T.i64x2, loaded_a16) + a0 = vector.extract(a_i64x2, static_position=[0], dynamic_position=[]) + a1 = vector.extract(a_i64x2, static_position=[1], dynamic_position=[]) + return a0, a1 + + # --- Blockscale pre-decode and helpers --- + c_scale_block_k = fx.Index(scale_block_k) + c_128 = fx.Index(128) + c_nblk_k_w1 = fx.Index(nblk_k_w1) + row_off_base = lane_div_16 * fx.Index(4) + + # Pre-decode sorted token IDs as i32 (constant across all K-tiles). + # OOB buffer loads return 0, so no validity masking needed for scale values. 
+ _pre_t_safe_i32 = [] + for _mi in range_constexpr(m_repeat): + _mi_safe = [] + for _ii in range_constexpr(4): + _row_in_tile = arith.index(_mi * 16) + row_off_base + fx.Index(_ii) + _sorted_row = bx_m + _row_in_tile + _fused_pre = buffer_ops.buffer_load(sorted_rsrc, _sorted_row, vec_width=1, dtype=T.i32) + _t_id_pre = _fused_pre & mask24 + _t_valid_pre = arith.cmpi(arith.CmpIPredicate.ult, _t_id_pre, tokens_i32) + _t_safe_pre = _t_valid_pre.select(_t_id_pre, fx.Int32(0)) + _mi_safe.append(_t_safe_pre) + _pre_t_safe_i32.append(_mi_safe) + + # Pre-compute N-block indices for scale_w (constant per CTA) + _pre_n_block_gate = [] + _pre_n_block_up = [] + for _ni in range_constexpr(num_acc_n): + _col_base_ni_pre = by_n + n_tile_base + arith.index(_ni * 16) + _pre_n_block_gate.append((expert_off_idx + _col_base_ni_pre) // c_128) + _pre_n_block_up.append((expert_off_idx + inter_idx + _col_base_ni_pre) // c_128) + + def load_scales_s1(k_base): + all_combined = [] + for sb in range_constexpr(sb_per_tile_s1): + kb = k_base // c_scale_block_k + fx.Index(sb) + sa_base_offset = kb * tokens_in + + s_a_vecs = [] + sa_base_i32 = arith.index_cast(T.i32, sa_base_offset) + for mi in range_constexpr(m_repeat): + s_a_row = [] + for ii in range_constexpr(4): + t_safe_i32 = _pre_t_safe_i32[mi][ii] + sa_idx_i32 = sa_base_i32 + t_safe_i32 + sa_idx = arith.index_cast(T.index, sa_idx_i32) + s_a_val = buffer_ops.buffer_load(sx_rsrc, sa_idx, vec_width=1, dtype=T.f32) + s_a_row.append(s_a_val) + s_a_vecs.append(s_a_row) + + _sw_shared_n = n_per_wave <= 128 + s_w_gate_vals = [] + s_w_up_vals = [] + s_w_gate = fx.Float32(1.0) + s_w_up = fx.Float32(1.0) + for ni in range_constexpr(num_acc_n): + if const_expr(ni == 0 or not _sw_shared_n): + sw_gate_idx = _pre_n_block_gate[ni] * c_nblk_k_w1 + kb + s_w_gate = buffer_ops.buffer_load(sw_rsrc, sw_gate_idx, vec_width=1, dtype=T.f32) + sw_up_idx = _pre_n_block_up[ni] * c_nblk_k_w1 + kb + s_w_up = buffer_ops.buffer_load(sw_rsrc, sw_up_idx, vec_width=1, 
dtype=T.f32) + s_w_gate_vals.append(s_w_gate) + s_w_up_vals.append(s_w_up) + + s_a_vec4_list = [] + for mi in range_constexpr(m_repeat): + s_a_vec4_list.append(vector.from_elements(T.f32x4, s_a_vecs[mi])) + all_combined.append((s_a_vec4_list, s_w_gate_vals, s_w_up_vals)) + return all_combined + + def compute_tile_bs_s1( + acc_gate_in, acc_up_in, b_gate_tile_in, b_up_tile_in, lds_base, pre_scales, *, a0_prefetch=None + ): + current_gate = list(acc_gate_in) + current_up = list(acc_up_in) + mfma_res_ty = T.f32x4 + + if const_expr(_is_gfx950): + + def _pack128(x0, x1, x2, x3): + v4 = vector.from_elements(T.vec(4, T.i64), [x0, x1, x2, x3]) + return vector.bitcast(T.vec(8, T.i32), v4) + + for sb in range_constexpr(sb_per_tile_s1): + s_a_vec4_list, s_w_gate_vals, s_w_up_vals = pre_scales[sb] + ku0 = sb * ku_per_sb_s1 + ku1 = ku0 + 1 + bg0_p0, bg0_p1 = b_gate_tile_in[ku0] + bg1_p0, bg1_p1 = b_gate_tile_in[ku1] + bu0_p0, bu0_p1 = b_up_tile_in[ku0] + bu1_p0, bu1_p1 = b_up_tile_in[ku1] + col0 = col_offset_base_bytes + arith.index(ku0 * 64) + col1 = col_offset_base_bytes + arith.index(ku1 * 64) + for mi in range_constexpr(m_repeat): + curr_row = row_a_lds + arith.index(mi * 16) + a0 = arith.constant(0, type=T.i64) + a1 = arith.constant(0, type=T.i64) + if const_expr(a0_prefetch is not None and sb == 0 and mi == 0): + a0, a1 = a0_prefetch + else: + a0, a1 = lds_load_packs_k64(curr_row, col0, lds_base) + a2, a3 = lds_load_packs_k64(curr_row, col1, lds_base) + a128 = _pack128(a0, a1, a2, a3) + s_a_v4 = s_a_vec4_list[mi] + pending_gate_up = None + for ni in range_constexpr(num_acc_n): + acc_idx = mi * num_acc_n + ni + bg128 = _pack128(bg0_p0[ni], bg0_p1[ni], bg1_p0[ni], bg1_p1[ni]) + bu128 = _pack128(bu0_p0[ni], bu0_p1[ni], bu1_p0[ni], bu1_p1[ni]) + blk_g = rocdl.mfma_scale_f32_16x16x128_f8f6f4( + mfma_res_ty, [a128, bg128, acc_init, 0, 0, 0, 0x7F7F7F7F, 0, 0x7F7F7F7F] + ) + blk_u = rocdl.mfma_scale_f32_16x16x128_f8f6f4( + mfma_res_ty, [a128, bu128, acc_init, 0, 0, 0, 0x7F7F7F7F, 
0, 0x7F7F7F7F] + ) + rocdl.sched_barrier(0) + if const_expr(pending_gate_up is not None): + prev_acc_idx, prev_blk_g, prev_blk_u, prev_ni = pending_gate_up + s_wg_bc = vector.broadcast(T.f32x4, s_w_gate_vals[prev_ni]) + s_wu_bc = vector.broadcast(T.f32x4, s_w_up_vals[prev_ni]) + scale_g = ArithValue(s_a_v4) * ArithValue(s_wg_bc) + scale_u = ArithValue(s_a_v4) * ArithValue(s_wu_bc) + current_gate[prev_acc_idx] = math_dialect.fma( + prev_blk_g, scale_g, current_gate[prev_acc_idx] + ) + current_up[prev_acc_idx] = math_dialect.fma( + prev_blk_u, scale_u, current_up[prev_acc_idx] + ) + pending_gate_up = (acc_idx, blk_g, blk_u, ni) + if const_expr(pending_gate_up is not None): + prev_acc_idx, prev_blk_g, prev_blk_u, prev_ni = pending_gate_up + s_wg_bc = vector.broadcast(T.f32x4, s_w_gate_vals[prev_ni]) + s_wu_bc = vector.broadcast(T.f32x4, s_w_up_vals[prev_ni]) + scale_g = ArithValue(s_a_v4) * ArithValue(s_wg_bc) + scale_u = ArithValue(s_a_v4) * ArithValue(s_wu_bc) + current_gate[prev_acc_idx] = math_dialect.fma( + prev_blk_g, scale_g, current_gate[prev_acc_idx] + ) + current_up[prev_acc_idx] = math_dialect.fma( + prev_blk_u, scale_u, current_up[prev_acc_idx] + ) + else: + mfma_fn = ( + mfma_i32_k32 + if const_expr(is_int8) + else (rocdl.mfma_f32_16x16x16f16 if is_f16 else rocdl.mfma_f32_16x16x32_fp8_fp8) + ) + + def _i64_to_v4f16(x_i64): + v1 = vector.from_elements(T.vec(1, T.i64), [x_i64]) + return vector.bitcast(T.f16x4, v1) + + def mfma_k64(acc_in, a0, a1, b0, b1): + if const_expr(is_f16): + a0v = _i64_to_v4f16(a0) + a1v = _i64_to_v4f16(a1) + b0v = _i64_to_v4f16(b0) + b1v = _i64_to_v4f16(b1) + acc_mid = mfma_fn(mfma_res_ty, [a0v, b0v, acc_in, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1v, b1v, acc_mid, 0, 0, 0]) + acc_mid = mfma_fn(mfma_res_ty, [a0, b0, acc_in, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1, b1, acc_mid, 0, 0, 0]) + + for sb in range_constexpr(sb_per_tile_s1): + s_a_vec4_list, s_w_gate_vals, s_w_up_vals = pre_scales[sb] + for mi in 
range_constexpr(m_repeat): + s_a_v4 = s_a_vec4_list[mi] + for ni in range_constexpr(num_acc_n): + acc_idx = mi * num_acc_n + ni + blk_g = acc_init + blk_u = acc_init + for ku_local in range_constexpr(ku_per_sb_s1): + ku = sb * ku_per_sb_s1 + ku_local + b_gate_packs0, b_gate_packs1 = b_gate_tile_in[ku] + b_up_packs0, b_up_packs1 = b_up_tile_in[ku] + ki64 = arith.index(ku * 64) + col_base = col_offset_base_bytes + ki64 + a0 = arith.constant(-1, type=T.i64) + a1 = arith.constant(-1, type=T.i64) + if const_expr( + (a0_prefetch is not None) and (sb == 0) and (ku_local == 0) and (mi == 0) + ): + a0, a1 = a0_prefetch + else: + a0, a1 = lds_load_packs_k64( + row_a_lds + arith.index(mi * 16), col_base, lds_base + ) + blk_g = mfma_k64(blk_g, a0, a1, b_gate_packs0[ni], b_gate_packs1[ni]) + blk_u = mfma_k64(blk_u, a0, a1, b_up_packs0[ni], b_up_packs1[ni]) + s_wg_bc = vector.broadcast(T.f32x4, s_w_gate_vals[ni]) + s_wu_bc = vector.broadcast(T.f32x4, s_w_up_vals[ni]) + scale_g = ArithValue(s_a_v4) * ArithValue(s_wg_bc) + scale_u = ArithValue(s_a_v4) * ArithValue(s_wu_bc) + current_gate[acc_idx] = math_dialect.fma(blk_g, scale_g, current_gate[acc_idx]) + current_up[acc_idx] = math_dialect.fma(blk_u, scale_u, current_up[acc_idx]) + return current_gate, current_up + + def compute_tile( + acc_gate_in, + acc_up_in, + b_gate_tile_in, + b_up_tile_in, + lds_base, + *, + prefetch_epilogue: bool = False, + a0_prefetch=None, + ): + gate_list = list(acc_gate_in) + up_list = list(acc_up_in) + mfma_res_ty = T.i32x4 if is_int8 else T.f32x4 + mfma_fn = ( + mfma_i32_k32 + if const_expr(is_int8) + else (rocdl.mfma_f32_16x16x16f16 if is_f16 else rocdl.mfma_f32_16x16x32_fp8_fp8) + ) + + # Optional: prefetch epilogue scales while we are about to run the last MFMA tile, + # matching the preshuffle GEMM pattern of overlapping scale loads with MFMA. 
+ epilogue_pf = None + if const_expr(prefetch_epilogue): + expert_off_pf = expert_off_idx + sw_gate_pf = [] + sw_up_pf = [] + for ni in range_constexpr(num_acc_n): + col_g = col_g_list[ni] + row_gate_idx = expert_off_pf + col_g + row_up_idx = row_gate_idx + inter_idx + sw_gate_pf.append( + fx.Float32(1.0) + if const_expr(is_f16) + else buffer_ops.buffer_load(sw_rsrc, row_gate_idx, vec_width=1, dtype=T.f32) + ) + sw_up_pf.append( + fx.Float32(1.0) + if const_expr(is_f16) + else buffer_ops.buffer_load(sw_rsrc, row_up_idx, vec_width=1, dtype=T.f32) + ) + epilogue_pf = (sw_gate_pf, sw_up_pf) + + def _i64_to_v4f16(x_i64): + v1 = vector.from_elements(T.vec(1, T.i64), [x_i64]) + return vector.bitcast(T.f16x4, v1) + + def mfma_k64(acc_in, a0, a1, b0, b1): + if const_expr(is_f16): + a0v = _i64_to_v4f16(a0) + a1v = _i64_to_v4f16(a1) + b0v = _i64_to_v4f16(b0) + b1v = _i64_to_v4f16(b1) + acc_mid = mfma_fn(mfma_res_ty, [a0v, b0v, acc_in, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1v, b1v, acc_mid, 0, 0, 0]) + acc_mid = mfma_fn(mfma_res_ty, [a0, b0, acc_in, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1, b1, acc_mid, 0, 0, 0]) + + for ku in range_constexpr(k_unroll): + b_gate_packs0, b_gate_packs1 = b_gate_tile_in[ku] + b_up_packs0, b_up_packs1 = b_up_tile_in[ku] + ki64 = arith.index(ku * 64) + col_base = col_offset_base_bytes + ki64 + + for mi in range_constexpr(m_repeat): + mi_val = arith.index(mi * 16) + curr_row_a_lds = row_a_lds + mi_val + + a0 = arith.constant(-1, type=T.i64) + a1 = arith.constant(-1, type=T.i64) + if const_expr((a0_prefetch is not None) and (ku == 0) and (mi == 0)): + a0, a1 = a0_prefetch + else: + a0, a1 = lds_load_packs_k64(curr_row_a_lds, col_base, lds_base) + + for ni in range_constexpr(num_acc_n): + acc_idx = mi * num_acc_n + ni + gate_list[acc_idx] = mfma_k64( + gate_list[acc_idx], + a0, + a1, + b_gate_packs0[ni], + b_gate_packs1[ni], + ) + up_list[acc_idx] = mfma_k64( + up_list[acc_idx], + a0, + a1, + b_up_packs0[ni], + b_up_packs1[ni], + ) + return 
gate_list, up_list, epilogue_pf + + # ── scf.for loop helpers (acc-only loop state, CK-style) ────── + n_accs_half = m_repeat * num_acc_n + + # ---------------- 2-stage pipeline (ping-pong LDS + B tile prefetch) ---------------- + lds_tile_elems = arith.index(tile_m * lds_stride) + lds_base_cur = fx.Index(0) + lds_base_nxt = lds_tile_elems + + rocdl.sched_barrier(0) + + def hot_loop_scheduler(): + mfma_per_ku = m_repeat * num_acc_n * 2 * 2 + total_mfma = k_unroll * mfma_per_ku + rocdl.sched_group_barrier(rocdl.mask_dsrd, ku_per_sb_s1 * m_repeat, 0) + rocdl.sched_group_barrier(rocdl.mask_mfma, total_mfma, 1) + rocdl.sched_group_barrier(rocdl.mask_vmem_rd, num_x_loads, 2) + rocdl.sched_group_barrier(rocdl.mask_dswr, num_x_loads, 3) + rocdl.sched_barrier(0) + + def do_one_stage(acc_gate_in, acc_up_in, k_compute, k_next, lds_compute, lds_store): + """One pipeline stage: load next tile data, compute current tile, store X to LDS.""" + scale_fn = load_scales_s1 + pre_scales = scale_fn(k_compute) + x_regs_next = load_x_tile(k_next, x_load_bytes) + b_gate_cur = load_b_tile(k_compute, n_blk_gate, n_intra_gate) + b_up_cur = load_b_tile(k_compute, n_blk_up, n_intra_up) + + ag, au = compute_tile_bs_s1(acc_gate_in, acc_up_in, b_gate_cur, b_up_cur, lds_compute, pre_scales) + store_x_tile_to_lds(x_regs_next, lds_store, x_load_bytes) + hot_loop_scheduler() + gpu.barrier() + return ag, au + + # Prologue: prefetch tile0 X into LDS, sync. 
+ k0 = k_base_idx + x_regs0 = load_x_tile(k0, x_load_bytes) + store_x_tile_to_lds(x_regs0, lds_base_cur, x_load_bytes) + gpu.barrier() + + lds_base_pong = lds_base_cur + lds_base_ping = lds_base_nxt + + c2_tile_k = arith.index(tile_k * 2) + c_tile_k = arith.index(tile_k) + total_tiles = int(_k_per_batch) // int(tile_k) + pair_iters = max((total_tiles - 2) // 2, 0) + c_k_main = pair_iters * tile_k * 2 + + init_state = list(acc_gate) + list(acc_up) + + for k_iv_rel, inner in range(0, c_k_main, tile_k * 2, init=init_state): + n = n_accs_half + acc_gate_in = list(inner[:n]) + acc_up_in = list(inner[n : 2 * n]) + + k_iv = k_base_idx + k_iv_rel + next_k1 = k_iv + c_tile_k + + acc_gate_s0, acc_up_s0 = do_one_stage( + acc_gate_in, acc_up_in, k_iv, next_k1, lds_base_pong, lds_base_ping + ) + + next_k2 = k_iv + c2_tile_k + + acc_gate_s1, acc_up_s1 = do_one_stage( + acc_gate_s0, acc_up_s0, next_k1, next_k2, lds_base_ping, lds_base_pong + ) + + results = yield list(acc_gate_s1) + list(acc_up_s1) + + n = n_accs_half + acc_gate = list(results[:n]) + acc_up = list(results[n : 2 * n]) + + # Tail: process the last two K tiles within this K-batch. + # For non-splitk: covers [k_in - 2*tile_k, k_in). + # For splitk: covers the last 2 tiles of this CTA's K slice. 
+ _k_batch_end_idx = k_base_idx + arith.index(_k_per_batch) + k_tail0 = _k_batch_end_idx - c2_tile_k + k_tail1 = _k_batch_end_idx - c_tile_k + + acc_gate, acc_up = do_one_stage(acc_gate, acc_up, k_tail0, k_tail1, lds_base_pong, lds_base_ping) + + pre_scales_tail1 = load_scales_s1(k_tail1) + b_gate_last = load_b_tile(k_tail1, n_blk_gate, n_intra_gate) + b_up_last = load_b_tile(k_tail1, n_blk_up, n_intra_up) + acc_gate, acc_up = compute_tile_bs_s1( + acc_gate, acc_up, b_gate_last, b_up_last, lds_base_ping, pre_scales_tail1 + ) + + # Store epilogue to out[t, slot, inter] + tokens_i32_v = tokens_i32 + topk_i32_v = topk_i32 + inter_i32_v = fx.Int32(inter_dim) + mask24_i32 = fx.Int32(0xFFFFFF) + + # Blockscale: dequant already done in compute_tile_bs_s1, no sw/sx needed here. + + # Epilogue hoists to keep IR + Python build time small: + col_i32_list = [] + for ni in range_constexpr(num_acc_n): + col_i32_list.append(arith.index_cast(T.i32, col_g_list[ni])) + + lane_div_16 * fx.Index(4) + inter_i32_local = inter_i32_v + + if const_expr(use_cshuffle_epilog): + if const_expr(lds_out is None): + raise RuntimeError("CShuffle epilogue enabled but lds_out is not allocated/aliased.") + + # ─── Split-K epilogue: two-pass gate/up with atomic fadd ─── + # Output buffer layout: [tokens, topk, 2*inter] f16/bf16; gate to [..., 0:inter], up to [..., inter:2*inter]. + # Blockscale dequant already done in compute_tile_bs_s1, so no scale multiplication here. + # Caller must pre-zero `arg_out` and run `silu_and_mul` post this kernel. + # Atomic dtype matches out_dtype: + # - bf16: needs `buffer_atomic_pk_add_bf16` (gfx95+/gfx12), falls back to + # global atomicrmw fadd on gfx94X. + # - f16: uses `buffer_atomic_pk_add_f16` on all supported archs (gfx94+). 
+ if const_expr(_is_splitk): + _splitk_out_is_bf16 = out_dtype == "bf16" + _splitk_out_elem_mlir = T.bf16 if _splitk_out_is_bf16 else T.f16 + out_base_idx = buffer_ops.extract_base_index(arg_out) + _split_k_out_row_stride = inter_dim * 2 * out_elem_bytes # bytes per (token,slot) row + _split_k_e_vec = 2 # vec2 atomic fadd (pk_add_bf16 / pk_add_f16) + + # bf16 only: gfx95+/gfx12 have buffer_atomic_pk_add_bf16; gfx94X must use global atomicrmw. + # f16: buffer_atomic_pk_add_f16 is available on all supported archs. + _has_buffer_atomic_bf16 = str(gpu_arch).startswith(("gfx95", "gfx12")) + _needs_global_atomic_bf16 = _splitk_out_is_bf16 and not _has_buffer_atomic_bf16 + + # Mutable slot: 0 for gate pass, inter_dim for up pass + _split_k_n_offset = [0] + _split_k_acc = [acc_gate] + + def write_row_to_lds_splitk( + *, + mi: int, + ii: int, + row_in_tile, + row, + row_base_lds, + col_base_local, + num_acc_n: int, + lds_out, + ): + """Write blockscale partials to LDS as out_dtype (no silu, no doweight; already dequant).""" + _acc = _split_k_acc[0] + for ni in range_constexpr(num_acc_n): + col_local = col_base_local + (ni * 16) + acc_idx = mi * num_acc_n + ni + v = vector.extract(_acc[acc_idx], static_position=[ii], dynamic_position=[]) + # v is already f32 with sx*sw applied in compute_tile_bs_s1. + v_out = arith.trunc_f(_splitk_out_elem_mlir, v) + lds_idx = row_base_lds + col_local + v1 = vector.from_elements(T.vec(1, _splitk_out_elem_mlir), [v_out]) + vector.store(v1, lds_out, [lds_idx], alignment=2) + + _splitk_zero_i32 = fx.Int32(0) + + def precompute_row_splitk(*, row_local, row): + # Returns (row_byte_ctx, t_ok); c_shuffle_epilog wraps the + # store_pair call with scf.IfOp(t_ok) for the second tuple element. 
+ fused2 = buffer_ops.buffer_load(sorted_rsrc, row, vec_width=1, dtype=T.i32) + t2 = fused2 & mask24_i32 + s2 = fused2 >> 24 + t_ok = arith.cmpi(arith.CmpIPredicate.ult, t2, tokens_i32_v) + t_idx = arith.index_cast(T.index, t2) + s_idx = arith.index_cast(T.index, s2) + ts_idx = t_idx * arith.index(topk) + s_idx + if const_expr(_needs_global_atomic_bf16): + row_byte_base = out_base_idx + ts_idx * arith.index(_split_k_out_row_stride) + return (row_byte_base, t_ok) + else: + row_byte_off = ts_idx * arith.index(_split_k_out_row_stride) + return (row_byte_off, t_ok) + + def store_pair_splitk(*, row_local, row, row_ctx, col_pair0, col_g0, frag): + # `row_ctx` here is the *first* element of the tuple returned by + # precompute_row_splitk (c_shuffle_epilog destructures (ctx, pred)). + row_byte_ctx = row_ctx + col_idx = col_g0 + arith.index(_split_k_n_offset[0]) + byte_off_col = col_idx * arith.index(out_elem_bytes) + if const_expr(_needs_global_atomic_bf16): + # gfx94X: global atomicrmw fadd + ptr_addr_idx = row_byte_ctx + byte_off_col + out_ptr = buffer_ops.create_llvm_ptr(ptr_addr_idx, address_space=1) + out_ptr_v = out_ptr._value if hasattr(out_ptr, "_value") else out_ptr + frag_v = frag._value if hasattr(frag, "_value") else frag + llvm.AtomicRMWOp( + llvm.AtomicBinOp.fadd, + out_ptr_v, + frag_v, + llvm.AtomicOrdering.monotonic, + syncscope="agent", + alignment=_split_k_e_vec * out_elem_bytes, + ) + else: + # f16: buffer_atomic_pk_add_f16 (all archs) + # bf16 on gfx95+/gfx12: buffer_atomic_pk_add_bf16 + byte_off_i32 = arith.index_cast(T.i32, row_byte_ctx + byte_off_col) + rocdl.raw_ptr_buffer_atomic_fadd( + frag, + out_rsrc, + byte_off_i32, + _splitk_zero_i32, + _splitk_zero_i32, + ) + + _cshuffle_nlane_splitk = min(32, tile_n // _split_k_e_vec) + _splitk_frag_elem = ir.BF16Type.get() if _splitk_out_is_bf16 else ir.F16Type.get() + + # Pass 1: gate (offset=0) + _split_k_acc[0] = acc_gate + _split_k_n_offset[0] = 0 + c_shuffle_epilog( + arith=arith, + vector=vector, + 
gpu=gpu, + scf=scf, + range_constexpr=range_constexpr, + tile_m=tile_m, + tile_n=tile_n, + e_vec=_split_k_e_vec, + cshuffle_nlane=_cshuffle_nlane_splitk, + block_size=total_threads, + m_repeat=m_repeat, + num_acc_n=num_acc_n, + tx=tx, + lane_div_16=lane_div_16, + lane_mod_16=lane_mod_16, + bx_m=bx_m, + by_n=by_n, + n_tile_base=n_tile_base, + lds_out=lds_out, + frag_elem_type=_splitk_frag_elem, + write_row_to_lds=write_row_to_lds_splitk, + precompute_row=precompute_row_splitk, + store_pair=store_pair_splitk, + ) + + gpu.barrier() + + # Pass 2: up (offset=inter_dim) + _split_k_acc[0] = acc_up + _split_k_n_offset[0] = inter_dim + c_shuffle_epilog( + arith=arith, + vector=vector, + gpu=gpu, + scf=scf, + range_constexpr=range_constexpr, + tile_m=tile_m, + tile_n=tile_n, + e_vec=_split_k_e_vec, + cshuffle_nlane=_cshuffle_nlane_splitk, + block_size=total_threads, + m_repeat=m_repeat, + num_acc_n=num_acc_n, + tx=tx, + lane_div_16=lane_div_16, + lane_mod_16=lane_mod_16, + bx_m=bx_m, + by_n=by_n, + n_tile_base=n_tile_base, + lds_out=lds_out, + frag_elem_type=_splitk_frag_elem, + write_row_to_lds=write_row_to_lds_splitk, + precompute_row=precompute_row_splitk, + store_pair=store_pair_splitk, + ) + return + + def write_row_to_lds( + *, + mi: int, + ii: int, + row_in_tile, + row, + row_base_lds, + col_base_local, + num_acc_n: int, + lds_out, + ): + # Blockscale: dequant already done in compute_tile_bs_s1. + # Just apply silu + optional sorted weight. 
+ if const_expr(doweight_stage1): + tw = buffer_ops.buffer_load(sorted_w_rsrc, row, vec_width=1, dtype=T.f32) + + for ni in range_constexpr(num_acc_n): + col_local = col_base_local + (ni * 16) + + acc_idx = mi * num_acc_n + ni + vg = vector.extract(acc_gate[acc_idx], static_position=[ii], dynamic_position=[]) + vu = vector.extract(acc_up[acc_idx], static_position=[ii], dynamic_position=[]) + + y = silu(vg) * vu + if const_expr(doweight_stage1): + y = y * tw + y16 = arith.trunc_f(T.f16, y) + + lds_idx = row_base_lds + col_local + v1 = vector.from_elements(T.vec(1, T.f16), [y16]) + vector.store(v1, lds_out, [lds_idx], alignment=2) + + def precompute_row(*, row_local, row): + fused2 = buffer_ops.buffer_load(sorted_rsrc, row, vec_width=1, dtype=T.i32) + t2 = fused2 & mask24_i32 + s2 = fused2 >> 24 + return (t2 * topk_i32_v + s2) * inter_i32_local + + def store_pair(*, row_local, row, row_ctx, col_pair0, col_g0, frag): + # Guard against sentinel token ids (t == tokens) produced by aiter moe_sorting padding. + # OOB buffer stores are not guaranteed to be safe on all paths, so predicate explicitly. + fused2 = buffer_ops.buffer_load(sorted_rsrc, row, vec_width=1, dtype=T.i32) + t2 = fused2 & mask24_i32 + t_valid = arith.cmpi(arith.CmpIPredicate.ult, t2, tokens_i32_v) + _if_valid = scf.IfOp(t_valid) + with _if_then(_if_valid): + idx0 = row_ctx + col_i32 = arith.index_cast(T.i32, col_g0) + idx_out = idx0 + col_i32 + # Vectorized fp16 store (EVec=4). 
+ buffer_ops.buffer_store(frag, out_rsrc, idx_out) + + mfma_epilog( + use_cshuffle=True, + arith=arith, + vector=vector, + gpu=gpu, + scf=scf, + range_constexpr=range_constexpr, + tile_m=tile_m, + tile_n=tile_n, + e_vec=4, + m_repeat=m_repeat, + num_acc_n=num_acc_n, + tx=tx, + lane_div_16=lane_div_16, + lane_mod_16=lane_mod_16, + bx_m=bx_m, + by_n=by_n, + n_tile_base=n_tile_base, + lds_out=lds_out, + write_row_to_lds=write_row_to_lds, + precompute_row=precompute_row, + store_pair=store_pair, + ) + return + + def _stage1_store_row(*, mi: int, ii: int, row_in_tile, row): + # Blockscale: dequant already done in compute_tile_bs_s1. + fused2 = buffer_ops.buffer_load(sorted_rsrc, row, vec_width=1, dtype=T.i32) + t2 = fused2 & mask24_i32 + s2 = fused2 >> 24 + t_valid = arith.cmpi(arith.CmpIPredicate.ult, t2, tokens_i32_v) + + # out linear index base = ((t*topk + s)*inter_dim) (invariant across ni) + idx0 = (t2 * topk_i32_v + s2) * inter_i32_local + + # Sorted weight aligned with `row` (matches aiter moe_sorting output). 
+ if const_expr(doweight_stage1): + tw = buffer_ops.buffer_load(sorted_w_rsrc, row, vec_width=1, dtype=T.f32) + + _if_valid = scf.IfOp(t_valid) + with _if_then(_if_valid): + for ni in range_constexpr(num_acc_n): + col_i32 = col_i32_list[ni] + + acc_idx = mi * num_acc_n + ni + vg = vector.extract(acc_gate[acc_idx], static_position=[ii], dynamic_position=[]) + vu = vector.extract(acc_up[acc_idx], static_position=[ii], dynamic_position=[]) + + y = silu(vg) * vu + if const_expr(doweight_stage1): + y = y * tw + y = arith.trunc_f(out_mlir(), y) + idx_out0 = idx0 + col_i32 + buffer_ops.buffer_store(y, out_rsrc, idx_out0) + + mfma_epilog( + use_cshuffle=False, + arith=arith, + range_constexpr=range_constexpr, + m_repeat=m_repeat, + lane_div_16=lane_div_16, + bx_m=bx_m, + body_row=_stage1_store_row, + ) + + # ── Host launcher (flyc.jit + .launch) ──────────────────────────────── + @flyc.jit + def launch_moe_blockscale_gemm1( + arg_out: fx.Tensor, + arg_x: fx.Tensor, + arg_w: fx.Tensor, + arg_scale_x: fx.Tensor, + arg_scale_w: fx.Tensor, + arg_sorted_token_ids: fx.Tensor, + arg_expert_ids: fx.Tensor, + arg_sorted_weights: fx.Tensor, + arg_max_token_ids: fx.Tensor, + i32_tokens_in: fx.Int32, + i32_inter_in: fx.Int32, + i32_k_in: fx.Int32, + i32_size_expert_ids_in: fx.Int32, + stream: fx.Stream, + ): + allocator.finalized = False + ctx = CompilationContext.get_current() + with ir.InsertionPoint(ctx.gpu_module_body): + allocator.finalize() + + inter_in = arith.index_cast(T.index, i32_inter_in) + size_expert_ids_in = arith.index_cast(T.index, i32_size_expert_ids_in) + gx = inter_in // fx.Index(tile_n) + gy = size_expert_ids_in + + moe_blockscale_gemm1( + arg_out, + arg_x, + arg_w, + arg_scale_x, + arg_scale_w, + arg_sorted_token_ids, + arg_expert_ids, + arg_sorted_weights, + arg_max_token_ids, + i32_tokens_in, + i32_inter_in, + i32_k_in, + i32_size_expert_ids_in, + value_attrs={"rocdl.waves_per_eu": waves_per_eu}, + ).launch(grid=(gx, gy, k_batch), block=(256, 1, 1), 
stream=stream) + + return launch_moe_blockscale_gemm1 + + +@functools.lru_cache(maxsize=1024) +def compile_moe_blockscale_gemm2( + *, + model_dim: int, + inter_dim: int, + experts: int, + topk: int, + tile_m: int, + tile_n: int, + tile_k: int, + doweight_stage2: bool, + scale_block_k: int = 128, + out_dtype: str = "f16", + use_cshuffle_epilog: bool | None = None, + # Optional experiment: write per-(token,slot) output (no atomics) into an output shaped + # [tokens*topk, model_dim] (or [tokens, topk, model_dim] flattened), then reduce over topk outside. + # This can reduce atomic contention for small tokens at the cost of extra bandwidth / reduction. + accumulate: bool = True, + waves_per_eu: int | None = None, +): + """Compile stage2 kernel (`moe_gemm2`) and return the compiled executable. + + in_dtype: + - "fp8": A2/W are fp8 + - "fp16": A2/W are fp16 + - "int8": A2/W are int8 + - "int4": W4A8 path: A2 is int8, W is packed int4 unpacked to int8 in-kernel + + Stage2 output supports: + - out_dtype="f16": fp16 half2 atomics (fast, can overflow to +/-inf for bf16 workloads) + - out_dtype="f32": fp32 scalar atomics (slower, but avoids fp16 atomic overflow) + + `use_cshuffle_epilog` controls whether we use the LDS CShuffle epilogue before + global atomics (recommended for performance). 
+ """ + gpu_arch = get_hip_arch() + _is_gfx950 = str(gpu_arch).startswith("gfx95") + allocator = SmemAllocator(None, arch=gpu_arch) + _state = {} + + in_dtype = "fp8" # blockscale is FP8-only + is_f16 = in_dtype == "fp16" + elem_bytes = 2 if is_f16 else 1 + out_s = str(out_dtype).strip().lower() + if out_s not in ("f16", "fp16", "half", "bf16", "bfloat16", "f32", "fp32", "float"): + raise ValueError(f"out_dtype must be 'f16', 'bf16', or 'f32', got {out_dtype!r}") + out_is_f32 = out_s in ("f32", "fp32", "float") + out_is_bf16 = out_s in ("bf16", "bfloat16") + if (not bool(accumulate)) and out_is_f32: + raise ValueError("compile_moe_blockscale_gemm2(accumulate=False) only supports out_dtype in {'f16','bf16'}") + is_int4 = in_dtype == "int4" + # INT4 here means W4A8: A2 is int8, W is packed int4 and unpacked to int8 in-kernel. + is_int8 = (in_dtype in ("int8", "int8smooth")) or is_int4 + + # Blockscale compile-time constants (K=inter_dim for stage2) + if inter_dim % scale_block_k != 0: + raise ValueError(f"inter_dim ({inter_dim}) must be divisible by scale_block_k ({scale_block_k})") + if model_dim % 128 != 0: + raise ValueError(f"model_dim ({model_dim}) must be divisible by 128 (ScaleBlockN)") + sb_per_tile_s2 = tile_k // scale_block_k # scale blocks per tile (in K dim) + ku_per_sb_s2 = scale_block_k // 64 # K64-steps per scale block = 2 + nblk_k_w2 = inter_dim // scale_block_k # K-blocks in W2 (=scale_k) + model_dim // 128 # N-blocks in W2 (ScaleBlockN=128) + + mfma_i32_k32 = None + if is_int8: + mfma_i32_k32 = getattr(rocdl, "mfma_i32_16x16x32i8", None) or getattr(rocdl, "mfma_i32_16x16x32_i8", None) + if mfma_i32_k32 is None: + raise AttributeError( + "INT8 K32 MFMA op not found: expected `rocdl.mfma_i32_16x16x32i8` (or `rocdl.mfma_i32_16x16x32_i8`)." + ) + + ir.ShapedType.get_dynamic_size() + # W is packed int4 for W4A8: 2 values per byte. 
+ (experts * model_dim * inter_dim) // 2 if is_int4 else (experts * model_dim * inter_dim) + + total_threads = 256 + tile_k_bytes = int(tile_k) * int(elem_bytes) + if (tile_k_bytes % 64) != 0: + raise ValueError( + f"tile_k_bytes must be divisible by 64, got tile_k_bytes={tile_k_bytes} " + f"(tile_k={tile_k}, elem_bytes={elem_bytes})" + ) + bytes_x_per_tile = int(tile_m) * int(tile_k) * int(elem_bytes) + if bytes_x_per_tile % total_threads != 0: + raise ValueError( + "tile_m*tile_k*elem_bytes must be divisible by " + f"{total_threads}: tile_m={tile_m}, tile_k={tile_k}, elem_bytes={elem_bytes}" + ) + bytes_per_thread_x = bytes_x_per_tile // total_threads + + _ck_lds128 = os.environ.get("FLYDSL_CK_LDS128", "1") in ("1", "true", "True", "YES", "yes") + pad_k = 0 if _ck_lds128 else 8 + lds_stride = tile_k + pad_k + # gfx950+ has buffer_atomic_pk_add_bf16 → bf16 can use buffer atomics (same as f16). + # gfx942 only has global_atomic_pk_add_bf16 → must use global atomics with raw pointer. + _has_buffer_atomic_bf16 = str(gpu_arch).startswith(("gfx95", "gfx12")) + _needs_global_atomic_bf16 = out_is_bf16 and not _has_buffer_atomic_bf16 + if out_is_bf16: + if not (gpu_arch.startswith("gfx942") or gpu_arch.startswith("gfx950") or gpu_arch.startswith("gfx12")): + raise ValueError( + f"out_dtype='bf16' requires bf16 global atomics (gfx942/gfx950/gfx12), got arch={gpu_arch!r}" + ) + + if out_is_f32: + # Match origin/dev_a16w4: f32 output uses scalar atomics and does NOT use the CShuffle epilogue. 
+ _use_cshuffle_epilog = False if use_cshuffle_epilog is None else bool(use_cshuffle_epilog) + if _use_cshuffle_epilog: + raise ValueError("out_dtype='f32' does not support CShuffle epilogue (set use_cshuffle_epilog=False).") + else: + if use_cshuffle_epilog is None: + _use_cshuffle_epilog = os.environ.get("FLYDSL_MOE_STAGE2_CSHUFFLE", "1") in ( + "1", + "true", + "True", + "YES", + "yes", + ) + else: + _use_cshuffle_epilog = bool(use_cshuffle_epilog) + if not _use_cshuffle_epilog: + raise ValueError("stage2 f16 output currently requires CShuffle epilogue (FLYDSL_MOE_STAGE2_CSHUFFLE=1).") + + # NOTE: Keep this as a callable so we don't require an MLIR Context at Python-time. + def out_elem(): + ty = T.f32 if out_is_f32 else (T.bf16 if out_is_bf16 else T.f16) + return ty() if callable(ty) else ty + + epilog_tag = "cshuffle" + # IMPORTANT: include tiling in the module name to avoid accidentally reusing a compiled + # binary for a different (tile_m, tile_n, tile_k) configuration. + # See stage1 note: include ABI tag to prevent binary reuse across signature changes. + # IMPORTANT: module name participates in FlyDSL's compile cache key. + # Dynamic-shape variant: safe to reuse across (tokens/sorted_size/size_expert_ids) at runtime. + # Keep a distinct ABI tag so the compile cache never mixes with historical signatures. 
+ _wpe_tag2 = f"_wpe{waves_per_eu}" if waves_per_eu is not None else "" + module_name = ( + f"mfma_moe2_{in_dtype}_{out_s}_{epilog_tag}" + f"_t{tile_m}x{tile_n}x{tile_k}{_wpe_tag2}" + f"_abi6" # scale prefetch before VMEM tile loads + ).replace("-", "_") + + # ── LDS sizing (pure Python; no MLIR Context needed) ───────────────────── + lds_x_bytes = 2 * int(tile_m) * int(lds_stride) * int(elem_bytes) + lds_out_bytes = 2 * int(tile_m) * int(tile_n) if _use_cshuffle_epilog else 0 + lds_total_bytes = max(lds_x_bytes, lds_out_bytes) + lds_total_elems = lds_total_bytes if elem_bytes == 1 else (lds_total_bytes // 2) + + lds_alloc_bytes = int(lds_total_elems) * int(elem_bytes) + lds_alloc_offset = allocator._align(allocator.ptr, 16) + allocator.ptr = lds_alloc_offset + lds_alloc_bytes + + _cshuffle_nlane = 32 + if bool(accumulate): + _e_vec = 2 + else: + _e_vec = 8 if int(tile_n) % (_cshuffle_nlane * 8) == 0 else 2 + _cshuffle_stride = _cshuffle_nlane * _e_vec + if int(tile_n) % _cshuffle_stride != 0: + raise ValueError(f"tile_n={tile_n} must be divisible by {_cshuffle_stride} when accumulate=False") + + if True: + + @flyc.kernel(name=module_name) + def moe_blockscale_gemm2( + arg_out: fx.Tensor, + arg_x: fx.Tensor, + arg_w: fx.Tensor, + arg_scale_x: fx.Tensor, + arg_scale_w: fx.Tensor, + arg_sorted_token_ids: fx.Tensor, + arg_expert_ids: fx.Tensor, + arg_sorted_weights: fx.Tensor, + arg_num_valid_ids: fx.Tensor, + i32_tokens_in: fx.Int32, + i32_n_in: fx.Int32, + i32_k_in: fx.Int32, + i32_size_expert_ids_in: fx.Int32, + ): + tokens_in = arith.index_cast(T.index, i32_tokens_in) + n_in = arith.index_cast(T.index, i32_n_in) + k_in = arith.index_cast(T.index, i32_k_in) + size_expert_ids_in = arith.index_cast(T.index, i32_size_expert_ids_in) + k_i32_v = i32_k_in + x_elem = T.f16 if is_f16 else (T.i8 if is_int8 else T.f8) + # For int4, weights are stored as packed bytes (i8) and unpacked to i8 packs. 
+ w_elem = T.f16 if is_f16 else (T.i8 if is_int8 else T.f8) + vec16_elems = 16 if elem_bytes == 1 else 8 + vec8_elems = 8 if elem_bytes == 1 else 4 + vec8_x = T.vec(vec8_elems, x_elem) + vec16_x = T.vec(vec16_elems, x_elem) + + acc_init = arith.constant_vector(0, T.i32x4) if is_int8 else arith.constant_vector(0.0, T.f32x4) + + # A2 layout (flatten token-slot -> M). + topk_idx = fx.Index(topk) + m_in = tokens_in * topk_idx + m_i32_v = arith.index_cast(T.i32, m_in) + fx.make_layout((m_i32_v, k_i32_v), stride=(k_i32_v, 1)) + + # B preshuffle layout: [experts*model_dim, inter_dim] + c_n_total = arith.index(experts * model_dim) + kpack_bytes = 8 if is_int4 else 16 + b_layout = make_preshuffle_b_layout( + arith, c_n=c_n_total, c_k=k_in, kpack_bytes=kpack_bytes, elem_bytes=elem_bytes + ) + layout_b = b_layout.layout_b + (k_in * arith.index(int(elem_bytes))) // fx.Index(64) + + shape_lds = fx.make_shape(tile_m, tile_k) + stride_lds = fx.make_stride(lds_stride, 1) + layout_lds = fx.make_layout(shape_lds, stride_lds) + + tx = gpu.thread_id("x") + # Align with Aiter launch mapping: + # - blockIdx.x -> N dimension (tile along model_dim) + # - blockIdx.y -> expert-block id / M dimension (tile along sorted M) + by = gpu.block_id("x") # tile along model_dim + bx = gpu.block_id("y") # tile along sorted M + + # XOR16 swizzle parameter (in bytes; constant, power-of-two in our configs). + k_blocks16 = arith.index(tile_k_bytes // 16) + layout_tx_wave_lane = fx.make_layout((4, 64), stride=(64, 1)) + layout_lane16 = fx.make_layout((4, 16), stride=(16, 1)) + fx.make_layout((tile_m, tile_k), stride=(tile_k, 1)) + + base_ptr = allocator.get_base() + lds_x_ptr = SmemPtr( + base_ptr, + lds_alloc_offset, + (T.f16 if is_f16 else (T.i8 if is_int8 else T.f8)), + shape=(lds_total_elems,), + ) + lds_x = lds_x_ptr.get() + # Alias the same underlying LDS bytes as f16/bf16 for epilogue shuffle. 
+ lds_out = ( + SmemPtr( + base_ptr, + lds_x_ptr.byte_offset, + (T.bf16 if out_is_bf16 else T.f16), + shape=(tile_m * tile_n,), + ).get() + if _use_cshuffle_epilog + else None + ) + + # Buffer resources. + # For dynamic memrefs, `max_size=False` cannot infer the logical size from the memref *type*, + # so we should pass `num_records_bytes` explicitly for stable hardware OOB behavior. + c_topk = fx.Index(topk) + + # X(A2): [tokens*topk, inter_dim] bytes = tokens*topk*k*elem_bytes + x_nbytes_idx = (tokens_in * c_topk) * k_in * arith.index(int(elem_bytes)) + x_rsrc = buffer_ops.create_buffer_resource( + arg_x, max_size=False, num_records_bytes=arith.index_cast(T.i64, x_nbytes_idx) + ) + + w_rsrc = buffer_ops.create_buffer_resource(arg_w, max_size=False) + + # OUT: [tokens, model_dim] -> clamp to descriptor max (i32 bytes) to avoid overflow on huge tokens. + out_elem_bytes = 4 if out_is_f32 else 2 + out_nbytes_idx = tokens_in * n_in * fx.Index(out_elem_bytes) + if const_expr(not bool(accumulate)): + out_nbytes_idx = tokens_in * fx.Index(topk) * n_in * fx.Index(out_elem_bytes) + out_rsrc = buffer_ops.create_buffer_resource( + arg_out, max_size=False, num_records_bytes=arith.index_cast(T.i64, out_nbytes_idx) + ) + # fp16 path ignores scales completely (implicit scale=1.0). 
+ sx_rsrc = -1 + sw_rsrc = -1 + if const_expr(not is_f16): + # scale_x (A2 scale): [nblk_k_w2, tokens*topk] f32 transposed -> total = nblk_k_w2 * tokens * topk + sx_nbytes_idx = arith.index(nblk_k_w2) * (tokens_in * c_topk) * fx.Index(4) + sx_rsrc = buffer_ops.create_buffer_resource( + arg_scale_x, max_size=False, num_records_bytes=arith.index_cast(T.i64, sx_nbytes_idx) + ) + # scale_w: [experts*model_dim] f32 (static shape in practice) + sw_rsrc = buffer_ops.create_buffer_resource(arg_scale_w, max_size=False) + + # sorted_token_ids / sorted_weights: [blocks*tile_m] (CK-style padded length) + sorted_nbytes_idx = size_expert_ids_in * fx.Index(tile_m) * fx.Index(4) + sorted_nbytes_i64 = arith.index_cast(T.i64, sorted_nbytes_idx) + sorted_rsrc = buffer_ops.create_buffer_resource( + arg_sorted_token_ids, max_size=False, num_records_bytes=sorted_nbytes_i64 + ) + sorted_w_rsrc = buffer_ops.create_buffer_resource( + arg_sorted_weights, max_size=False, num_records_bytes=sorted_nbytes_i64 + ) + + # expert ids: [blocks] i32 -> bytes = size_expert_ids_in*4 + eid_nbytes_idx = size_expert_ids_in * fx.Index(4) + expert_rsrc = buffer_ops.create_buffer_resource( + arg_expert_ids, max_size=False, num_records_bytes=arith.index_cast(T.i64, eid_nbytes_idx) + ) + bx_m = bx * fx.Index(tile_m) + + # Early-exit guard (as in 2ce65fb): some routing paths can produce extra/garbage + # expert blocks beyond `num_valid_ids`. Skip those blocks entirely to avoid OOB. + numids_rsrc = buffer_ops.create_buffer_resource( + arg_num_valid_ids, max_size=False, num_records_bytes=fx.Index(4) + ) + num_valid_i32 = buffer_ops.buffer_load(numids_rsrc, fx.Index(0), vec_width=1, dtype=T.i32) + bx_m_i32 = arith.index_cast(T.i32, bx_m) + blk_valid = arith.cmpi(arith.CmpIPredicate.ult, bx_m_i32, num_valid_i32) + + def _moe_gemm2_then_body(): + # Expert id for this M tile. 
+ expert_i32 = buffer_ops.buffer_load(expert_rsrc, bx, vec_width=1, dtype=T.i32) + expert_idx = arith.index_cast(T.index, expert_i32) + n_idx = fx.Index(model_dim) + expert_off_idx = expert_idx * n_idx # index + + # ---- X gmem->reg prefetch (match preshuffle GEMM mapping) ---- + # Prefer 16B buffer-load (dwordx4). If the per-thread byte count isn't divisible by + # 16, fall back to 8B (dwordx2) or 4B (dword) loads. For fp16 we require 16B. + x_load_bytes = 0 + if const_expr(is_f16): + if const_expr(bytes_per_thread_x % 16 != 0): + raise ValueError(f"[fp16] bytes_per_thread_x ({bytes_per_thread_x}) must be divisible by 16") + x_load_bytes = 16 + else: + if const_expr(bytes_per_thread_x % 16 == 0): + x_load_bytes = 16 + elif const_expr(bytes_per_thread_x % 8 == 0): + x_load_bytes = 8 + elif const_expr(bytes_per_thread_x % 4 == 0): + x_load_bytes = 4 + else: + raise ValueError( + f"bytes_per_thread_x ({bytes_per_thread_x}) must be divisible by 4 to use the dword-indexed load mapping." + ) + num_x_loads = bytes_per_thread_x // x_load_bytes + chunk_i32 = x_load_bytes // 4 # dwords per chunk (1/2/4) + + c_k_div4 = (k_in * arith.index(int(elem_bytes))) // fx.Index(4) + c_k_div4_i32 = arith.index_cast(T.i32, c_k_div4) + fx.make_layout((m_i32_v, c_k_div4_i32), stride=(c_k_div4_i32, 1)) + tile_k_dwords = (int(tile_k) * int(elem_bytes)) // 4 + layout_x_tile_div4 = fx.make_layout((tile_m, tile_k_dwords), stride=(tile_k_dwords, 1)) + c_chunk_i32 = fx.Index(chunk_i32) + tx_i32_base = tx * c_chunk_i32 + + topk_i32 = fx.Int32(topk) + mask24 = fx.Int32(0xFFFFFF) + # Sentinel clamp uses `tokens` as the upper bound: t_valid = (t < tokens). 
+ tokens_i32 = arith.index_cast(T.i32, tokens_in) + + def x_tile_chunk_coord_i32(i: int): + return tile_chunk_coord_i32( + arith, + tx_i32_base=tx_i32_base, + i=i, + total_threads=total_threads, + layout_tile_div4=layout_x_tile_div4, + chunk_i32=chunk_i32, + ) + + T.vec(1, T.i32) + T.vec(2, T.i32) + vec4_x = T.vec(4, x_elem) + + def load_x(idx_i32, x_load_bytes_v): + if const_expr(x_load_bytes_v == 16): + idx_elem = idx_i32 if elem_bytes == 1 else (idx_i32 * fx.Index(2)) + return buffer_copy_gmem16_dwordx4( + buffer_ops, + vector, + elem_type=x_elem, + idx_i32=idx_elem, + rsrc=x_rsrc, + vec_elems=vec16_elems, + elem_bytes=elem_bytes, + ) + if const_expr(x_load_bytes_v == 8): + return buffer_ops.buffer_load(x_rsrc, idx_i32, vec_width=2, dtype=T.i32) + return buffer_ops.buffer_load(x_rsrc, idx_i32, vec_width=1, dtype=T.i32) + + # decode routed token once (per thread's M-slice) and build a base offset. + x_row_base_div4 = [] + x_col_local_i32 = [] + x_row_local = [] + for i in range_constexpr(num_x_loads): + row_local, col_local_i32 = x_tile_chunk_coord_i32(i) + x_row_local.append(row_local) + x_col_local_i32.append(col_local_i32) + + sorted_row_i = bx_m + row_local + fused_i = buffer_ops.buffer_load(sorted_rsrc, sorted_row_i, vec_width=1, dtype=T.i32) + t_i32 = fused_i & mask24 + s_i32 = fused_i >> 24 + # aiter moe_sorting uses sentinel token_id == tokens for padding. + # Do NOT rely on buffer OOB semantics for A2/scale loads; explicitly mask. 
+ t_valid = arith.cmpi(arith.CmpIPredicate.ult, t_i32, tokens_i32) + s_valid = arith.cmpi(arith.CmpIPredicate.ult, s_i32, topk_i32) + ts_valid = t_valid & s_valid + t_safe = ts_valid.select(t_i32, fx.Int32(0)) + s_safe = ts_valid.select(s_i32, fx.Int32(0)) + row_ts_i32 = t_safe * topk_i32 + s_safe + row_ts_idx = arith.index_cast(T.index, row_ts_i32) + # Base row offset in dword units: row_ts_idx * (k_in/4) + x_row_base_div4.append(row_ts_idx * c_k_div4) + + def load_x_tile(base_k, x_load_bytes_v): + base_k_div4 = (base_k * arith.index(int(elem_bytes))) // fx.Index(4) + parts = [] + for i in range_constexpr(num_x_loads): + idx_i32 = x_row_base_div4[i] + base_k_div4 + x_col_local_i32[i] + x_vec = load_x(idx_i32, x_load_bytes_v) + if const_expr(x_load_bytes_v == 16): + parts.append(vector.bitcast(T.i32x4, x_vec)) + elif const_expr(x_load_bytes_v == 8): + parts.append(x_vec) + else: + parts.append(x_vec) + return parts + + # tx -> wave/lane (GEMM-style decomposition). + coord_wl = fx.idx2crd(tx, layout_tx_wave_lane) + wave_id = fx.get(coord_wl, 0) + lane_id = fx.get(coord_wl, 1) + coord_l16 = fx.idx2crd(lane_id, layout_lane16) + lane_div_16 = fx.get(coord_l16, 0) + lane_mod_16 = fx.get(coord_l16, 1) + + row_a_lds = lane_mod_16 + a_kpack_elems = 16 // elem_bytes + col_offset_base = lane_div_16 * arith.index(int(a_kpack_elems)) + col_offset_base_bytes = ( + col_offset_base if elem_bytes == 1 else (col_offset_base * arith.index(int(elem_bytes))) + ) + + # Dynamic N tiling within block. + by_n = by * fx.Index(tile_n) + num_waves = 4 + n_per_wave = tile_n // num_waves + num_acc_n = n_per_wave // 16 + c_n_per_wave = fx.Index(n_per_wave) + wave_mod_4 = wave_id % fx.Index(4) + n_tile_base = wave_mod_4 * c_n_per_wave + + # Precompute (n_blk, n_intra) for B, and col indices for output. 
+ n_intra_list = [] + n_blk_list = [] + col_g_list = [] + c_n0 = c_n_total // fx.Index(16) + c_n0_i32 = arith.index_cast(T.i32, c_n0) + layout_n_blk_intra = fx.make_layout((c_n0_i32, 16), stride=(16, 1)) + for ni in range_constexpr(num_acc_n): + offset = arith.index(ni * 16) + col_g = by_n + n_tile_base + offset + lane_mod_16 + col_g_list.append(col_g) + + row_w = expert_off_idx + col_g + coord_w = fx.idx2crd(row_w, layout_n_blk_intra) + n_blk_list.append(fx.get(coord_w, 0)) + n_intra_list.append(fx.get(coord_w, 1)) + + m_repeat = tile_m // 16 + k_unroll = tile_k_bytes // 64 # K64-byte micro-step (2x MFMA) + + # --- B Load Logic (K64) --- + def load_b_pack(base_k, ki_step, ni): + return load_b_pack_k32( + buffer_ops, + arith, + vector, + arg_b=arg_w, + b_rsrc=w_rsrc, + layout_b=layout_b, + base_k=base_k, + ki_step=ki_step, + n_blk=n_blk_list[ni], + n_intra=n_intra_list[ni], + lane_div_16=lane_div_16, # 0..3 + elem_type=w_elem, + kpack_bytes=kpack_bytes, + elem_bytes=elem_bytes, + unpack_int4=is_int4, + ) + + def load_b_tile(base_k): + """Prefetch the entire per-thread B tile (gmem -> regs) for a given K base. + + Returns a list of length `k_unroll`, where each entry is a tuple: + (packs_half0[ni], packs_half1[ni]) for the K64 micro-step. 
+ """ + b_tile = [] + for ku in range_constexpr(k_unroll): + packs0 = [] + packs1 = [] + for ni in range_constexpr(num_acc_n): + ki0 = (ku * 2) + 0 + ki1 = (ku * 2) + 1 + b0 = load_b_pack(base_k, ki0, ni) + b1 = load_b_pack(base_k, ki1, ni) + packs0.append(b0) + packs1.append(b1) + b_tile.append((packs0, packs1)) + return b_tile + + # ---- Pipeline helpers: store X tile to LDS with ping-pong base ---- + def store_x_tile_to_lds(vec_x_in_parts, lds_base, x_load_bytes_v): + for i in range_constexpr(num_x_loads): + row_local = x_row_local[i] + col_local_i32 = x_col_local_i32[i] + if const_expr(x_load_bytes_v == 16): + lds_store_16b_xor16( + arith, + vector, + lds_memref=lds_x, + vec16_ty=vec16_x, + layout_lds=layout_lds, + row_local=row_local, + col_local_i32=col_local_i32, + tx_c4=fx.Index(4), + k_blocks16=k_blocks16, + lds_base=lds_base, + vec_part_i32x4=vec_x_in_parts[i], + elem_bytes=elem_bytes, + ) + elif const_expr(x_load_bytes_v == 8): + lds_store_8b_xor16( + arith, + vector, + lds_memref=lds_x, + vec8_ty=vec8_x, + layout_lds=layout_lds, + row_local=row_local, + col_local_i32=col_local_i32, + tx_c4=fx.Index(4), + k_blocks16=k_blocks16, + lds_base=lds_base, + vec_part_i32x2=vec_x_in_parts[i], + ) + else: + lds_store_4b_xor16( + arith, + vector, + lds_memref=lds_x, + vec4_ty=vec4_x, + layout_lds=layout_lds, + row_local=row_local, + col_local_i32=col_local_i32, + tx_c4=fx.Index(4), + k_blocks16=k_blocks16, + lds_base=lds_base, + vec_part_i32x1=vec_x_in_parts[i], + ) + + # --- A LDS load helper for K64 (load 16B once, extract 2x i64 halves) --- + def lds_load_packs_k64(curr_row_a_lds, col_base_bytes, lds_base): + col_base_swz_bytes = swizzle_xor16(curr_row_a_lds, col_base_bytes, k_blocks16) + col_base_swz = ( + col_base_swz_bytes if elem_bytes == 1 else (col_base_swz_bytes // arith.index(int(elem_bytes))) + ) + idx_a16 = crd2idx((curr_row_a_lds, col_base_swz), layout_lds) + idx_a16 = idx_a16 + lds_base + loaded_a16 = vector.load_op(vec16_x, lds_x, [idx_a16]) + 
a_i64x2 = vector.bitcast(T.i64x2, loaded_a16) + a0 = vector.extract(a_i64x2, static_position=[0], dynamic_position=[]) + a1 = vector.extract(a_i64x2, static_position=[1], dynamic_position=[]) + return a0, a1 + + # --- Blockscale pre-decode and helpers (stage2) --- + c_scale_block_k_s2 = fx.Index(scale_block_k) + c_128_s2 = fx.Index(128) + c_nblk_k_w2 = fx.Index(nblk_k_w2) + row_off_base_s2 = lane_div_16 * fx.Index(4) + fx.Index(model_dim) + + # Pre-decode sorted token IDs for stage2 (constant across all K-tiles). + # OOB buffer loads return 0, so no validity masking needed for scale values. + _pre_ts_safe_i32_s2 = [] + for _mi in range_constexpr(m_repeat): + _mi_safe = [] + for _ii in range_constexpr(4): + _row_in_tile = arith.index(_mi * 16) + row_off_base_s2 + fx.Index(_ii) + _sorted_row = bx_m + _row_in_tile + _fused_pre = buffer_ops.buffer_load(sorted_rsrc, _sorted_row, vec_width=1, dtype=T.i32) + _t_id_pre = _fused_pre & mask24 + _s_id_pre = _fused_pre >> 24 + _t_valid_pre = arith.cmpi(arith.CmpIPredicate.ult, _t_id_pre, tokens_i32) + _s_valid_pre = arith.cmpi(arith.CmpIPredicate.ult, _s_id_pre, topk_i32) + _ts_valid_pre = _t_valid_pre & _s_valid_pre + _t_safe_pre = _ts_valid_pre.select(_t_id_pre, fx.Int32(0)) + _s_safe_pre = _ts_valid_pre.select(_s_id_pre, fx.Int32(0)) + _ts_i32_pre = _t_safe_pre * topk_i32 + _s_safe_pre + _mi_safe.append(_ts_i32_pre) + _pre_ts_safe_i32_s2.append(_mi_safe) + + # Pre-compute N-block indices for scale_w (constant per CTA) + _pre_n_block_s2 = [] + for _ni in range_constexpr(num_acc_n): + _col_base_ni_pre = by_n + n_tile_base + arith.index(_ni * 16) + _pre_n_block_s2.append((expert_off_idx + _col_base_ni_pre) // c_128_s2) + + m_in_s2 = tokens_in * fx.Index(topk) + + def load_scales_s2(k_base): + all_combined = [] + for sb in range_constexpr(sb_per_tile_s2): + kb = k_base // c_scale_block_k_s2 + fx.Index(sb) + sa_base_offset = kb * m_in_s2 + + s_a_vecs = [] + sa_base_i32 = arith.index_cast(T.i32, sa_base_offset) + for mi in 
range_constexpr(m_repeat): + s_a_row = [] + for ii in range_constexpr(4): + ts_safe_i32 = _pre_ts_safe_i32_s2[mi][ii] + sa_idx_i32 = sa_base_i32 + ts_safe_i32 + sa_idx = arith.index_cast(T.index, sa_idx_i32) + s_a_val = buffer_ops.buffer_load(sx_rsrc, sa_idx, vec_width=1, dtype=T.f32) + s_a_row.append(s_a_val) + s_a_vecs.append(s_a_row) + + _sw_shared_n_s2 = n_per_wave <= 128 + s_w_vals = [] + s_w = arith.constant(1.0, type=T.f32) + for ni in range_constexpr(num_acc_n): + if const_expr(ni == 0 or not _sw_shared_n_s2): + sw_idx = _pre_n_block_s2[ni] * c_nblk_k_w2 + kb + s_w = buffer_ops.buffer_load(sw_rsrc, sw_idx, vec_width=1, dtype=T.f32) + s_w_vals.append(s_w) + + s_a_vec4_list = [] + for mi in range_constexpr(m_repeat): + s_a_vec4_list.append(vector.from_elements(T.f32x4, s_a_vecs[mi])) + all_combined.append((s_a_vec4_list, s_w_vals)) + return all_combined + + def compute_tile_bs_s2(acc_in, b_tile_in, lds_base, pre_scales, *, a0_prefetch=None): + current_acc = list(acc_in) + mfma_res_ty = T.f32x4 + + if const_expr(_is_gfx950): + + def _pack128(x0, x1, x2, x3): + v4 = vector.from_elements(T.vec(4, T.i64), [x0, x1, x2, x3]) + return vector.bitcast(T.vec(8, T.i32), v4) + + for sb in range_constexpr(sb_per_tile_s2): + s_a_vec4_list, s_w_vals = pre_scales[sb] + ku0 = sb * ku_per_sb_s2 + ku1 = ku0 + 1 + b0_p0, b0_p1 = b_tile_in[ku0] + b1_p0, b1_p1 = b_tile_in[ku1] + col0 = col_offset_base_bytes + arith.index(ku0 * 64) + col1 = col_offset_base_bytes + arith.index(ku1 * 64) + for mi in range_constexpr(m_repeat): + curr_row = row_a_lds + arith.index(mi * 16) + a0 = arith.constant(0, type=T.i64) + a1 = arith.constant(0, type=T.i64) + if const_expr(a0_prefetch is not None and sb == 0 and mi == 0): + a0, a1 = a0_prefetch + else: + a0, a1 = lds_load_packs_k64(curr_row, col0, lds_base) + a2, a3 = lds_load_packs_k64(curr_row, col1, lds_base) + a128 = _pack128(a0, a1, a2, a3) + s_a_v4 = s_a_vec4_list[mi] + pending_acc = None + for ni in range_constexpr(num_acc_n): + acc_idx = 
mi * num_acc_n + ni + b128 = _pack128(b0_p0[ni], b0_p1[ni], b1_p0[ni], b1_p1[ni]) + blk = rocdl.mfma_scale_f32_16x16x128_f8f6f4( + mfma_res_ty, [a128, b128, acc_init, 0, 0, 0, 0x7F7F7F7F, 0, 0x7F7F7F7F] + ) + rocdl.sched_barrier(0) + if const_expr(pending_acc is not None): + prev_acc_idx, prev_blk, prev_ni = pending_acc + s_w_bc = vector.broadcast(T.f32x4, s_w_vals[prev_ni]) + scale = ArithValue(s_a_v4) * ArithValue(s_w_bc) + current_acc[prev_acc_idx] = math_dialect.fma( + prev_blk, scale, current_acc[prev_acc_idx] + ) + pending_acc = (acc_idx, blk, ni) + if const_expr(pending_acc is not None): + prev_acc_idx, prev_blk, prev_ni = pending_acc + s_w_bc = vector.broadcast(T.f32x4, s_w_vals[prev_ni]) + scale = ArithValue(s_a_v4) * ArithValue(s_w_bc) + current_acc[prev_acc_idx] = math_dialect.fma( + prev_blk, scale, current_acc[prev_acc_idx] + ) + else: + mfma_fn = ( + mfma_i32_k32 + if const_expr(is_int8) + else (rocdl.mfma_f32_16x16x16f16 if is_f16 else rocdl.mfma_f32_16x16x32_fp8_fp8) + ) + + def _i64_to_v4f16(x_i64): + v1 = vector.from_elements(T.vec(1, T.i64), [x_i64]) + return vector.bitcast(T.f16x4, v1) + + def mfma_k64(acc0, a0, a1, b0, b1): + if const_expr(is_f16): + a0v = _i64_to_v4f16(a0) + a1v = _i64_to_v4f16(a1) + b0v = _i64_to_v4f16(b0) + b1v = _i64_to_v4f16(b1) + acc1 = mfma_fn(mfma_res_ty, [a0v, b0v, acc0, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1v, b1v, acc1, 0, 0, 0]) + acc1 = mfma_fn(mfma_res_ty, [a0, b0, acc0, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1, b1, acc1, 0, 0, 0]) + + for sb in range_constexpr(sb_per_tile_s2): + s_a_vec4_list, s_w_vals = pre_scales[sb] + for mi in range_constexpr(m_repeat): + for ni in range_constexpr(num_acc_n): + acc_idx = mi * num_acc_n + ni + blk = acc_init + for ku_local in range_constexpr(ku_per_sb_s2): + ku = sb * ku_per_sb_s2 + ku_local + b_packs0, b_packs1 = b_tile_in[ku] + ki64 = arith.index(ku * 64) + col_base = col_offset_base_bytes + ki64 + a0 = arith.constant(-1, type=T.i64) + a1 = arith.constant(-1, 
type=T.i64) + if const_expr( + (a0_prefetch is not None) and (sb == 0) and (ku_local == 0) and (mi == 0) + ): + a0, a1 = a0_prefetch + else: + a0, a1 = lds_load_packs_k64( + row_a_lds + arith.index(mi * 16), col_base, lds_base + ) + blk = mfma_k64(blk, a0, a1, b_packs0[ni], b_packs1[ni]) + s_w_bc = vector.broadcast(T.f32x4, s_w_vals[ni]) + scale = ArithValue(s_a_vec4_list[mi]) * ArithValue(s_w_bc) + current_acc[acc_idx] = math_dialect.fma(blk, scale, current_acc[acc_idx]) + return current_acc + + def compute_tile(acc_in, b_tile_in, lds_base, *, prefetch_epilogue: bool = False, a0_prefetch=None): + acc_list = list(acc_in) + mfma_res_ty = T.i32x4 if is_int8 else T.f32x4 + mfma_fn = ( + mfma_i32_k32 + if is_int8 + else (rocdl.mfma_f32_16x16x16f16 if is_f16 else rocdl.mfma_f32_16x16x32_fp8_fp8) + ) + + epilogue_pf = None + if const_expr(prefetch_epilogue): + expert_off_pf = expert_off_idx + sw_pf = [] + for ni in range_constexpr(num_acc_n): + col_g = col_g_list[ni] + row_w_idx = expert_off_pf + col_g + sw_pf.append( + fx.Float32(1.0) + if is_f16 + else buffer_ops.buffer_load(sw_rsrc, row_w_idx, vec_width=1, dtype=T.f32) + ) + # Also prefetch per-row routed/topk weights (sorted_weights) when enabled. 
+ tw_pf = None + if const_expr(doweight_stage2): + tw_pf = [] + lane_div_16_mul4_pf = lane_div_16 * fx.Index(4) + ii_idx_list_pf = [fx.Index(ii) for ii in range(4)] + for mi in range_constexpr(m_repeat): + mi_base_pf = arith.index(mi * 16) + for ii in range_constexpr(4): + row_off_pf = lane_div_16_mul4_pf + ii_idx_list_pf[ii] + row_in_tile_pf = mi_base_pf + row_off_pf + sorted_row_pf = bx_m + row_in_tile_pf + tw_pf.append( + buffer_ops.buffer_load(sorted_w_rsrc, sorted_row_pf, vec_width=1, dtype=T.f32) + ) + epilogue_pf = (sw_pf, tw_pf) + + def _i64_to_v4f16(x_i64): + v1 = vector.from_elements(T.vec(1, T.i64), [x_i64]) + return vector.bitcast(T.f16x4, v1) + + def mfma_k64(acc0, a0, a1, b0, b1): + if const_expr(is_f16): + a0v = _i64_to_v4f16(a0) + a1v = _i64_to_v4f16(a1) + b0v = _i64_to_v4f16(b0) + b1v = _i64_to_v4f16(b1) + acc1 = mfma_fn(mfma_res_ty, [a0v, b0v, acc0, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1v, b1v, acc1, 0, 0, 0]) + acc1 = mfma_fn(mfma_res_ty, [a0, b0, acc0, 0, 0, 0]) + return mfma_fn(mfma_res_ty, [a1, b1, acc1, 0, 0, 0]) + + for ku in range_constexpr(k_unroll): + b_packs0, b_packs1 = b_tile_in[ku] + ki64 = arith.index(ku * 64) + col_base = col_offset_base_bytes + ki64 + + for mi in range_constexpr(m_repeat): + mi_val = arith.index(mi * 16) + curr_row_a_lds = row_a_lds + mi_val + + a0 = arith.constant(-1, type=T.i64) + a1 = arith.constant(-1, type=T.i64) + if const_expr((a0_prefetch is not None) and (ku == 0) and (mi == 0)): + a0, a1 = a0_prefetch + else: + a0, a1 = lds_load_packs_k64(curr_row_a_lds, col_base, lds_base) + + for ni in range_constexpr(num_acc_n): + acc_idx = mi * num_acc_n + ni + acc_list[acc_idx] = mfma_k64( + acc_list[acc_idx], + a0, + a1, + b_packs0[ni], + b_packs1[ni], + ) + return acc_list, epilogue_pf + + # ---------------- 2-stage pipeline (ping-pong LDS + B tile prefetch) ---------------- + lds_tile_elems = arith.index(tile_m * lds_stride) + lds_base_cur = fx.Index(0) + lds_base_nxt = lds_tile_elems + + 
rocdl.sched_barrier(0) + + # def hot_loop_scheduler(): + # mfma_group = num_acc_n + # # K64 micro-step: 2x K32 MFMA per accumulator update. + # mfma_total = (k_unroll * 2) * m_repeat * mfma_group + # mfma_per_iter = 2 * mfma_group + # sche_iters = 0 if mfma_per_iter == 0 else (mfma_total // mfma_per_iter) + # rocdl.sched_dsrd(2) + # rocdl.sched_mfma(1) + # rocdl.sched_mfma(1) + # if num_acc_n < 4: + # rocdl.sched_dsrd(1) + # rocdl.sched_mfma(1) + # rocdl.sched_dsrd(1) + # rocdl.sched_mfma(1) + # rocdl.sched_vmem(1) + # rocdl.sched_mfma(1) + # rocdl.sched_vmem(1) + # rocdl.sched_mfma(2) + # rocdl.sched_dsrd(1) + # rocdl.sched_mfma(2) + # rocdl.sched_vmem(1) + + # dswr_tail = num_x_loads + # if dswr_tail > sche_iters: + # dswr_tail = sche_iters + # dswr_start = sche_iters - dswr_tail + # for sche_i in range_constexpr(sche_iters): + # rocdl.sched_mfma(mfma_group // 2) + # rocdl.sched_dsrd(1) + # rocdl.sched_mfma(mfma_group // 2) + # rocdl.sched_vmem(1) + # rocdl.sched_mfma(mfma_group) + # if sche_i >= dswr_start - 1: + # rocdl.sched_dswr(1) + # rocdl.sched_barrier(0) + + def hot_loop_scheduler(): + mfma_per_ku = m_repeat * num_acc_n * 2 # m * n_acc * 2(k32) + total_mfma = k_unroll * mfma_per_ku + rocdl.sched_group_barrier(rocdl.mask_dsrd, ku_per_sb_s2 * m_repeat, 0) + rocdl.sched_group_barrier(rocdl.mask_mfma, total_mfma, 1) + rocdl.sched_group_barrier(rocdl.mask_vmem_rd, num_x_loads, 2) + rocdl.sched_group_barrier(rocdl.mask_dswr, num_x_loads, 3) + rocdl.sched_barrier(0) + + # Prologue. + k0 = fx.Index(0) + x_regs0 = load_x_tile(k0, x_load_bytes) + b_cur = load_b_tile(k0) + store_x_tile_to_lds(x_regs0, lds_base_cur, x_load_bytes) + gpu.barrier() + + acc = [arith.constant_vector(0.0, T.f32x4)] * (num_acc_n * m_repeat) + lds_base_pong = lds_base_cur + lds_base_ping = lds_base_nxt + + # Cross-tile A0 LDS prefetch (default-on): prefetch the first A-pack (K64) for the + # tile we are about to compute from LDS, to overlap with upcoming VMEM. 
+ a0_prefetch_pong = lds_load_packs_k64(row_a_lds, col_offset_base_bytes, lds_base_pong) + + # Main loop: process K tiles in 2-tile ping-pong steps. + # + # IMPORTANT: for odd number of K tiles, leave **1** tail tile; for even, leave **2**. + # Otherwise the 2-tile tail below would double-count the last tile when num_tiles is odd + # (e.g. inter_dim=192, tile_k=64 -> 3 tiles). + num_k_tiles_py = int(inter_dim) // int(tile_k) + odd_k_tiles = (num_k_tiles_py % 2) == 1 + tail_tiles = 1 if odd_k_tiles else 2 + k_main2_py = (num_k_tiles_py - tail_tiles) * int(tile_k) + if const_expr(k_main2_py < 0): + k_main2_py = 0 + + c2_tile_k = arith.index(tile_k * 2) + pair_iters = k_main2_py // (int(tile_k) * 2) + for pair_i in range_constexpr(pair_iters): + k_iv = arith.index(pair_i * (tile_k * 2)) + # Issue scale loads FIRST so their latency hides behind heavy tile VMEM. + pre_scales_pong = load_scales_s2(k_iv) + next_k1 = k_iv + tile_k + x_regs_ping = load_x_tile(next_k1, x_load_bytes) + b_ping = load_b_tile(next_k1) + + acc = compute_tile_bs_s2(acc, b_cur, lds_base_pong, pre_scales_pong, a0_prefetch=a0_prefetch_pong) + a0_prefetch_pong = None + store_x_tile_to_lds(x_regs_ping, lds_base_ping, x_load_bytes) + hot_loop_scheduler() + gpu.barrier() + + # Cross-tile prefetch for the ping tile we are about to compute. + a0_prefetch_ping = lds_load_packs_k64(row_a_lds, col_offset_base_bytes, lds_base_ping) + + # Issue scale loads FIRST so their latency hides behind heavy tile VMEM. + pre_scales_ping = load_scales_s2(next_k1) + next_k2 = k_iv + c2_tile_k + x_regs_pong = load_x_tile(next_k2, x_load_bytes) + b_next = load_b_tile(next_k2) + + acc = compute_tile_bs_s2(acc, b_ping, lds_base_ping, pre_scales_ping, a0_prefetch=a0_prefetch_ping) + a0_prefetch_ping = None + store_x_tile_to_lds(x_regs_pong, lds_base_pong, x_load_bytes) + hot_loop_scheduler() + gpu.barrier() + + # Cross-tile prefetch for the next pong tile. 
+ a0_prefetch_pong = lds_load_packs_k64(row_a_lds, col_offset_base_bytes, lds_base_pong) + + b_cur = b_next + + if const_expr(odd_k_tiles): + # Tail: single remaining tile (already in `b_cur` / `lds_base_pong`). + k_last = arith.index((num_k_tiles_py - 1) * int(tile_k)) + pre_scales_last = load_scales_s2(k_last) + acc = compute_tile_bs_s2( + acc, + b_cur, + lds_base_pong, + pre_scales_last, + a0_prefetch=a0_prefetch_pong, + ) + else: + # Tail: 2 remaining tiles. + k_tail0 = k_in - c2_tile_k + k_tail1 = k_in - tile_k + # Issue scale loads FIRST so their latency hides behind heavy tile VMEM. + pre_scales_tail0 = load_scales_s2(k_tail0) + x_regs_ping = load_x_tile(k_tail1, x_load_bytes) + b_ping = load_b_tile(k_tail1) + + acc = compute_tile_bs_s2(acc, b_cur, lds_base_pong, pre_scales_tail0, a0_prefetch=a0_prefetch_pong) + a0_prefetch_pong = None + store_x_tile_to_lds(x_regs_ping, lds_base_ping, x_load_bytes) + hot_loop_scheduler() + gpu.barrier() + + # Epilogue tile (blockscale already dequantized). + a0_prefetch_ping = lds_load_packs_k64(row_a_lds, col_offset_base_bytes, lds_base_ping) + pre_scales_tail1 = load_scales_s2(k_tail1) + acc = compute_tile_bs_s2(acc, b_ping, lds_base_ping, pre_scales_tail1, a0_prefetch=a0_prefetch_ping) + + # ---------------- Epilogue: LDS CShuffle + atomic half2 (x2) ---------------- + # Reuse the shared helper so GEMM / MoE kernels share the exact same CShuffle skeleton. + mask24_i32 = fx.Int32(0xFFFFFF) + model_i32 = fx.Int32(model_dim) + topk_i32_v = topk_i32 + + zero_i32 = fx.Int32(0) + c2_i32 = fx.Int32(2) # 2B element size for f16/bf16 + mask_even_i32 = fx.Int32(0xFFFFFFFE) # align element index to even for half2 atomics + + e_vec = _e_vec + + def atomic_add_f16x2(val_f16x2, byte_off_i32): + rocdl.raw_ptr_buffer_atomic_fadd( + val_f16x2, + out_rsrc, + byte_off_i32, + zero_i32, + zero_i32, + ) + + # Blockscale: dequant already done in compute_tile_bs_s2, no sw/sx needed here. 
+ + if const_expr(out_is_f32): + # origin/dev_a16w4: f32 output uses scalar f32 atomics and skips CShuffle/LDS. + c4_i32 = fx.Int32(4) + + def atomic_add_f32(val_f32, byte_off_i32): + rocdl.raw_ptr_buffer_atomic_fadd( + val_f32, + out_rsrc, + byte_off_i32, + zero_i32, + zero_i32, + ) + + def _stage2_row_atomic(*, mi: int, ii: int, row_in_tile, row): + # Blockscale: dequant already done in compute_tile_bs_s2. + fused2 = buffer_ops.buffer_load(sorted_rsrc, row, vec_width=1, dtype=T.i32) + t2 = fused2 & mask24_i32 + fused2 >> 24 + + if const_expr(doweight_stage2): + tw = buffer_ops.buffer_load(sorted_w_rsrc, row, vec_width=1, dtype=T.f32) + + idx0 = t2 * model_i32 # i32 element index base + + for ni in range_constexpr(num_acc_n): + col_g = col_g_list[ni] + acc_idx = mi * num_acc_n + ni + v = vector.extract(acc[acc_idx], static_position=[ii], dynamic_position=[]) + if const_expr(doweight_stage2): + v = v * tw + col_i32 = arith.index_cast(T.i32, col_g) + idx_elem = idx0 + col_i32 + byte_off = idx_elem * c4_i32 + atomic_add_f32(v, byte_off) + + default_epilog( + arith=arith, + range_constexpr=range_constexpr, + m_repeat=m_repeat, + lane_div_16=lane_div_16, + bx_m=bx_m, + body_row=_stage2_row_atomic, + ) + else: + if const_expr(lds_out is None): + raise RuntimeError("FLYDSL_MOE_STAGE2_CSHUFFLE=1 but lds_out is not allocated/aliased.") + + # For bf16 global atomics (gfx942 only), precompute the output base address. + # gfx950+ has buffer_atomic_pk_add_bf16, so bf16 uses buffer atomics there. + out_base_idx = None + if const_expr(_needs_global_atomic_bf16): + out_base_idx = buffer_ops.extract_base_index(arg_out) + + def write_row_to_lds( + *, + mi: int, + ii: int, + row_in_tile, + row, + row_base_lds, + col_base_local, + num_acc_n: int, + lds_out, + ): + # Blockscale: dequant already done in compute_tile_bs_s2. 
+ tw = arith.constant(1.0, type=T.f32) + if const_expr(doweight_stage2): + tw = buffer_ops.buffer_load(sorted_w_rsrc, row, vec_width=1, dtype=T.f32) + + for ni in range_constexpr(num_acc_n): + col_local = col_base_local + (ni * 16) + acc_idx = mi * num_acc_n + ni + v = vector.extract(acc[acc_idx], static_position=[ii], dynamic_position=[]) + if const_expr(doweight_stage2): + v = v * tw + v_out = arith.trunc_f(out_elem(), v) + + lds_idx = row_base_lds + col_local + vec1_out = T.vec(1, out_elem()) + v1 = vector.from_elements(vec1_out, [v_out]) + vector.store(v1, lds_out, [lds_idx], alignment=2) + + def precompute_row(*, row_local, row): + # Precompute row context for cshuffle stores. + # Return (fused_i32, row_valid_i1) so the epilogue can skip the entire row + # for invalid tail rows (CK-style), avoiding per-store branching. + fused2 = buffer_ops.buffer_load(sorted_rsrc, row, vec_width=1, dtype=T.i32) + row_i32 = arith.index_cast(T.i32, row) + row_valid0 = arith.cmpi(arith.CmpIPredicate.ult, row_i32, num_valid_i32) + t = fused2 & mask24_i32 + s = fused2 >> 24 + t_ok = arith.cmpi(arith.CmpIPredicate.ult, t, tokens_i32) + s_ok = arith.cmpi(arith.CmpIPredicate.ult, s, topk_i32_v) + row_valid = row_valid0 & t_ok & s_ok + return (fused2, row_valid) + + def store_pair(*, row_local, row, row_ctx, col_pair0, col_g0, frag): + fused = row_ctx + t = fused & mask24_i32 + s = fused >> 24 + idx0 = t * model_i32 + if const_expr(not bool(accumulate)): + ts = t * topk_i32_v + s + idx0 = ts * model_i32 + col_i32 = arith.index_cast(T.i32, col_g0) + idx_elem = idx0 + col_i32 + idx_elem_even = idx_elem & mask_even_i32 + if const_expr(_needs_global_atomic_bf16): + # gfx942: no buffer_atomic_pk_add_bf16, use global atomicrmw fadd + if const_expr(bool(accumulate)): + byte_off = idx_elem_even * c2_i32 + byte_off_idx = arith.index_cast(T.index, byte_off) + ptr_addr_idx = out_base_idx + byte_off_idx + out_ptr = buffer_ops.create_llvm_ptr(ptr_addr_idx, address_space=1) + out_ptr_v = 
out_ptr._value if hasattr(out_ptr, "_value") else out_ptr + frag_v = frag._value if hasattr(frag, "_value") else frag + llvm.AtomicRMWOp( + llvm.AtomicBinOp.fadd, + out_ptr_v, + frag_v, + llvm.AtomicOrdering.monotonic, + syncscope="agent", + alignment=4, + ) + else: + buffer_ops.buffer_store(frag, out_rsrc, idx_elem_even) + else: + # f16, or bf16 on gfx950+ (has buffer_atomic_pk_add_bf16) + byte_off = idx_elem_even * c2_i32 + if const_expr(bool(accumulate)): + atomic_add_f16x2(frag, byte_off) + else: + buffer_ops.buffer_store(frag, out_rsrc, idx_elem_even) + + c_shuffle_epilog( + arith=arith, + vector=vector, + gpu=gpu, + scf=scf, + range_constexpr=range_constexpr, + tile_m=tile_m, + tile_n=tile_n, + e_vec=e_vec, + m_repeat=m_repeat, + num_acc_n=num_acc_n, + tx=tx, + lane_div_16=lane_div_16, + lane_mod_16=lane_mod_16, + bx_m=bx_m, + by_n=by_n, + n_tile_base=n_tile_base, + lds_out=lds_out, + frag_elem_type=(T.bf16 if out_is_bf16 else T.f16), + write_row_to_lds=write_row_to_lds, + precompute_row=precompute_row, + store_pair=store_pair, + ) + + _if_blk = scf.IfOp(blk_valid) + with _if_then(_if_blk): + _moe_gemm2_then_body() + + # ── Host launcher (flyc.jit + .launch) ──────────────────────────────── + @flyc.jit + def launch_moe_blockscale_gemm2( + arg_out: fx.Tensor, + arg_x: fx.Tensor, + arg_w: fx.Tensor, + arg_scale_x: fx.Tensor, + arg_scale_w: fx.Tensor, + arg_sorted_token_ids: fx.Tensor, + arg_expert_ids: fx.Tensor, + arg_sorted_weights: fx.Tensor, + arg_num_valid_ids: fx.Tensor, + i32_tokens_in: fx.Int32, + i32_n_in: fx.Int32, + i32_k_in: fx.Int32, + i32_size_expert_ids_in: fx.Int32, + stream: fx.Stream, + ): + allocator.finalized = False + ctx = CompilationContext.get_current() + with ir.InsertionPoint(ctx.gpu_module_body): + allocator.finalize() + + n_in = arith.index_cast(T.index, i32_n_in) + size_expert_ids_in = arith.index_cast(T.index, i32_size_expert_ids_in) + gx = n_in // fx.Index(tile_n) + gy = size_expert_ids_in + + moe_blockscale_gemm2( + arg_out, + 
# NOTE(review): the next physical line is the unmodified tail of the preceding launcher
# (launch_moe_blockscale_gemm2), whose definition begins before this span. It is kept
# verbatim in its flattened diff form; only the complete definitions below were restored.
arg_x, + arg_w, + arg_scale_x, + arg_scale_w, + arg_sorted_token_ids, + arg_expert_ids, + arg_sorted_weights, + arg_num_valid_ids, + i32_tokens_in, + i32_n_in, + i32_k_in, + i32_size_expert_ids_in, + value_attrs={"rocdl.waves_per_eu": waves_per_eu}, + ).launch(grid=(gx, gy, 1), block=(256, 1, 1), stream=stream) + + return launch_moe_blockscale_gemm2 + +


# MoE Reduction Kernel (reduce sum over topk dimension)
@functools.lru_cache(maxsize=1024)
def compile_moe_reduction(
    *,
    topk: int,
    model_dim: int,
    dtype_str: str = "f16",
    use_mask: bool = False,
):
    """Compile a reduction kernel that sums over the topk dimension.

    Input:  X [tokens, topk, model_dim]
            valid_mask [tokens, topk] (optional, if use_mask=True)
    Output: Y [tokens, model_dim]

    This kernel performs: Y[t, d] = sum(X[t, :, d]) for all t, d.
    When use_mask=True, only sums slots where valid_mask[t,k]=1.
    Used in conjunction with compile_moe_blockscale_gemm2(accumulate=False) to avoid atomic contention.

    Args:
        topk: Number of slots summed per token. Baked into the kernel as a
            compile-time constant (the k-loops are unrolled with
            ``range_constexpr(topk)``).
        model_dim: Row length of ``Y``. Baked into the kernel as a constant and
            used to derive the static grid size along ``y``.
        dtype_str: Element type tag of ``X`` / ``Y``; one of ``"f32"``,
            ``"f16"`` or ``"bf16"``.
        use_mask: When true, a slot contributes only if its ``valid_mask`` entry
            (loaded as i8) is non-zero.

    Returns:
        The ``flyc.jit`` launcher ``launch_moe_reduction(X, Y, valid_mask,
        i32_m_tokens, stream)``. It launches one block per token along ``x`` and
        ``ceil(model_dim / (BLOCK_SIZE * VEC_WIDTH))`` blocks along ``y``, with
        ``BLOCK_SIZE`` threads per block.

    Raises:
        ValueError: If ``dtype_str`` is not one of the supported tags.

    Notes:
        Results are memoized by ``functools.lru_cache`` on the keyword arguments.
        Accumulation is done in f32; f16/bf16 inputs are extended before the add
        and truncated back before the store.
    """
    # NOTE(review): both return values are discarded. Confirm whether these calls
    # are kept for a side effect or are leftovers of removed assignments.
    get_hip_arch()
    ir.ShapedType.get_dynamic_size()

    # Kernel Config
    BLOCK_SIZE = 256  # threads per block
    VEC_WIDTH = 8  # consecutive output columns handled by one thread

    masked = "masked" if use_mask else ""

    # NOTE(review): the kernel name encodes topk / dtype / mask only, while the body
    # below also bakes in model_dim. Confirm the JIT does not key compiled kernels
    # on this name alone.
    module_name = f"bs_moe_reduce_topk{topk}_{dtype_str}{masked}"

    # Validate dtype_str; the tag is an identity mapping of the accepted values.
    if dtype_str == "f32":
        elem_type_tag = "f32"
    elif dtype_str == "f16":
        elem_type_tag = "f16"
    elif dtype_str == "bf16":
        elem_type_tag = "bf16"
    else:
        raise ValueError(f"Unsupported dtype: {dtype_str}")
    # Types are wrapped in callables and resolved at the point of use inside the kernel.
    compute_type = lambda: T.f32
    i8_type = lambda: T.i8

    def elem_type():
        # Resolve the storage element type; tolerates T.<ty> being either a
        # callable factory or an already-built type object.
        ty = T.f32 if elem_type_tag == "f32" else (T.f16 if elem_type_tag == "f16" else T.bf16)
        return ty() if callable(ty) else ty

    if True:

        @flyc.kernel(name=module_name)
        def moe_reduction_kernel(
            X: fx.Tensor,
            Y: fx.Tensor,
            valid_mask: fx.Tensor,
            i32_m_tokens: fx.Int32,
        ):
            m_tokens = fx.Index(i32_m_tokens)
            c_topk = fx.Index(topk)
            c_model_dim = fx.Index(model_dim)
            # Record count for the mask buffer: one record per (token, slot);
            # the mask is read with i8 loads below.
            mask_nbytes_idx = m_tokens * c_topk
            elem_bits = 32 if dtype_str == "f32" else 16
            copy_vec_width = 128 // elem_bits  # 8 for f16/bf16, 4 for f32
            n_sub = VEC_WIDTH // copy_vec_width  # 1 for f16/bf16, 2 for f32
            # Buffer-backed tensors via layout API (all dtypes)
            X_buf = fx.rocdl.make_buffer_tensor(X)
            Y_buf = fx.rocdl.make_buffer_tensor(Y)
            # Scalar buffer resources for tail path and mask
            x_rsrc = buffer_ops.create_buffer_resource(X, max_size=True)
            y_rsrc = buffer_ops.create_buffer_resource(Y, max_size=True)
            mask_rsrc = buffer_ops.create_buffer_resource(valid_mask, max_size=False, num_records_bytes=mask_nbytes_idx)

            # Work decomposition: block x -> token, block y -> column tile,
            # thread x -> VEC_WIDTH-wide column slice inside the tile.
            token_idx = gpu.block_id("x")
            tile_idx = gpu.block_id("y")
            tid = gpu.thread_id("x")

            # Guard: token in range (Index is unsigned → auto ult)
            tok_ok = token_idx < m_tokens
            _if_tok = scf.IfOp(tok_ok)
            with _if_then(_if_tok):
                tile_cols = BLOCK_SIZE * VEC_WIDTH
                c_tile_cols = fx.Index(tile_cols)
                c_vecw = fx.Index(VEC_WIDTH)

                # First output column owned by this thread.
                col_base = tile_idx * c_tile_cols + tid * c_vecw

                # Guard: any work in bounds (Index < → ult)
                col_ok = col_base < c_model_dim
                _if_col = scf.IfOp(col_ok)
                with _if_then(_if_col):
                    # Fast path: full vector in-bounds (Index <= → ule)
                    end_ok = col_base + c_vecw <= c_model_dim
                    _if_full = scf.IfOp(end_ok, has_else=True)
                    with _if_then(_if_full):
                        # ── Vector path via layout API (all dtypes) ──
                        # fx.copy auto-iterates when atom width < VEC_WIDTH
                        # (e.g. f32: BufferCopy128b handles 4, fx.copy issues 2 calls for 8)
                        copy_atom = fx.make_copy_atom(fx.rocdl.BufferCopy128b(), elem_bits)
                        vec_type_c = T.vec(copy_vec_width, compute_type())
                        vec_type_e = T.vec(copy_vec_width, elem_type())

                        # One f32 accumulator vector per 128-bit sub-copy.
                        acc_vecs = [vector.broadcast(vec_type_c, fx.Float32(0.0).ir_value()) for _ in range(n_sub)]
                        elem_dtype = fx.Numeric.from_ir_type(elem_type())

                        tok_i32 = fx.Int32(token_idx)
                        tile_i32 = fx.Int32(tile_idx)
                        tid_i32 = fx.Int32(tid)

                        for k in range_constexpr(topk):
                            # X[token, k, :] → tile → thread's VEC_WIDTH slice
                            x_row = X_buf[tok_i32, fx.Int32(k), None]
                            x_tiled = fx.logical_divide(x_row, fx.make_layout(tile_cols, 1))
                            x_div = fx.logical_divide(x_tiled[None, tile_i32], fx.make_layout(VEC_WIDTH, 1))
                            x_thread = x_div[None, tid_i32]

                            if const_expr(use_mask):
                                # Mask entry for (token, k): flat index token * topk + k.
                                m_idx_i32 = fx.Int32(token_idx * c_topk + fx.Index(k))
                                mv = buffer_ops.buffer_load(mask_rsrc, m_idx_i32, vec_width=1, dtype=i8_type())
                                mv_ok = mv != fx.Int8(0)

                            if const_expr(n_sub > 1):
                                # Split the thread slice into 128-bit pieces (f32 only).
                                x_inner = fx.logical_divide(x_thread, fx.make_layout(copy_vec_width, 1))
                            for si in range_constexpr(n_sub):
                                src = x_inner[None, fx.Int32(si)] if n_sub > 1 else x_thread
                                r = fx.make_rmem_tensor(copy_vec_width, elem_dtype)
                                fx.copy_atom_call(copy_atom, src, r)
                                vec_e = fx.memref_load_vec(r)

                                if const_expr(use_mask):
                                    # The load is issued regardless; masked-out slots are
                                    # replaced with zeros afterwards.
                                    zero_e = vector.broadcast(vec_type_e, arith.constant(0.0, type=elem_type()))
                                    vec_e = mv_ok.select(vec_e, zero_e)

                                if const_expr(elem_bits < 32):
                                    vec_c = vec_e.extf(vec_type_c)
                                else:
                                    vec_c = vec_e
                                acc_vecs[si] = acc_vecs[si] + vec_c

                        # ── Store results ──
                        if const_expr(n_sub > 1):
                            y_row = Y_buf[tok_i32, None]
                            y_tiled = fx.logical_divide(y_row, fx.make_layout(tile_cols, 1))
                            y_div = fx.logical_divide(y_tiled[None, tile_i32], fx.make_layout(VEC_WIDTH, 1))
                            y_inner = fx.logical_divide(y_div[None, tid_i32], fx.make_layout(copy_vec_width, 1))

                        for si in range_constexpr(n_sub):
                            out_vec = acc_vecs[si]
                            if const_expr(elem_bits < 32):
                                # Back to the storage type for f16/bf16 outputs.
                                out_vec = out_vec.truncf(vec_type_e)

                            if const_expr(n_sub > 1):
                                dst = y_inner[None, fx.Int32(si)]
                            else:
                                y_row = Y_buf[tok_i32, None]
                                y_tiled = fx.logical_divide(y_row, fx.make_layout(tile_cols, 1))
                                y_div = fx.logical_divide(y_tiled[None, tile_i32], fx.make_layout(VEC_WIDTH, 1))
                                dst = y_div[None, tid_i32]

                            r_out = fx.make_rmem_tensor(copy_vec_width, elem_dtype)
                            fx.memref_store_vec(out_vec, r_out)
                            fx.copy_atom_call(copy_atom, r_out, dst)

                    with _if_else(_if_full):
                        # Tail path: the thread's slice straddles model_dim, so each
                        # of the VEC_WIDTH columns is bounds-checked and handled with
                        # scalar buffer loads/stores.
                        for lane in range_constexpr(VEC_WIDTH):
                            col = col_base + fx.Index(lane)
                            lane_ok = col < c_model_dim
                            _if_lane = scf.IfOp(lane_ok)
                            with _if_then(_if_lane):
                                a = arith.constant(0.0, type=compute_type())
                                token_base = token_idx * c_topk
                                for k in range_constexpr(topk):
                                    k_idx = fx.Index(k)
                                    # Flat element index into X viewed as [tokens*topk, model_dim].
                                    x_idx_i32 = fx.Int32((token_base + k_idx) * c_model_dim + col)
                                    if const_expr(use_mask):
                                        m_idx_i32 = fx.Int32(token_base + k_idx)
                                        mv = buffer_ops.buffer_load(mask_rsrc, m_idx_i32, vec_width=1, dtype=i8_type())
                                        v = (mv != fx.Int8(0)).select(
                                            buffer_ops.buffer_load(x_rsrc, x_idx_i32, vec_width=1, dtype=elem_type()),
                                            arith.constant(0.0, type=elem_type()),
                                        )
                                    else:
                                        v = buffer_ops.buffer_load(x_rsrc, x_idx_i32, vec_width=1, dtype=elem_type())
                                    if const_expr(dtype_str in ("f16", "bf16")):
                                        v = v.extf(compute_type())
                                    a = a + v

                                out = a
                                if const_expr(dtype_str in ("f16", "bf16")):
                                    out = out.truncf(elem_type())
                                y_idx_i32 = fx.Int32(token_idx * c_model_dim + col)
                                buffer_ops.buffer_store(out, y_rsrc, y_idx_i32)

    # ── Host launcher (flyc.jit + .launch) ────────────────────────────────
    tile_size = BLOCK_SIZE * VEC_WIDTH
    # Number of column tiles needed to cover model_dim (ceil division).
    gy_static = (model_dim + tile_size - 1) // tile_size

    @flyc.jit
    def launch_moe_reduction(
        X: fx.Tensor,
        Y: fx.Tensor,
        valid_mask: fx.Tensor,
        i32_m_tokens: fx.Int32,
        stream: fx.Stream,
    ):
        # One block per token along x; the token count is a runtime value.
        gx = fx.Index(i32_m_tokens)
        moe_reduction_kernel(X, Y, valid_mask, i32_m_tokens).launch(
            grid=(gx, gy_static, 1),
            block=(BLOCK_SIZE, 1, 1),
            stream=stream,
        )

    return launch_moe_reduction


# MoE GEMM2 Execution Modes
class MoeGemm2Mode:
    """Execution mode for MoE GEMM2.

    The members are plain string constants on an ordinary class (not an
    ``enum.Enum``), so they compare equal to their literal string values.
    """

    ATOMIC = "atomic"  # Use atomic accumulation (default)
    REDUCE = "reduce"  # Use non-atomic write + reduce kernel


# NOTE(review): the next physical line is the unmodified head of _MoeGemm2ReduceWrapper,
# whose definition continues past this span (the __call__ docstring is cut off). It is
# kept verbatim in its flattened diff form.
+class _MoeGemm2ReduceWrapper: + """Wrapper combining GEMM2 (no atomics) with reduction kernel. + + This wrapper handles the intermediate buffer allocation and orchestrates + the two-phase computation: + 1. GEMM2 outputs to [tokens*topk, model_dim] without atomics + 2. Reduce sums over topk to produce [tokens, model_dim] + """ + + def __init__( + self, + gemm2_exe, + reduce_exe, + topk: int, + model_dim: int, + out_dtype_str: str = "f16", + use_mask: bool = False, + ): + self._gemm2_exe = gemm2_exe + self._reduce_exe = reduce_exe + self._topk = topk + self._model_dim = model_dim + self._out_dtype_str = out_dtype_str + self._use_mask = use_mask + + def _get_torch_dtype(self): + """Convert dtype string to torch dtype.""" + import torch + + dtype_map = { + "f16": torch.float16, + "fp16": torch.float16, + "bf16": torch.bfloat16, + "f32": torch.float32, + } + return dtype_map.get(self._out_dtype_str, torch.float16) + + def __call__( + self, + arg_out, + arg_x, + arg_w, + arg_scale_x, + arg_scale_w, + arg_sorted_token_ids, + arg_expert_ids, + arg_sorted_weights, + arg_num_valid_ids, + tokens_in, + n_in, + k_in, + size_expert_ids_in, + valid_mask=None, + stream=None, + ): + """Execute GEMM2 + reduce. + + Args match moe_gemm2 kernel signature (see compile_moe_blockscale_gemm2).
+ """ + import torch + + if stream is None: + stream = torch.cuda.current_stream() + intermediate = torch.empty( + tokens_in * self._topk, self._model_dim, device=arg_out.device, dtype=self._get_torch_dtype() + ) + if not self._use_mask: + intermediate.zero_() + # Phase 1: GEMM2 (no atomics) -> [tokens*topk, model_dim] + self._gemm2_exe( + intermediate.view(-1), + arg_x, + arg_w, + arg_scale_x, + arg_scale_w, + arg_sorted_token_ids, + arg_expert_ids, + arg_sorted_weights, + arg_num_valid_ids, + tokens_in, + n_in, + k_in, + size_expert_ids_in, + stream, + ) + # Phase 2: Reduce over topk -> [tokens, model_dim] + X = intermediate.view(tokens_in, self._topk, self._model_dim) + Y = arg_out.view(tokens_in, self._model_dim) + if not self._use_mask: + if valid_mask is not None: + logging.warning("valid_mask provided but use_mask=False; ignoring valid_mask") + valid_mask = torch.empty((0, self._topk), device=arg_out.device, dtype=torch.uint8) + self._reduce_exe(X, Y, valid_mask, tokens_in, stream) + + @property + def mode(self) -> str: + """Return the execution mode.""" + return MoeGemm2Mode.REDUCE + + +def compile_moe_blockscale_gemm2_ex( + *, + model_dim: int, + inter_dim: int, + experts: int, + topk: int, + tile_m: int, + tile_n: int, + tile_k: int, + doweight_stage2: bool, + in_dtype: str = "fp8", + out_dtype: str = "f16", + use_cshuffle_epilog: bool | None = None, + # Extended parameters for mode control + mode: str = MoeGemm2Mode.ATOMIC, + valid_mask=None, +): + """Compile MoE GEMM2 kernel with optional reduction. + + This is the extended interface that supports explicit mode control. + + Args: + mode: Execution mode selection: + - "atomic": Use atomic accumulation (original behavior) + - "reduce": Use non-atomic write + reduce kernel + + Returns: + Compiled executable (either wrapped or raw depending on mode). 
+ """ + # Compile based on mode + if mode == MoeGemm2Mode.REDUCE: + # Determine if we need masked reduction + use_mask = valid_mask is not None + + # Compile GEMM2 with accumulate=False. + # NOTE: blockscale gemm2 is FP8-only, so the legacy `in_dtype` kwarg from `_ex` + # is ignored (blockscale doesn't accept it as a formal parameter). + gemm2_exe = compile_moe_blockscale_gemm2( + model_dim=model_dim, + inter_dim=inter_dim, + experts=experts, + topk=topk, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + doweight_stage2=doweight_stage2, + out_dtype=out_dtype, + use_cshuffle_epilog=use_cshuffle_epilog, + accumulate=False, + ) + # Compile reduction kernel with masking support + out_s = str(out_dtype).strip().lower() + if out_s in ("f16", "fp16", "half"): + dtype_str = "f16" + elif out_s in ("bf16", "bfloat16"): + dtype_str = "bf16" + else: + dtype_str = "f32" + reduce_exe = compile_moe_reduction( + topk=topk, + model_dim=model_dim, + dtype_str=dtype_str, + use_mask=use_mask, + ) + return _MoeGemm2ReduceWrapper( + gemm2_exe=gemm2_exe, + reduce_exe=reduce_exe, + topk=topk, + model_dim=model_dim, + out_dtype_str=dtype_str, + use_mask=use_mask, + ) + else: + # Compile GEMM2 with accumulate=True (atomic mode). + # See note above: blockscale gemm2 is FP8-only; `in_dtype` kwarg is ignored. 
+ return compile_moe_blockscale_gemm2( + model_dim=model_dim, + inter_dim=inter_dim, + experts=experts, + topk=topk, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + doweight_stage2=doweight_stage2, + out_dtype=out_dtype, + use_cshuffle_epilog=use_cshuffle_epilog, + accumulate=True, + ) diff --git a/aiter/ops/flydsl/moe_kernels.py b/aiter/ops/flydsl/moe_kernels.py index 719bdcae22..56c486d590 100644 --- a/aiter/ops/flydsl/moe_kernels.py +++ b/aiter/ops/flydsl/moe_kernels.py @@ -32,9 +32,16 @@ def flydsl_kernel_name( tile_k: int, mode: str = "", sort_block_m: int = 0, + quant_type: str = "", ) -> str: - """Construct kernel name: ``flydsl_moe{stage}_a{a}_w{b}_{out}_t{M}x{N}x{K}[_{mode}][_sbm{S}]``.""" + """Construct kernel name: ``flydsl_moe{stage}_a{a}_w{b}_{out}_t{M}x{N}x{K}[_{qtag}][_{mode}][_sbm{S}]``. + + quant_type: optional quantization type tag (e.g. "per_1x128" → "qbs" for blockscale). + """ name = f"flydsl_moe{stage}_a{a_dtype}_w{b_dtype}_{out_dtype}_t{tile_m}x{tile_n}x{tile_k}" + qtag = _quant_type_tag(quant_type) + if qtag: + name += f"_{qtag}" if mode: name += f"_{mode}" if sort_block_m > 0 and sort_block_m != tile_m: @@ -42,6 +49,16 @@ def flydsl_kernel_name( return name +def _quant_type_tag(quant_type: str) -> str: + """Map quant_type string to short kernel-name tag. Empty for unspecified.""" + if not quant_type: + return "" + q = quant_type.lower() + if q == "per_1x128" or q == "per_128x128": + return "qbs" # block-scale (FP8 a8w8 blockscale MoE) + return "" + + def get_flydsl_kernel_params(name: str) -> Optional[Dict]: """Lookup kernel params by name. @@ -261,6 +278,103 @@ def get_flydsl_stage2_kernels_int4_bf16(out_dtype: str) -> Dict[str, Dict]: return kernels +def get_flydsl_stage1_kernels_blockscale(out_dtype: str) -> Dict[str, Dict]: + """Return {kernelName: params} for FP8 a8w8 blockscale stage1 configs. + + Routed by triple (a_dtype="fp8", b_dtype="fp8", quant_type="per_1x128"). 
+ Search space focuses on tile sizes compatible with ScaleBlockN=128. + tile_m=16 is included so token<=8 shapes can pack tighter waves with + split-K and beat ASM 1-stage variants on very small M. + """ + kernels = {} + a_dtype = "fp8" + b_dtype = "fp8" + quant_type = "per_1x128" + tile_ks = [128, 256] + tile_ms = [16, 32, 64, 128] + tile_ns = [128, 256] + k_batches = [1, 2, 3, 4, 5, 6, 7, 8] + waves_per_eus = [2, 3, 4] + + for tm in tile_ms: + for tn in tile_ns: + for tk in tile_ks: + for kb in k_batches: + for wpe in waves_per_eus: + name = flydsl_kernel_name( + 1, a_dtype, b_dtype, out_dtype, tm, tn, tk, + quant_type=quant_type, + ) + if kb != 1: + name += f"_kb{kb}" + if wpe != 2: + name += f"_w{wpe}" + kernels[name] = { + "stage": 1, + "a_dtype": a_dtype, + "b_dtype": b_dtype, + "out_dtype": out_dtype, + "tile_m": tm, + "tile_n": tn, + "tile_k": tk, + "MPerBlock": tm, + "k_batch": kb, + "waves_per_eu": wpe, + "quant_type": quant_type, + } + return kernels + + +def get_flydsl_stage2_kernels_blockscale(out_dtype: str) -> Dict[str, Dict]: + """Return {kernelName: params} for FP8 a8w8 blockscale stage2 configs. + + Stage2 has no split-K (atomic-accumulates into out directly), so we only + sweep tile sizes and waves_per_eu. tile_m=16 helps tiny-batch tokens that + don't fill a tile_m=32 wave; tile_n=256 cuts the model_dim n-tile count in + half (model_dim=7168 -> 28 instead of 56 tiles). + """ + kernels = {} + a_dtype = "fp8" + b_dtype = "fp8" + quant_type = "per_1x128" + tile_ks = [128, 256] + tile_ms = [16, 32, 64, 128] + tile_ns = [128, 256] + waves_per_eus = [2, 3] + + # Two accumulation modes: + # "atomic": bf16 atomic_add into out — cheap memory, contention scales with M. + # "reduce": write into a [token, topk, model_dim] staging buffer (no atomics), + # then torch.sum across topk. Wins on large tokens where atomic + # contention dominates stage2 latency. 
+ modes = ("atomic", "reduce") + for tm in tile_ms: + for tn in tile_ns: + for tk in tile_ks: + for wpe in waves_per_eus: + for mode in modes: + name = flydsl_kernel_name( + 2, a_dtype, b_dtype, out_dtype, tm, tn, tk, mode, + quant_type=quant_type, + ) + if wpe != 2: + name += f"_w{wpe}" + kernels[name] = { + "stage": 2, + "a_dtype": a_dtype, + "b_dtype": b_dtype, + "out_dtype": out_dtype, + "tile_m": tm, + "tile_n": tn, + "tile_k": tk, + "mode": mode, + "MPerBlock": tm, + "waves_per_eu": wpe, + "quant_type": quant_type, + } + return kernels + + def _register_all_configs(): """Pre-populate _KERNEL_PARAMS with all supported configs at import time.""" for a in ("fp8", "fp4", "fp16"): @@ -272,6 +386,10 @@ def _register_all_configs(): for out in ("bf16", "f16"): _KERNEL_PARAMS.update(get_flydsl_stage1_kernels_int4_bf16(out)) _KERNEL_PARAMS.update(get_flydsl_stage2_kernels_int4_bf16(out)) + # FP8 a8w8 blockscale (per_1x128/per_128x128) configs. + for out in ("bf16", "f16"): + _KERNEL_PARAMS.update(get_flydsl_stage1_kernels_blockscale(out)) + _KERNEL_PARAMS.update(get_flydsl_stage2_kernels_blockscale(out)) _register_all_configs() @@ -302,8 +420,35 @@ def compile_flydsl_moe_stage1( a_scale_one: bool = False, xcd_swizzle: int = 0, swiglu_limit: float = 0.0, + quant_type: str = "", ): - """Compile stage1 kernel (cached via underlying lru_cache).""" + """Compile stage1 kernel (cached via underlying lru_cache). + + quant_type: optional quantization tag. When a_dtype == b_dtype == "fp8" and quant_type is + "per_1x128" or "per_128x128" (case-insensitive), route to the blockscale stage1 kernel. + """ + # FP8 a8w8 blockscale MoE (per-128x128 weight blocks + per-token-group-128 activation).
+ if ( + a_dtype == "fp8" + and b_dtype == "fp8" + and quant_type.lower() in ("per_1x128", "per_128x128") + ): + from .kernels.moe_blockscale_2stage import compile_moe_blockscale_gemm1 + + return compile_moe_blockscale_gemm1( + model_dim=model_dim, + inter_dim=inter_dim, + experts=experts, + topk=topk, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + doweight_stage1=doweight_stage1, + scale_block_k=128, + out_dtype=out_dtype, + waves_per_eu=waves_per_eu, + k_batch=k_batch, + ) if b_dtype == "fp4": from .kernels.mixed_moe_gemm_2stage import compile_mixed_moe_gemm1 from .moe_common import GateMode @@ -383,8 +528,36 @@ def compile_flydsl_moe_stage2( inter_dim_pad: int = 0, xcd_swizzle: int = 0, enable_bias: bool = False, + waves_per_eu: int = 2, + quant_type: str = "", ): - """Compile stage2 kernel (cached via underlying lru_cache).""" + """Compile stage2 kernel (cached via underlying lru_cache). + + quant_type: optional quantization tag. When a_dtype == b_dtype == "fp8" and quant_type is + "per_1x128" or "per_128x128" (case-insensitive), route to the blockscale stage2 kernel. + """ + # FP8 a8w8 blockscale MoE (matches stage1 dispatch). + if ( + a_dtype == "fp8" + and b_dtype == "fp8" + and quant_type.lower() in ("per_1x128", "per_128x128") + ): + from .kernels.moe_blockscale_2stage import compile_moe_blockscale_gemm2 + + return compile_moe_blockscale_gemm2( + model_dim=model_dim, + inter_dim=inter_dim, + experts=experts, + topk=topk, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + doweight_stage2=doweight_stage2, + scale_block_k=128, + out_dtype=out_dtype, + accumulate=accumulate, + waves_per_eu=waves_per_eu, + ) if b_dtype == "fp4": from .kernels.mixed_moe_gemm_2stage import compile_mixed_moe_gemm2 @@ -661,6 +834,170 @@ def _get_compiled_swiglu(inter_dim: int): return build_swiglu_and_mul_module(inter_dim) +# ──────────────────────────────────────────────────────────────────────── +# FP8 a8w8 blockscale path (per_1x128/per_128x128) — internal helpers.
+# Routed by the (a_dtype, b_dtype, quant_type) triplet in `flydsl_moe_stage1/2`. +# ──────────────────────────────────────────────────────────────────────── + + +def _flydsl_moe_stage1_blockscale( + *, + a: torch.Tensor, + w1: torch.Tensor, + sorted_token_ids: torch.Tensor, + sorted_expert_ids: torch.Tensor, + num_valid_ids: torch.Tensor, + out: Optional[torch.Tensor], + topk: int, + tile_m: int, + tile_n: int, + tile_k: int, + out_dtype: str, + w1_scale: Optional[torch.Tensor], + a1_scale: Optional[torch.Tensor], + sorted_weights: Optional[torch.Tensor], + k_batch: int, + waves_per_eu: int, + E: int, + inter_dim: int, + model_dim: int, + token_num: int, +) -> torch.Tensor: + """FP8 a8w8 blockscale stage1. + + Inputs (caller pre-quantizes): + a: [token_num, model_dim] fp8 (quantized) + a1_scale: [nblk_k, token_num] f32 (transposed; flat view OK) + w1: [E, 2*inter_dim, model_dim] fp8 (preshuffled, same layout as + `FlyDSL.kernels.moe_blockscale_2stage` expects) + w1_scale: [E, 2*inter_dim/128, nblk_k] f32 (flat view OK) + + Output: + out: [token_num, topk, inter_dim] f16/bf16 (silu(gate) * up applied). + + Split-K: when k_batch > 1, an internal [tokens, topk, 2*inter] f16/bf16 buffer + (same dtype as out_dtype) accumulates gate/up partials via atomic add; `silu_and_mul` + then writes the final output (post-silu).
+ """ + if a1_scale is None or w1_scale is None: + raise ValueError("blockscale stage1 requires a1_scale and w1_scale") + if out_dtype not in ("f16", "fp16", "bf16"): + raise ValueError(f"blockscale stage1 only supports f16/bf16, got {out_dtype!r}") + + dev = a.device + dtypes = _get_dtypes() + torch_out_dtype = dtypes.bf16 if out_dtype == "bf16" else dtypes.fp16 + + flat_a_scale = a1_scale.view(-1) + flat_w_scale = w1_scale.view(-1) + sw = ( + sorted_weights + if sorted_weights is not None + else torch.empty(0, device=dev, dtype=torch.float32) + ) + + size_expert_ids = sorted_expert_ids.shape[0] + stream = torch.cuda.current_stream() + _is_splitk = k_batch > 1 + + if _is_splitk: + # Split-K: stage1 writes f16/bf16 partials via atomic add into a zeroed + # `[token, topk, 2*inter]` buffer; silu_and_mul then writes the final + # output. Partials dtype = out_dtype so silu_and_mul reduces in-place + # (silu_and_mul requires input/output dtype match). + if out is None: + out = torch.empty( + (token_num, topk, inter_dim), dtype=torch_out_dtype, device=dev + ) + elif out.dtype != torch_out_dtype: + raise ValueError( + f"blockscale stage1 split-K requires out.dtype={torch_out_dtype}, " + f"got out.dtype={out.dtype} (caller should match out_dtype={out_dtype!r})" + ) + + tmp_out = torch.zeros( + (token_num, topk, inter_dim * 2), dtype=torch_out_dtype, device=dev + ) + _splitk_compile_out_dtype = "bf16" if out_dtype == "bf16" else "f16" + exe = compile_flydsl_moe_stage1( + model_dim=model_dim, + inter_dim=inter_dim, + experts=E, + topk=topk, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + doweight_stage1=False, + a_dtype="fp8", + b_dtype="fp8", + out_dtype=_splitk_compile_out_dtype, + k_batch=k_batch, + waves_per_eu=waves_per_eu, + quant_type="per_1x128", + ) + exe( + tmp_out.view(-1), + a.view(-1), + w1, + flat_a_scale, + flat_w_scale, + sorted_token_ids, + sorted_expert_ids, + sw, + num_valid_ids, + token_num, + inter_dim, + model_dim, + size_expert_ids, + stream, + ) 
+ + from aiter.ops.activation import silu_and_mul + + silu_and_mul(out.view(-1, inter_dim), tmp_out.view(-1, inter_dim * 2)) + return out + + if out is None: + out = torch.empty( + (token_num, topk, inter_dim), dtype=torch_out_dtype, device=dev + ) + + # Non-splitk: kernel fuses silu(gate)*up directly into `out`. + exe = compile_flydsl_moe_stage1( + model_dim=model_dim, + inter_dim=inter_dim, + experts=E, + topk=topk, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + doweight_stage1=sorted_weights is not None, + a_dtype="fp8", + b_dtype="fp8", + out_dtype=out_dtype if out_dtype != "fp16" else "f16", + k_batch=1, + waves_per_eu=waves_per_eu, + quant_type="per_1x128", + ) + exe( + out.view(-1), + a.view(-1), + w1, + flat_a_scale, + flat_w_scale, + sorted_token_ids, + sorted_expert_ids, + sw, + num_valid_ids, + token_num, + inter_dim, + model_dim, + size_expert_ids, + stream, + ) + return out + + # Public API @@ -696,6 +1033,7 @@ def flydsl_moe_stage1( a_scale_one: bool = False, xcd_swizzle: int = 0, swiglu_limit: float = 0.0, + quant_type: str = "", ): """Fused gate+up GEMM (MOE stage1). @@ -725,6 +1063,36 @@ def flydsl_moe_stage1( if a_dtype == "fp4": model_dim = model_dim * 2 + # ── FP8 a8w8 blockscale path (per_1x128/per_128x128) ────────────────── + # Dispatch on the (a_dtype, b_dtype, quant_type) triplet; no new dtype string. 
+ if ( + a_dtype == "fp8" + and b_dtype == "fp8" + and quant_type.lower() in ("per_1x128", "per_128x128") + ): + return _flydsl_moe_stage1_blockscale( + a=a, + w1=w1, + sorted_token_ids=sorted_token_ids, + sorted_expert_ids=sorted_expert_ids, + num_valid_ids=num_valid_ids, + out=out, + topk=topk, + tile_m=tile_m, + tile_n=tile_n, + tile_k=tile_k, + out_dtype=out_dtype, + w1_scale=w1_scale, + a1_scale=a1_scale, + sorted_weights=sorted_weights, + k_batch=k_batch, + waves_per_eu=waves_per_eu, + E=E, + inter_dim=inter_dim, + model_dim=model_dim, + token_num=token_num, + ) + _need_fp4 = out_dtype == "fp4" _need_fp8 = out_dtype == "fp8" _fuse_any_quant = _need_fp4 or _need_fp8 @@ -1040,6 +1408,8 @@ def flydsl_moe_stage2( inter_dim_pad: int = 0, xcd_swizzle: int = 0, bias: Optional[torch.Tensor] = None, + waves_per_eu: int = 2, + quant_type: str = "", ) -> torch.Tensor: """Down-projection GEMM (MOE stage2). Supports atomic/reduce modes. @@ -1064,6 +1434,19 @@ def flydsl_moe_stage2( if a_dtype == "fp4": inter_dim = inter_dim * 2 + _is_blockscale = ( + a_dtype == "fp8" + and b_dtype == "fp8" + and quant_type.lower() in ("per_1x128", "per_128x128") + ) + if _is_blockscale: + if a2_scale is None or w2_scale is None: + raise ValueError("blockscale stage2 requires a2_scale and w2_scale") + if out_dtype not in ("f16", "fp16", "bf16"): + raise ValueError( + f"blockscale stage2 only supports f16/bf16, got {out_dtype!r}" + ) + torch_out_dtype = torch.bfloat16 if out_dtype == "bf16" else torch.float16 if out is None: alloc_fn = torch.zeros if accumulate else torch.empty @@ -1171,6 +1554,8 @@ def flydsl_moe_stage2( inter_dim_pad=inter_dim_pad, xcd_swizzle=xcd_swizzle, enable_bias=(bias is not None), + waves_per_eu=waves_per_eu, + quant_type=quant_type, ) _run_compiled(exe, args) diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py index f2a40f9c92..54b51b5130 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py 
+++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py @@ -55,6 +55,8 @@ get_flydsl_stage2_kernels, get_flydsl_stage1_kernels_int4_bf16, get_flydsl_stage2_kernels_int4_bf16, + get_flydsl_stage1_kernels_blockscale, + get_flydsl_stage2_kernels_blockscale, flydsl_moe_stage1, flydsl_moe_stage2, ) @@ -457,6 +459,7 @@ def run_flydsl_stage1_out( a_scale_one=a_scale_one, xcd_swizzle=kparams.get("xcd_swizzle", 0), bias=bias, + quant_type=kparams.get("quant_type", ""), ) if isinstance(result, tuple): out_raw = result[0] @@ -516,6 +519,8 @@ def run_flydsl_stage2_out( b_nt=kparams.get("b_nt", 0), xcd_swizzle=kparams.get("xcd_swizzle", 0), bias=bias, + waves_per_eu=kparams.get("waves_per_eu", 2), + quant_type=kparams.get("quant_type", ""), ) @staticmethod @@ -1341,6 +1346,50 @@ def run_torch_moe_stage1( ref1 = ref1.view(ref1.shape[0], topk, -1) return ref1 + @staticmethod + def run_torch_moe_stage1_blockscale_bf16( + a1_qt, + w1_qt, + w2_qt, + topk_weights, + topk_ids, + a1_scale, + w1_scale, + sorted_ids=None, + num_valid_ids=None, + w1_bias=None, + dtype=dtypes.bf16, + activation=ActivationType.Silu, + quant_type=QuantType.per_1x128, + doweight_stage1=False, + topk=1, + blockM=32, + ): + """BF16-output reference for a8w8 per_1x128 stage1 tuning. + + ``run_torch_moe_stage1`` quantizes the per_1x128 path output back to fp8 + to match CK/ASM stage1 outputs. That post-quant is sensitive to small + bf16 GEMM noise (each 1x128 block's amax shifts → scale shifts → fp8 + codes shift), producing huge spurious mismatch when comparing the fp8 + bytes — even though decoded values are within ~0.1 typical noise. + For FlyDSL stage1 tuner candidates we compare in bf16 to stay close to + true numerical accuracy. 
+ """ + return torch_moe_stage1( + a1_qt, + w1_qt, + w2_qt, + topk_weights, + topk_ids, + activation=activation, + quant_type=quant_type, + dtype=dtype, + a1_scale=a1_scale, + w1_scale=w1_scale, + w1_bias=w1_bias, + doweight=doweight_stage1, + ) + @staticmethod def run_torch_moe_stage2( a2_qt, @@ -2130,6 +2179,20 @@ def gen_2stages_task(self, key, blockMs): if q_type == QuantType.per_1x32 and q_dtype_w == dtypes.i4x2: return tasks_ck + # The CK2stages a8w8 blockscale templates currently fail to build + # against the in-container ROCm/CK headers (device_moe_gemm_blockscale.hpp + # template signature mismatch). Repeated rebuild attempts also cause + # the worker pool to stall on the build baton lock. Skip CK candidates + # for fp8 blockscale and rely on ASM + FlyDSL until the CK side is + # re-aligned. Set AITER_FMOE_TUNE_INCLUDE_CK_BLOCKSCALE=1 to override. + if ( + q_type == QuantType.per_1x128 + and q_dtype_a == dtypes.fp8 + and q_dtype_w == dtypes.fp8 + and os.environ.get("AITER_FMOE_TUNE_INCLUDE_CK_BLOCKSCALE", "0") != "1" + ): + return tasks_ck + # CK2stages codegen does not support SwiGLU activation. GPT-OSS MXFP4 # cases are covered by FlyDSL (or the a8w4 CK-Tile path above). if ( @@ -2812,6 +2875,187 @@ def gen_flydsl_i4_2stages_task(self, info, blockMs): return tasks_flydsl + def gen_flydsl_blockscale_2stages_task(self, info, blockMs): + """Generate FlyDSL tasks for FP8 a8w8 blockscale (per_1x128) MoE. + + Filter: q_type == per_1x128 AND q_dtype_a == fp8 AND q_dtype_w == fp8. + Routes through (a_dtype, b_dtype, quant_type) = (fp8, fp8, per_1x128). 
+ """ + tasks_flydsl = [] + if not is_flydsl_available(): + return tasks_flydsl + ( + cu_num, + token, + model_dim, + inter_dim, + expert, + topk, + act_type, + dtype, + q_dtype_a, + q_dtype_w, + q_type, + use_g1u1, + doweight_stage1, + ) = info + + if not ( + q_type == QuantType.per_1x128 + and q_dtype_a == dtypes.fp8 + and q_dtype_w == dtypes.fp8 + ): + return tasks_flydsl + + out_dtype_str = "bf16" if dtype == dtypes.bf16 else "f16" + a_dtype_str = "fp8" + + flydsl_s1_kernels = get_flydsl_stage1_kernels_blockscale(out_dtype_str) + flydsl_s2_kernels = get_flydsl_stage2_kernels_blockscale(out_dtype_str) + + for blockM in blockMs: + if blockM not in [16, 32, 64, 128] or not use_g1u1: + continue + + for kname, kparams in flydsl_s1_kernels.items(): + ktm = kparams["tile_m"] + if ktm != blockM: + continue + # split-K compatibility (mirrors moe_blockscale kernel validation). + kb = kparams.get("k_batch", 1) + if kb > 1: + if model_dim % kb != 0: + continue + k_per_batch = model_dim // kb + if k_per_batch % kparams["tile_k"] != 0: + continue + k_tiles = k_per_batch // kparams["tile_k"] + if k_tiles < 4 or k_tiles % 2 != 0: + continue + if k_per_batch % 128 != 0: # scale_block_k + continue + + ref_args_extra = ( + [0, 10, 11, 12, 13, 3, 4, 5, 8, 22], + dtype, + act_type, + q_type, + doweight_stage1, + topk, + blockM, + ) + tasks_flydsl.append( + ( + (info, "stage1", kname, blockM), + FmoeTuner.generate_data_2stages, + ( + token, + model_dim, + inter_dim, + expert, + topk, + act_type, + dtype, + q_dtype_a, + q_dtype_w, + q_type, + use_g1u1, + doweight_stage1, + blockM, + 1, + ), + FmoeTuner.run_flydsl_stage1_out, + ( + [0, 16, 5, 6, 7, 8, 18, 14, 22], + dtype, + topk, + kparams, + blockM, + q_dtype_a, + q_type, + act_type, + ), + {}, + FmoeTuner.run_torch_moe_stage1_blockscale_bf16, + ref_args_extra, + {}, + (None), + # bf16 tol: FlyDSL stage1 noise vs torch.fp32 ref is + # bounded; atol 0.5 / rtol 0.1 leaves headroom for the + # ~0.1 max-elem and ~0.07 mean errors 
observed. + 0.1, + 0.5, + True, + ) + ) + + # Reduce-mode candidates trade contention-free GEMM writes for an + # extra zero+sum pass on a (token, topk, model_dim) staging buffer. + # Whether reduce beats atomic depends on (token, topk, model_dim); + # we keep the candidates available so tuning can pick the winner + # per shape, but allow opting out via env var for tuning speed. + _ENABLE_REDUCE = ( + os.environ.get("AITER_FLYDSL_STAGE2_REDUCE", "1") not in ("0", "false", "False") + ) + for kname, kparams in flydsl_s2_kernels.items(): + s2_tile_m = kparams["tile_m"] + if s2_tile_m != blockM: + continue + if (not _ENABLE_REDUCE) and kparams.get("mode", "atomic") == "reduce": + continue + s2_kparams = {**kparams, "sort_block_m": blockM} + + s2_ref_args = ( + [0, 10, 11, 12, 13, 3, 4, 22], + dtype, + q_type, + doweight_stage1, + ) + tasks_flydsl.append( + ( + (info, "stage2", kname, blockM), + FmoeTuner.generate_data_2stages, + ( + token, + model_dim, + inter_dim, + expert, + topk, + act_type, + dtype, + q_dtype_a, + q_dtype_w, + q_type, + use_g1u1, + doweight_stage1, + blockM, + 2, + ), + FmoeTuner.run_flydsl_stage2_out, + ( + [0, 17, 5, 6, 7, 8, 19, 14, 9, 22], + dtype, + topk, + s2_kparams, + blockM, + q_type, + act_type, + ), + {}, + FmoeTuner.run_torch_moe_stage2, + s2_ref_args, + {}, + (None), + # Same rationale as stage1 — cascading fp8 quant chain + # adds another quant->dequant step relative to torch ref. 
+ 0.15, + 0.5, + None, + ) + ) + + return tasks_flydsl + def run_config(self, args): from aiter.fused_moe import fused_moe, fused_topk from aiter.test_common import run_perftest, checkAllclose @@ -3106,6 +3350,7 @@ def tune( tasks_ck.extend(self.gen_2stages_task(info, blockMs)) tasks_ck.extend(self.gen_flydsl_2stages_task(info, blockMs)) tasks_ck.extend(self.gen_flydsl_i4_2stages_task(info, blockMs)) + tasks_ck.extend(self.gen_flydsl_blockscale_2stages_task(info, blockMs)) task_1stage.extend(self.gen_1stage_asm_task(info)) if tasks is None and tasks_ck is None and task_1stage is None: print("no moe solution can tune for ", line)