From 877c200d51e6f69bac90884086bb8f6ecfa03fb5 Mon Sep 17 00:00:00 2001 From: "Li, Tingqian" Date: Fri, 22 May 2026 02:39:47 +0000 Subject: [PATCH 1/2] add asmjit AOT kernels for qwen35/Hunyuan3 --- .../hunyuan3_fp8_per_tensor_tuned_fmoe.csv | 19 + .../hunyuan3_fp8_per_tensor_untuned_fmoe.csv | 19 + .../qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv | 14 + .../qwen3_5_397b_fp8_ptpc_untuned_fmoe.csv | 19 + aiter/fused_moe.py | 31 ++ aiter/fused_moe_asmjit_aot.py | 338 ++++++++++++++++++ aiter/utility/base_tuner.py | 6 + .../gemm_moe_tune.py | 244 ++++++++++++- csrc/cpp_itfs/hsaco_tools.py | 261 ++++++++++++++ ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 0 -> 19384 bytes ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 0 -> 11800 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 0 -> 20408 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 0 -> 12176 bytes ...LOCK_N=1024-atomic_write=False-STAGES=3.co | Bin 0 -> 13624 bytes ...BLOCK_N=1024-atomic_write=True-STAGES=3.co | Bin 0 -> 13944 bytes ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 0 -> 18136 bytes ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 0 -> 11272 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 0 -> 18056 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 0 -> 11264 bytes ...TILE_SIZE_N=64-quant_type_str=per_Token.co | Bin 0 -> 8456 bytes ...ILE_SIZE_N=64-quant_type_str=per_Tensor.co | Bin 0 -> 9992 bytes ...ith_silu=True-quant_type_str=per_Tensor.co | Bin 0 -> 9160 bytes ...with_silu=True-quant_type_str=per_Token.co | Bin 0 -> 9160 bytes ...th_silu=False-quant_type_str=per_Tensor.co | Bin 0 -> 8640 bytes ...ith_silu=False-quant_type_str=per_Token.co | Bin 0 -> 8712 bytes ...ith_silu=True-quant_type_str=per_Tensor.co | Bin 0 -> 8640 bytes ...with_silu=True-quant_type_str=per_Token.co | Bin 0 -> 8712 bytes ..._gemm_final_reduce_bf16-TOPK=10-OC=4096.co | Bin 0 -> 14544 bytes ...e_gemm_final_reduce_bf16-TOPK=8-OC=4096.co | Bin 0 -> 13008 bytes 29 files changed, 948 insertions(+), 3 deletions(-) create mode 100644 aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv create mode 100644 aiter/configs/model_configs/hunyuan3_fp8_per_tensor_untuned_fmoe.csv create mode 100644 aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv create mode 100644 aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_untuned_fmoe.csv create mode 100644 aiter/fused_moe_asmjit_aot.py create mode 100644 csrc/cpp_itfs/hsaco_tools.py create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=False-STAGES=3.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=True-STAGES=3.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_splitk-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=64-quant_type_str=per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_splitk-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=64-quant_type_str=per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_final_reduce_bf16-TOPK=10-OC=4096.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_gemm_final_reduce_bf16-TOPK=8-OC=4096.co diff --git a/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv b/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv new file mode 100644 index 0000000000..9b46beaee0 --- /dev/null +++ b/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv @@ -0,0 +1,19 @@ +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag +80,1,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,15.358300000000042,fused_moe_asmjit_aot__16_True_False,5.93%,0.0,,0%,15.358300000000042,1,0,2.46,29495.26, +80,2,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,30.061942528735862,fused_moe_asmjit_aot__16_True_False,4.43%,0.0,,0%,30.061942528735862,1,0,2.51,15069.2, +80,4,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,42.2820199999999,fused_moe_asmjit_aot__16_True_False,3.32%,0.0,,0%,42.2820199999999,1,0,3.57,10714.58, +80,8,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,62.29127999999991,fused_moe_asmjit_aot__16_True_False,3.73%,0.0,,0%,62.29127999999991,1,0,4.85,7273.62, +80,16,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,94.30098999999973,fused_moe_asmjit_aot__16_True_False,3.65%,0.0,,0%,94.30098999999973,1,0,6.4,4805.69, +80,32,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,131.00703999999988,fused_moe_asmjit_aot__16_True_False,3.47%,0.0,,0%,131.00703999999988,1,0,9.22,3460.72, +80,64,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,226.26340000000016,fused_moe_asmjit_aot__64_True_True,2.21%,0.0,,0%,226.26340000000016,1,0,10.68,2005.5, +80,128,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,242.56287755102025,fused_moe_asmjit_aot__64_True_True,2.52%,0.0,,0%,242.56287755102025,1,0,19.92,1873.98, +80,256,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,249.7409595959585,fused_moe_asmjit_aot__64_True_True,2.73%,0.0,,0%,249.7409595959585,1,0,38.69,1826.41, +80,512,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,268.0743711340202,fused_moe_asmjit_aot__64_True_True,2.53%,0.0,,0%,268.0743711340202,1,0,72.1,1713.24, +80,1024,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,310.66775000000007,fused_moe_asmjit_aot__64_True_True,2.35%,0.0,,0%,310.66775000000007,1,0,124.42,1498.6, +80,2048,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,564.0819578947369,fused_moe_asmjit_aot__64_True_True,2.04%,0.0,,0%,564.0819578947369,1,0,137.05,847.66, +80,4096,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,906.8664583333366,fused_moe_asmjit_aot__64_True_True,2.14%,0.0,,0%,906.8664583333366,1,0,170.5,555.01, +80,8192,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,1658.349010204088,fused_moe_asmjit_aot__64_True_True,2.20%,0.0,,0%,1658.349010204088,1,0,186.47,333.86, +80,16384,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,3156.37568686868,fused_moe_asmjit_aot__64_True_True,2.19%,0.0,,0%,3156.37568686868,1,0,195.94,207.3, +80,32768,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,6262.433448979595,fused_moe_asmjit_aot__64_True_True,2.24%,0.0,,0%,6262.433448979595,1,0,197.52,136.63, +80,65536,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,12425.184968085092,fused_moe_asmjit_aot__64_True_True,2.28%,0.0,,0%,12425.184968085092,1,0,199.1,101.27, +80,131072,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,24933.576175257756,fused_moe_asmjit_aot__64_True_True,2.28%,0.0,,0%,24933.576175257756,1,0,198.44,82.76, diff --git a/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_untuned_fmoe.csv b/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_untuned_fmoe.csv new file mode 100644 index 0000000000..d1d589dc79 --- /dev/null +++ b/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_untuned_fmoe.csv @@ -0,0 +1,19 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +1,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +2,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +4,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +8,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +16,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +32,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +64,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +128,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +256,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +512,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +1024,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +2048,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +4096,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +8192,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +16384,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +32768,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +65536,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 +131072,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0 diff --git a/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv b/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv new file mode 100644 index 0000000000..46671db841 --- /dev/null +++ b/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv @@ -0,0 +1,14 @@ +cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag +80,1,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,13.771117021276607,fused_moe_asmjit_aot__16_True_False,0.42%,0.0,,0%,13.771117021276607,1,0,2.28,58478.82, +80,2,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,34.621979999999716,fused_moe_asmjit_aot__16_True_False,0.54%,0.0,,0%,34.621979999999716,1,0,1.82,23260.68, +80,4,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,47.58002000000011,fused_moe_asmjit_aot__16_True_False,0.58%,0.0,,0%,47.58002000000011,1,0,2.64,16926.34, +80,8,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,67.1539799999998,fused_moe_asmjit_aot__16_True_False,0.76%,0.0,,0%,67.1539799999998,1,0,3.75,11993.4, +80,16,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,101.03224000000053,fused_moe_asmjit_aot__16_True_False,0.24%,0.0,,0%,101.03224000000053,1,0,4.98,7972.73, +80,32,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,142.52629999999968,fused_moe_asmjit_aot__16_True_False,0.26%,0.0,,0%,142.52629999999968,1,0,7.06,5652.99, +80,2048,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,604.5281000000002,fused_moe_asmjit_aot__64_True_True,1.68%,0.0,,0%,604.5281000000002,1,0,106.57,1373.75, +80,4096,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,996.1099100000009,fused_moe_asmjit_aot__128_True_True,1.81%,0.0,,0%,996.1099100000009,1,0,129.35,858.98, +80,8192,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,1717.7659090909092,fused_moe_asmjit_aot__64_True_True,1.82%,0.0,,0%,1717.7659090909092,1,0,150.02,527.41, +80,16384,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,3009.697379999994,fused_moe_asmjit_aot__128_True_True,1.82%,0.0,,0%,3009.697379999994,1,0,171.25,334.46, +80,32768,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,5728.849888888895,fused_moe_asmjit_aot__128_True_True,1.83%,0.0,,0%,5728.849888888895,1,0,179.93,210.86, +80,65536,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,11492.974020618547,fused_moe_asmjit_aot__128_True_True,1.85%,0.0,,0%,11492.974020618547,1,0,179.38,140.14, +80,131072,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,22660.053193877586,fused_moe_asmjit_aot__128_True_True,1.85%,0.0,,0%,22660.053193877586,1,0,181.96,106.62, diff --git a/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_untuned_fmoe.csv b/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_untuned_fmoe.csv new file mode 100644 index 0000000000..76a4be4677 --- /dev/null +++ b/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_untuned_fmoe.csv @@ -0,0 +1,19 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +1,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +2,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +4,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +8,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +16,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +32,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +64,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +128,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +256,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +512,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +1024,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +2048,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +4096,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +8192,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +16384,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +32768,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +65536,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 +131072,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0 diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 131a92d3c7..e7cc14dd30 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -346,6 +346,22 @@ def fused_moe_( gate_mode, ) + if metadata.stage0 is not None: + return metadata.stage0( + hidden_states, + w1, + w2, + topk_weight, + topk_ids, + activation, + quant_type, + w1_scale, + w2_scale, + expert_mask, + num_local_tokens, + moe_sorting_dispatch_policy, + ) + block_size_M = metadata.block_m if block_size_M is None else block_size_M # Ensure block_size_M is int (metadata.block_m from CSV may be float) if block_size_M is not None: @@ -719,6 +735,7 @@ class MOEMetadata: use_non_temporal_load: bool = True fuse_quant: str = "" stage2_has_bias: bool = False + stage0: Callable = None def _needs_swiglu_bias_support(dtype, quant_type): @@ -1131,6 +1148,20 @@ def _lookup_cfg(c2s): f"[fused_moe] using {'1stage' if run_1stage else '2stage'}{' xbf16' if run_1stage_xbf16 else ''} {'default' if cfg is None else tag} for {keys} " ) + if kernelName1.startswith("fused_moe_asmjit_aot"): + from aiter.fused_moe_asmjit_aot import fused_moe_asmjit_aot + + return MOEMetadata( + None, + None, + block_m, + ksplit, + run_1stage, + stage0=functools.partial( + fused_moe_asmjit_aot, config_string=kernelName1.split("__")[1] + ), + ) + def get_block_m() -> int: if q_dtype_a == dtypes.fp8: return 32 diff --git a/aiter/fused_moe_asmjit_aot.py b/aiter/fused_moe_asmjit_aot.py new file mode 100644 index 0000000000..dc9884b733 --- /dev/null +++ b/aiter/fused_moe_asmjit_aot.py @@ -0,0 +1,338 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +from typing import Any, Optional + +import torch + +import aiter +from aiter import ActivationType, QuantType +from aiter.jit.utils.chip_info import get_gfx +from aiter.fused_moe import moe_sorting +from csrc.cpp_itfs.hsaco_tools import hsaco + +from dataclasses import dataclass + + +@dataclass +class Config: + BLOCK_M: int + use_down_loopn: bool + use_prefill: bool + + def to_string(self): + return ( + str(self.BLOCK_M) + + "_" + + str(self.use_down_loopn) + + "_" + + str(self.use_prefill) + ) + + @classmethod + def from_string(cls, data: str): + parts = data.split("_") + return cls(*[eval(p) for p in parts]) + + +def get_tune_space(): + return [ + Config(16, True, False).to_string(), + Config(64, True, True).to_string(), + Config(128, True, True).to_string(), + ] + + +def fused_moe_asmjit_aot( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + activation: ActivationType, + quant_type: QuantType, + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + expert_mask: Any, + num_local_tokens: Any, + moe_sorting_dispatch_policy: int, + config_string: str, +) -> Optional[torch.Tensor]: + + # decode kernel configs from kernel name + kcfgs = Config.from_string(config_string) + + B = int(hidden_states.shape[0]) + if ( + hidden_states.dtype != torch.bfloat16 + or expert_mask is not None + or activation != ActivationType.Silu + or w1.dtype != torch.float8_e4m3fnuz + or w2.dtype != torch.float8_e4m3fnuz + ): + raise Exception("Unsupported input") + if get_gfx() != "gfx942": + raise Exception("Unsupported platform") + + if quant_type != QuantType.per_Token and quant_type != QuantType.per_Tensor: + raise Exception(f"Unsupported quant_type:{quant_type}") + + qtype_str = str(quant_type).split(".")[1] + + E, N1, K1 = w1.shape + N2, K2 = w2.shape[1], w2.shape[2] + TOPK = topk_ids.shape[1] + fp8_ptpc = w1.dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz) and ( + quant_type == QuantType.per_Token + ) + num_CU = torch.cuda.get_device_properties( + hidden_states.device + ).multi_processor_count + assert N1 == 2 * K2 + + topk_w_f32 = ( + topk_weight if topk_weight.dtype == torch.float32 else topk_weight.float() + ) + + gemm1_out = torch.empty( + [B, TOPK, N1 // 2], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + + if kcfgs.use_prefill: + BLOCK_TILE_SIZE_N = 128 + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, cur_out = ( + moe_sorting( + topk_ids, + topk_weight, + E, + N2, # reduce dim is same with output dim + hidden_states.dtype, + kcfgs.BLOCK_M, + None, + None, + 0, + ) + ) + quant_func = aiter.get_hip_quant(aiter.QuantType.per_Token) + hidden_states_q, hidden_states_scale = quant_func( + hidden_states, + scale=None, + quant_dtype=w1.dtype, + num_rows=None, + ) + hsaco.fmoe_asmjit.moe_2stage_gateup( + [N1 // BLOCK_TILE_SIZE_N * sorted_expert_ids.shape[0]], + [256], + hidden_states_q, + w1, + gemm1_out, + sorted_ids, + sorted_expert_ids, + num_valid_ids, + hidden_states_scale, + w1_scale, + B, + N1 // BLOCK_TILE_SIZE_N * sorted_expert_ids.shape[0], + weight_dtype=str(w1.dtype), + TOPK=TOPK, + K=K1, + N=N1, + BLOCK_TILE_SIZE_M=kcfgs.BLOCK_M, + BLOCK_TILE_SIZE_N=BLOCK_TILE_SIZE_N, + quant_type_w=f"QuantType.{qtype_str}", + ) + gemm1_out_q, gemm1_out_scale = quant_func( + gemm1_out.view(B * TOPK, -1), + scale=None, + quant_dtype=w2.dtype, + num_rows=None, + ) + gemm2_out = torch.empty( + B, TOPK, N2, dtype=torch.bfloat16, device=gemm1_out_q.device + ) + hsaco.fmoe_asmjit.moe_2stage_down( + [1, sorted_expert_ids.shape[0]], + [256], + gemm1_out_q, + w2, + gemm2_out, # cur_out, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + gemm1_out_scale, + w2_scale, + B, + sorted_expert_ids.shape[0], + weight_dtype=str(w2.dtype), + TOPK=TOPK, + K=K2, + N=N2, + with_silu=False, + BLOCK_TILE_SIZE_M=kcfgs.BLOCK_M, + BLOCK_TILE_SIZE_N=BLOCK_TILE_SIZE_N, + quant_type_w=f"QuantType.{qtype_str}", + ) + num_WG = num_CU * 4 + num_tokens_wg = B // num_WG + num_extra_tokens = B % num_WG + hsaco.fmoe_asmjit.moe_gemm_final_reduce_bf16( + [num_WG], + [64], + gemm2_out, + cur_out, + num_tokens_wg, + num_extra_tokens, + B, + TOPK=TOPK, + OC=N2, + ) + return cur_out + + if B == 1: + assert N1 == 2 * K2 + cur_out = torch.zeros( + [1, N2], dtype=hidden_states.dtype, device=hidden_states.device + ) + hsaco.fmoe_asmjit.moe_gemm_batch1( + [N1 // 32, TOPK], + [256], + hidden_states, + w1, + gemm1_out, + topk_ids, + topk_w_f32, + w1_scale, + 1, + N1, + K1, + weight_dtype=torch.float8_e4m3fnuz, + with_silu=True, + quant_type_str=qtype_str, + ) + hsaco.fmoe_asmjit.moe_gemm_batch1( + [N2 // 32, TOPK], + [64], + gemm1_out, + w2, + cur_out, + topk_ids, + topk_w_f32, + w2_scale, + 1, + N2, + K2, + weight_dtype=torch.float8_e4m3fnuz, + with_silu=False, + quant_type_str=qtype_str, + ) + elif 2 <= B <= 32: + # Stage 1: Shared ``moe_sorting`` + ``moe_gemm_batch``; + # stage 2: Choose between ``moe_2stage_down_loopn`` and ``moe_2stage_splitk`` based on ``use_down_loopn`` condition. + BLOCK_M = kcfgs.BLOCK_M + sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, cur_out = ( + moe_sorting( + topk_ids, + topk_weight, + E, + K1, + hidden_states.dtype, + BLOCK_M, + expert_mask, + num_local_tokens, + moe_sorting_dispatch_policy, + ) + ) + grid = int(sorted_expert_ids.shape[0]) + if B * TOPK <= E: + grid = B * TOPK + + hsaco.fmoe_asmjit.moe_gemm_batch( + [N1 // 32, grid], + [256], + hidden_states, + w1, + gemm1_out, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + w1_scale, + B, + N1, + K1, + TOPK, + weight_dtype=torch.float8_e4m3fnuz, + with_silu=True, + quant_type_str=qtype_str, + ) + + BLOCK_N = 1024 + if kcfgs.use_down_loopn: + # extra checks + use_down_loopn = ( + fp8_ptpc + and (N2 // BLOCK_N) * grid >= num_CU + and N2 % BLOCK_N == 0 + and 16 <= B <= 32 + ) + else: + use_down_loopn = False + + if use_down_loopn: + gemm2_out = torch.empty( + [B, TOPK, N2], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + hsaco.fmoe_asmjit.moe_2stage_down_loopn( + [N2 // BLOCK_N, grid], + [256], + gemm1_out, + w2, + gemm2_out, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + w2_scale, + B, + weight_dtype=torch.float8_e4m3fnuz, + TOPK=TOPK, + K=K2, + N=N2, + BLOCK_TILE_SIZE_M=16, + BLOCK_TILE_SIZE_N=16, + fp8_ptpc=True, + BLOCK_N=BLOCK_N, + atomic_write=False, + STAGES=3, + ) + cur_out = torch.sum(gemm2_out, dim=1) + else: + BLOCK_TILE_SIZE_N = 64 + hsaco.fmoe_asmjit.moe_2stage_splitk( + [N2 // BLOCK_TILE_SIZE_N, grid], + [64], + gemm1_out, + w2, + cur_out, + sorted_ids, + sorted_weights, + sorted_expert_ids, + num_valid_ids, + w2_scale, + B, + weight_dtype=torch.float8_e4m3fnuz, + TOPK=TOPK, + K=K2, + N=N2, + with_silu=False, + BLOCK_TILE_SIZE_M=16, + BLOCK_TILE_SIZE_N=BLOCK_TILE_SIZE_N, + quant_type_str=qtype_str, + ) + else: + raise Exception(f"Unsupported batch-size {B}") + return cur_out diff --git a/aiter/utility/base_tuner.py b/aiter/utility/base_tuner.py index f887451651..1890ab1bff 100644 --- a/aiter/utility/base_tuner.py +++ b/aiter/utility/base_tuner.py @@ -201,6 +201,12 @@ def _setup_common_arguments(self): "If a tuned CSV path is given, read shapes and kernels from it; " "otherwise read shapes from -i and run with default kernels.", ) + self.parser.add_argument( + "--e2e_tune", + action="store_true", + required=False, + help="Run an extra round of e2e tuning after main tuning is done, using production-op benchmark as the indicator", + ) self.parser.add_argument( "--compare", action="store_true", diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py index f2a40f9c92..382914e39b 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py @@ -23,6 +23,8 @@ cktile_moe_stage1, cktile_moe_stage2, ) +from aiter.fused_moe_asmjit_aot import fused_moe_asmjit_aot +from aiter.fused_moe_asmjit_aot import get_tune_space from aiter import ck_moe_stage1_fwd, ck_moe_stage2_fwd, dtype2str_dict from aiter.ops.shuffle import ( shuffle_weight, @@ -2812,10 +2814,12 @@ def gen_flydsl_i4_2stages_task(self, info, blockMs): return tasks_flydsl - def run_config(self, args): + def run_config(self, args, target_fused_moe=None, try_extra_ref=False): from aiter.fused_moe import fused_moe, fused_topk from aiter.test_common import run_perftest, checkAllclose + if target_fused_moe is None: + target_fused_moe = fused_moe untunedf = self.untunedf results = [] for i in range(len(untunedf)): @@ -2985,7 +2989,7 @@ def run_config(self, args): a1_qt, a1_scale = torch_quant(hidden, quant_dtype=q_dtype_a) out, us = run_perftest( - fused_moe, + target_fused_moe, hidden, w1_qt_fmoe, w2_qt_fmoe, @@ -3019,6 +3023,62 @@ def run_config(self, args): err_ratio = 1.0 else: err_ratio = checkAllclose(out, ref, msg=f"run_config {shape_str}") + if try_extra_ref: + # try compare with extra references (due to different implementations) + try: + # use weight-decompression only algorithm as second reference + w1_deq = w1_qt.to(dtype=hidden.dtype) * w1_scale.view( + w1_scale.shape[0], -1, 1 + ).to(dtype=hidden.dtype) + w2_deq = w2_qt.to(dtype=hidden.dtype) * w2_scale.view( + w2_scale.shape[0], -1, 1 + ).to(dtype=hidden.dtype) + + ref2 = self.torch_moe_2stages( + hidden, + w1_deq, + w2_deq, + topk_weights, + topk_ids, + dtype=dtype, + activation=act_type, + quant_type=QuantType.No, + doweight_stage1=doweight_stage1, + ) + err_ratio2 = checkAllclose( + out, ref2, msg=f"run_config {shape_str}" + ) + err_ratio = min(err_ratio, err_ratio2) + except Exception: + pass + + if q_type == QuantType.per_Tensor: + try: + # inputs are quantized per-Token while weights are quantized per-Tensor + a1_qt, a1_scale = aiter.get_torch_quant( + QuantType.per_Token + )(hidden, quant_dtype=q_dtype_a) + ref2 = self.torch_moe_2stages( + a1_qt, + w1_qt, + w2_qt, + topk_weights, + topk_ids, + a1_scale=a1_scale, + w1_scale=w1_scale, + w2_scale=w2_scale, + dtype=dtype, + activation=act_type, + quant_type=QuantType.per_Token, + doweight_stage1=doweight_stage1, + ) + err_ratio2 = checkAllclose( + out, ref2, msg=f"run_config {shape_str}" + ) + err_ratio = min(err_ratio, err_ratio2) + except Exception: + pass + if err_ratio <= args.errRatio: status = "ok" else: @@ -3029,6 +3089,7 @@ def run_config(self, args): "e2e_us": us, "kernel_us": kernel_us, "status": status, + "err_ratio": err_ratio, } ) except Exception as e: @@ -3038,6 +3099,7 @@ def run_config(self, args): "e2e_us": -1, "kernel_us": kernel_us, "status": f"error:{e}", + "err_ratio": 1, } ) finally: @@ -3666,6 +3728,178 @@ def pre_process(self, args): ) self.untunedf = self.untunedf[~mask] + def e2e_tune(self, args): + """ + choosing best kernels based on (stage1_us + stage2_us) or (single_stage_us) + may overlook some overheads between stages, and this e2e tune is a complement. + """ + results_base = self.run_config(args, target_fused_moe=None, try_extra_ref=True) + better_kernels = {} + cu_num = self.get_cu_num() + + for i in range(len(self.untunedf)): + e2e_us = results_base[i]["e2e_us"] + err_ratio = results_base[i]["err_ratio"] + row = self.untunedf.iloc[i] + cu_num = int(row["cu_num"]) + token = int(row["token"]) + model_dim = int(row["model_dim"]) + inter_dim = int(row["inter_dim"]) + expert = int(row["expert"]) + topk = int(row["topk"]) + act_type = eval(row["act_type"]) + dtype = eval(row["dtype"]) + q_dtype_a = eval(row["q_dtype_a"]) + q_dtype_w = eval(row["q_dtype_w"]) + q_type = eval(row["q_type"]) + q_type = QuantType.per_1x128 if q_type == QuantType.per_128x128 else q_type + use_g1u1 = bool(row["use_g1u1"]) + doweight_stage1 = bool(row["doweight_stage1"]) + key = ( + cu_num, + token, + model_dim, + inter_dim, + expert, + topk, + act_type, + dtype, + q_dtype_a, + q_dtype_w, + q_type, + use_g1u1, + doweight_stage1, + ) + keyname = " ".join(map(str, row[self.keys].values)) + better_kernels[i] = { + "name": keyname, + "key": key, + "row": row, + "kernel_name": None, + "e2e_us": e2e_us, + "err_ratio": err_ratio, + "e2e_us_base": e2e_us, + "err_ratio_base": err_ratio, + } + print(keyname, e2e_us, err_ratio) + + from functools import partial + + def target_fused_moe( + hidden_states, + w1, # [expert(local_expert:EP), inter_dim*2, dim] N,K + w2, # [expert(local_expert:EP), dim, inter_dim] + topk_weight, + topk_ids, + expert_mask=None, + activation=ActivationType.Silu, + quant_type=QuantType.No, + doweight_stage1=False, + w1_scale=None, + w2_scale=None, + num_local_tokens=None, + moe_sorting_dispatch_policy=0, + dtype=None, + config_string="", + ): + return fused_moe_asmjit_aot( + hidden_states, + w1, + w2, + topk_weight, + topk_ids, + activation, + quant_type, + w1_scale, + w2_scale, + expert_mask, + num_local_tokens, + moe_sorting_dispatch_policy, + config_string=config_string, + ) + + GREEN = "\033[0;32m" + YELLOW = "\033[1;33m" + RED = "\033[0;31m" + END = "\033[0m" + for config_string in get_tune_space(): + results_cur = self.run_config( + args, + target_fused_moe=partial(target_fused_moe, config_string=config_string), + try_extra_ref=True, + ) + block_m = 16 + ksplit = 0 + run_1stage = 1 + err1 = "0%" + err2 = "0%" + kernelName1 = "fused_moe_asmjit_aot__" + config_string + kernelName2 = "" + xbf16 = 0 + for i in range(len(self.untunedf)): + k = better_kernels[i] + e2e_us = results_cur[i]["e2e_us"] + status = results_cur[i]["status"] + err_ratio = results_cur[i]["err_ratio"] + # skip invalid kernel + if e2e_us < 0 or status != "ok": + print( + f"{k['name']} {RED} {e2e_us=:.3f} {status=} {END} {kernelName1}" + ) + continue + row = self.untunedf.iloc[i] + print( + f"{k['name']} {YELLOW} {float(k['e2e_us_base']):.3f}us -> {float(e2e_us):.3f}us (err: {k['err_ratio']*100:.0f}%) {END} {kernelName1}" + ) + if e2e_us < k["e2e_us"]: + k["e2e_us"] = e2e_us + k["err_ratio"] = err_ratio + k["kernel_name"] = kernelName1 + tflops, bw = self.calculate( + (k["key"], "stage1", kernelName1, block_m, e2e_us, err1) + ) + k["results"] = ( + block_m, + ksplit, + e2e_us, + kernelName1, + f"{err_ratio*100:.2f}%", + 0.0, + kernelName2, + err2, + e2e_us, + run_1stage, + xbf16, + tflops, + bw, + ) + + tune_results = [] + + for i, k in better_kernels.items(): + if k["kernel_name"] is None: + continue + tune_results.append([*k["row"].values, *k["results"]]) + print( + f"{k['name']} {GREEN} {float(k['e2e_us_base']):.3f}us -> {float(k['e2e_us']):.3f}us (err: {k['err_ratio_base']*100:.0f}% -> {k['err_ratio']*100:.0f}%) {END} {k['kernel_name']}" + ) + + new_tunedf = pd.DataFrame(tune_results, columns=self.columns) + output_file = self.get_out_file(args.tune_file) + old_tunedf = self.get_tuned_gemm_list(output_file) + + if "_tag" == old_tunedf.columns[-1]: + new_tunedf["_tag"] = "" + self.columns.append("_tag") + + resultdf = self.update_tunedf(old_tunedf, new_tunedf) + + if "_tag" == old_tunedf.columns[-1]: + self.columns.pop(-1) + + resultdf.to_csv(output_file, index=False) + print(f"{args.tune_file} has been updated!") + if __name__ == "__main__": key = [ @@ -3701,4 +3935,8 @@ def pre_process(self, args): tuner = FmoeTuner("fmoeTuner", key, resultList, "fmoe tuner") args = tuner.parse_args() - tuner.run(args, False) + if args.e2e_tune: + tuner.pre_process(args) + tuner.e2e_tune(args) + else: + tuner.run(args, False) diff --git a/csrc/cpp_itfs/hsaco_tools.py b/csrc/cpp_itfs/hsaco_tools.py new file mode 100644 index 0000000000..6435e2b1b1 --- /dev/null +++ b/csrc/cpp_itfs/hsaco_tools.py @@ -0,0 +1,261 @@ +import ctypes +from ctypes.util import find_library +import functools +import torch +import os +import subprocess + +from aiter.jit.utils.chip_info import get_gfx +from csrc.cpp_itfs.utils import AITER_CORE_DIR + +_is_hip_library_api_supported_ = False + + +@functools.cache +def get_amdhip(): + global _is_hip_library_api_supported_ + + try: + lib = ctypes.CDLL(find_library("amdhip64")) + except Exception as e: + print(e) + torch_amdhip64 = os.path.join(torch.__path__[0], "lib", "libamdhip64.so") + print(f"Try {torch_amdhip64} instead...") + lib = ctypes.CDLL(torch_amdhip64) + lib.hipModuleLoad.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_char_p] + lib.hipModuleLoad.restype = ctypes.c_int32 + lib.hipModuleGetFunction.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_void_p, + ctypes.c_char_p, + ] + lib.hipModuleGetFunction.restype = ctypes.c_int32 + lib.hipModuleLaunchKernel.argtypes = [ + ctypes.c_void_p, + ctypes.c_uint32, + ctypes.c_uint32, + ctypes.c_uint32, + ctypes.c_uint32, + ctypes.c_uint32, + ctypes.c_uint32, + ctypes.c_uint32, # unsigned int sharedMemBytes + ctypes.c_void_p, # hipStream_t stream + ctypes.c_void_p, # void **kernelParams + ctypes.c_void_p, # void **extra + ] + lib.hipModuleLaunchKernel.restype = ctypes.c_int32 + lib.hipGetErrorString.argtypes = [ctypes.c_int32] + lib.hipGetErrorString.restype = ctypes.c_char_p + + try: + lib.hipLibraryLoadFromFile.restype = ctypes.c_int32 + lib.hipLibraryLoadFromFile.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + ctypes.c_char_p, + ctypes.c_void_p, # hipJitOption *jitOptions + ctypes.c_void_p, # void **jitOptionsValues + ctypes.c_uint32, # unsigned int numJitOptions, + ctypes.c_void_p, # hipLibraryOption *libraryOptions + ctypes.c_void_p, # void **libraryOptionValues + ctypes.c_uint32, # unsigned int numLibraryOptions + ] + + lib.hipLibraryGetKernelCount.restype = ctypes.c_int32 + lib.hipLibraryGetKernelCount.argtypes = [ + ctypes.POINTER(ctypes.c_uint32), # unsigned int *count, + ctypes.c_void_p, # hipLibrary_t library + ] + + lib.hipLibraryEnumerateKernels.restype = ctypes.c_int32 + lib.hipLibraryEnumerateKernels.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), # hipKernel_t *kernels + ctypes.c_uint32, # unsigned int numKernels, + ctypes.c_void_p, # hipLibrary_t library + ] + + lib.hipKernelGetName.restype = ctypes.c_int32 + lib.hipKernelGetName.argtypes = [ + ctypes.POINTER(ctypes.c_char_p), # const char **name + ctypes.c_void_p, # hipKernel_t kernel + ] + _is_hip_library_api_supported_ = True + except Exception: + _is_hip_library_api_supported_ = False + + return lib + + +def hip_check_error(err, *args): + if err != 0: + raise Exception( + "HIP error:" + + get_amdhip().hipGetErrorString(err).decode("utf-8") + + repr(args) + ) + + +@functools.cache +def get_lib(lib_fpath): + hip = get_amdhip() + p_lib = ctypes.c_void_p() + hip_check_error( + ( + hip.hipLibraryLoadFromFile( + ctypes.byref(p_lib), + lib_fpath.encode("utf-8"), + None, + None, + 0, + None, + None, + 0, + ) + if _is_hip_library_api_supported_ + else hip.hipModuleLoad(ctypes.byref(p_lib), lib_fpath.encode("utf-8")) + ), + lib_fpath, + ) + return p_lib + + +@functools.cache +def get_all_kernel_names(co_path): + # we need both demangle & symbol name for loading & argtype parsing + dynamic_syms_raw = subprocess.check_output( + ["/opt/rocm/llvm/bin/llvm-objdump", "--dynamic-syms", co_path] + ).decode("utf-8") + kernel_names = [] + for line_raw in dynamic_syms_raw.splitlines(): + ls = line_raw.split() + if len(ls) < 7: + continue + if ls[3] != ".text": + continue + symbol_name = line_raw.split()[6] + kernel_names.append(symbol_name) + return kernel_names + + +@functools.cache +def get_kernel(kernel_path_prefix, constexpr_args: tuple = ()): + """ + constexpr_args is compile-time args which are part of co-file name + """ + hip = get_amdhip() + + co_suffix = "" + for k, v in constexpr_args: + co_suffix += f"-{k}={v}" + co_suffix += ".co" + + if ":" in kernel_path_prefix: + # file contain many kernels, filename is not started with kernel name + kernel_path_base, kernel_name = kernel_path_prefix.split(":") + lib_fpath = kernel_path_base + co_suffix + else: + # file contain only one kernel, filename starts with kernel name + _, kernel_name = os.path.split(kernel_path_prefix) + lib_fpath = kernel_path_prefix + co_suffix + + p_lib = get_lib(lib_fpath) + + if _is_hip_library_api_supported_: + kernel_cnt = ctypes.c_uint32() + hip_check_error(hip.hipLibraryGetKernelCount(ctypes.byref(kernel_cnt), p_lib)) + + assert kernel_cnt.value > 0 + kernels = (ctypes.c_void_p * kernel_cnt.value)() + + hip_check_error(hip.hipLibraryEnumerateKernels(kernels, kernel_cnt, p_lib)) + + p_func = None + for k in kernels: + p_name = ctypes.c_char_p() + hip_check_error(hip.hipKernelGetName(ctypes.byref(p_name), k)) + assert p_name.value is not None + cur_kernel_name = p_name.value.decode("utf-8") + if kernel_name in cur_kernel_name: + p_func = k + break + else: + p_func = None + for cur_kernel_name in get_all_kernel_names(lib_fpath): + if kernel_name in cur_kernel_name: + p_func = ctypes.c_void_p() + hip_check_error( + hip.hipModuleGetFunction( + ctypes.byref(p_func), p_lib, cur_kernel_name.encode("utf-8") + ) + ) + break + + assert p_func is not None, f"kernel {kernel_name} is not found in {lib_fpath}" + + def CallableKernel( + gridDims: list[int], + blockDims: list[int], + *args, + sharedMemBytes=0, + ): + fields = [] + for i, arg in enumerate(args): + if arg is None or isinstance(arg, torch.Tensor): + fields.append((f"arg_{i}", ctypes.c_void_p)) + elif isinstance(arg, int): + # ctypes.c_uint/ctypes.c_ulong + fields.append((f"arg_{i}", ctypes.c_int)) + elif isinstance(arg, float): + fields.append((f"arg_{i}", ctypes.c_float)) + else: + raise Exception(f"Unsupported arg type: {arg}") + + class Args(ctypes.Structure): + _fields_ = fields + + kernel_args = Args() + for i, a in enumerate(args): + setattr( + kernel_args, + f"arg_{i}", + a.data_ptr() if isinstance(a, torch.Tensor) else a, + ) + ExtraType = ctypes.c_void_p * 5 + kernel_args_size = ctypes.c_uint64(ctypes.sizeof(kernel_args)) + kernel_config = ExtraType( + 1, ctypes.addressof(kernel_args), 2, ctypes.addressof(kernel_args_size), 3 + ) + stream = ctypes.cast(torch.cuda.current_stream(), ctypes.c_void_p) + while len(gridDims) < 3: + gridDims.append(1) + while len(blockDims) < 3: + blockDims.append(1) + hip_check_error( + hip.hipModuleLaunchKernel( + p_func, + *gridDims, + *blockDims, + sharedMemBytes, + stream, + 0, + ctypes.byref(kernel_config), + ) + ) + + return CallableKernel + + +class HSACO: + def __init__(self, base=None): + self.base = f"{AITER_CORE_DIR}/hsa/{get_gfx()}" if base is None else base + + def __getattr__(self, name): + return HSACO(f"{self.base}/{name}") + + def __call__(self, *args, **kwargs): + # kwargs is hsaco file name + # args is runtime-args + kernel = get_kernel(self.base, tuple(kwargs.items())) + kernel(*args) + + +hsaco = HSACO() diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co new file mode 100755 index 0000000000000000000000000000000000000000..126068d82e6c95ffa1890e0d3d3b550bc0c77faf GIT binary patch literal 19384 zcmeHP4Ro7VdA^opN4ox#)yPP)q)M^kpVU?)+wxx$S8-g&F^cQDZkh%+vSi7Y6IpU4 z*-4?eAC5_ASK>63(v=Rn^4IZGC~O@Jj_nl=<7|x0Wru~qVLgm7wqXY2u){eU)t-Cr z_t~=IG>SkNKHu}+_kHixr|-Vs`|8c7hwljVstOAH8X{h5@+v8ij~V#o?{M&?uIRnE)MKW8HB zyu8tMDxcL|uw(OW_r>O7f9-6+Ti~1Zkuji(>~9x;5v9h#U<1;>&ziOlFn+@aJ&`uz(pxI>XhDw<9Q)6=1`=tn2w$#5wC zF?Vu&JRQxDkGRv5Z;fg{;hqh}XQIK=lZnWugi0_xGd>3&emsRBR?y1n(;CMWg37$!&PRk2mQ);jY5}~Q+x9fu^yuPVqG}x5Rz+W(u zJd+rly*tSM8yx583Eu4qPELN&J)N4I4P~P1YutDX}8=lX7I`VnAgln;1(p0@xoSO=IK4_}Pw@CMEDF zr?x$M(USGJ~=jjNw_?ntp+r2(-hu7E9)ZytE8xKW79g%Rj zrET+y1smK1WJSLJZKdjj&|2_mvVfDI-0!y1`%TKEf}#~ZpQ>nutw{p%x1cUsF)Y!%n3lByQ7RaHW#>itsHeAS|*$|9L8m=*J?ujTJ`e4MMQI{yTGsu!$_)~uD! zW0kCnS*umdV|_-l&RZ9&tW^@9ORBnPsd^ge)cmpPD#nN%*7&+Fe)aZy{Lm2{GBUVl-4 zu~J_ty-9l0{Gv)zGq2IO@cG16G};)E2EVH&C2EEmQliOfXiUOfSxrf71<2%uq@<8x z5h*FkYRY14gJw1Qm>tGKE%wxF^&7A)d8tF;O}t6k`69gYi*U9R$Kd#t?59?Q~Q zVtkiNjMEy&g_=5-q23iURJmf=k~%S7w?~YZG&n9axLwA*dt%0_J+W+=Ta3FsV!W)$ zaiOWjW%7AqrYcV?t8WqGEv;f)-|o23-s!5^*BYy;YK>(LonpLmzZf@kIWBbhUG{_f zWA>{3v8=`~#{J!5T+`#Y(9`R39`24gt1PiBdCwAycst3ZB_nx!>CwWH%O&%MivGRWi9B@*cOVa&H5dS5|5y<$V?5eeK}ytLSpkg`$f^lHnrJkGxt;79>AI z&4R>uxmNJ<62T36!3{eFH<|=DS_L;b1UI<^uc{NgszGqOM{s+K;HB+?m+oJ{+Mas5 zLQGl`eK|7L&5#qJr0!`1N^}j>ScYg87?+gslDzQ} zF}|XVm*tI@iE&aW&#%uL*UR%O~A)%66b`)CPE%vezdUg za+aK5A~?xea$%`xhf2Mp&@`tjw9J(}Rkm36XwmYDdSO|)Mzpv_bht*cLNe?ChnV-I$$5Lf$<8J#ltwTg>k3K(#|+= zKjU7N#m_j9+4tx!TbX@8_nej4_n1FxW%dE{%T{LJQ~!#U*$3*sZ)Ns99sg-%_JIzS zjoEh>3j6*-Vc%^O_Wed--@RMd_wN?=-9BO8?-TaDrNVxoRM_|0g#Ca`*!MOH`+-Jb z-+MsV4;+BKv5)RH_R|_;fYurZs7nW43|21biw9X6O4DJJO zJ4PGK$7!Sa1a+H7=w9%=N#Ry11|I}H-k{gxe!4&^ z9p@@7E?t!&maVi1ncnUyz10$1RM~4JyDK*Da=9+q_r&J+>_Oa$UojBU!_Y`b@3Zpw zXYBUayxp#hBPEMpDJP_Vz7p|J6TgB`SMC>RkOUf61QxSvvPb>sT(A1#T%Y>#T)+Cs zIf*>6B$4+nQPop(#j2<07WB`!B-IDz0z~`DK!LX~OBR=igOGupKUyun)w9kA!3{Zl z2;7*%1K`DstDc`@d55I>IOba9ak~5xUH+tMVVUg{m;c3Yx*k!PYJfFB7in2_l@hWW zxEojltO3>nYk@AZw(5Gb9ph`OHFw%c9qj60w+FZfSO>j2=+y)3fqQ^^fVTl}1J=Q= z?ruBA*H-u3YriV?G{8^8yHzGP&<$*WT?6bIfsH^n&<)%R+zV`gpN6;CF}}9y4%&$a zb{^O@0h@px=y{;$1$u!^z$Rcbuo>uqooCFB@wL^a33)wR#J(2zYk6E{Y6Z3eTVUS; zdmqpTYz4Lg+kkDr7Wiv9ZO8c9YHQL?+F{oYyG~#yupN5s(CYwp06T%5zWt#&kH~F{}&Ql>05wzV>art+Q+$Bf6B0%|C3Ow-Ga!Kg8x6 zF<-W^d4}tsvynYqF0pw=>OX5^{)anWwvl~YzQE=g>3GG)<{2u)JYRlnxzNt?ohNWl zpZngx*@wwa+J_&2z#i<{8@koQ?TEv0L~b^1W-(#g=+W_Li za6w1m!!N_XYs3HGLmU22mJ0vJY{LIZoA7_EQTRXEDEuEgApD;^0RP6%(EE&^r6-Jk zOy6RBi4O944i-NA=a^^D#ymrknCEy0^!Gvk0Q7HviH;aQPwzMW89izI5)B%c=*U6v zL*RFSA6}xP<`o(;zf8mCuh22`SLtX6_&)Fh;J1I3M$BKMQS)EYar4*dg!vmZauECw z_#NPfzd>WRzonD5SLi9*-_g^yZ_y#Xz9WU~^DL~~SYKd4tS{C9{e93s0R7v)MdP+B zbjtP*G-3NLP1?Rk;|IYHf!_gs_PKcv$g;QPQ2fZzT@nsNS! z&N%;#&N_cg&p3ZVGY7#Bf!_gs_$QRb>tV!e7E_O7PaQ`rJOMlbJdQnd92eVt!25tF zfG2=&0lo!z9DC~c=j<3?TRriob}|CH5!jsso&=6SZv=Yx1Mdf(1fB#2fkEI1>_)z5 z$N1Xn$z^$O=HywU@H2{77zT!cqp%x=T?iNghJj(=7;p?Y3O}P?wqty4HT)NL5`kR= zcH_WtU<7&*=tY51;5cv`I02jhMqn5DD?7&5R>%KFUe6YJUJU+Xh=r$sr+_io$6!AR zoCKZ%o&uf*o(9I?FZNA4#@AL){k@&UVHby80+;~Cp%;hV6mSZd049J*U=kRIUHm(C zjIXUG{!w06PM$dpKhubXDPRgX4ZCUBJpg}G+pzzp;<(3=6y0B3=-z%#%zzzpm%|8B?l+Uo3o$m`iE&t z{FlYhQFdq`FZrUVk(>OX7cOn#Z>11*O7l;FQ)#s^YiP) z)ZcXeuEf-Ll$T3GznbUY+m$|5vRHyz_&dm7d2WotQcE5~E&OiOvlo{{Ev)CYu%6e# zmu{dIKBv#>^*Och2Iq3Q7H0VjuWi})b-Xs-(BnL?!8h3{H8Jc`a;<7*dkguHgCX)9 zzQ>WzuxpRiqp5m77l-1oAT>30_j z{cfYs@7^u+yM1zPdWX>OzDMZymJ0n|o6zrV6#BgfwDaX-D) z*hQ^6@M7>%@bWI|F#D;~+)a0xduX+}mpXLd#o(pj<-OEpLv3n9ZE8bpYTKe#b)q(P zqBeD+Hg%#_)qxj-mx7n0R;@;DT8-MY8ntQl7PV?EYSUWOrnRU|Yf-D}z>C34!OO?x zT2ZgRgpYE87#!ye2AB30ee4{LvxZ78zd$TC3l_Vp_}0qTs~M54@|v-Aqh{PxQzCoi zPnW;vbwTs9_wIrCH^@=Ijbju22#jT$Kq~_`qW&jIx*L-PR(_y zXXo~--#XW&odiJU{10p3+@VfI9MVURQP-P*+xyb>0JR z$l-o)9pkFU;FHyn7lDuSIkFx=7Ks(S5 zycKvW(29GWb)OyMYpeF#<$IrhW4}1y$1$Zc?E>xsI$-C3ofGH;?gH)tRs*Yn4)}2# zvSWO0b=P6pU(bd=W&iPd_W`-y1#Y+A<@em2eb4LP8M*!iZnysB_3ndmy$igddiU*e zy$jrSy^GvJYsCj!0dx@3hd9B$DCN`%*UMufGufSpCE6tINE;>K=WV7*Jm^zx_txw|{rHjrr^M34i@}gl)`E|2@Lb zKOtD*O%H4}Zo>^oa2??KeI_ z-)wx6_IbgZ!F}LuPtt(-DSFiWG#xNML+>(2mQ8>(qZTG^d9HO=`rUg>AlVu=&%>O8Qcfn z_5wX#je4yb^;$LRwd$AXaW8l?xDUJy^;<3KwOZ6`wW!x>QNMY?o56kHZA+BJjb6kJ zw%7ZxC;PC+`+@zyKJ39h?7<_zBfx%OKk&`KHv{{yC;Q%G$N1W6|NG><*|)Jb1Mm~T z9v=V>00XcK!0ssUC~yEc0K5};Colj%f%n@nzP38>LD}CH_u4`D8^j(T0uBKOVLu4_ zyMT8Ahk!%CyMcED2jOqtqy%m_+9`08u#j9_!-6?KL$Jo9ERO6?Ct^H13U&i z2D}$|FK`%shCgA)_}c2RPumHv3$e#}T?o9Px)6Ju*M-1s*M)4a_sM%bz+$k-Irv`3 z4M@p7_+B5D>kMVDqyA8G5x&<^hcqyI>~)sc=jJ52^#I%J`8kTR*O|ZETt(UI%wKNK za;?42{N7A1bG^OJ*7uvpb*{J9uc{0Fk8>equfLMix? zBh{miRF6JV{T6+tM)Z*y(MM`TAE^<2B^`J%cqw@K><#4lsv1cp`aYLbqE95}=R8kG zpQ!6udHge*$8nYOK2rCJJf6ik&-Kx3l6~~ZI=xxwjN{9L#C{M>bWPnP`N(C*3Wq_ar#$ zJ(cHh)_XGKaMpV&U>vrNZ>(n6h(o+)c&A)50B@*fctoxlfHzb#y!*y`WWSbP*n8z( z7;xLYu)Lau*Bk|?LFAf+*B&`F%P8VsZq34aF1a;JjB(^Kqzm~GtKJz$?LrQNqgtP4 z9PvNH5b>YQ7s%@+AYZSBSy#(s#m70e4k^&u~2NO8*oKn;9UQ*yb(TfR8zHB31 zyw>5pq`-sd$zYy-rI*BOfJfJRNmTA7^&7=}{kui4r{5>$>Ayqtdiw7X{`;e%*V8{E z{0`*zl6Wm8{Igz?;(x#<{0}q={{shv|AC{z|G@pi|G<>+Kk#<=f4yE3uc^d5te3>* zxvCa1qxLYP_AsONFkfAZjM~sUvY~fmL+{9jUXmBQ8QcfnhF+2ry(1@jM^5yPoaiNa z!JEN-;BDw7Riigljowf-dPCKDy(AT@wZ!^ZFNv)$=7oMU^nK88LocZoy`x(6j%v|6 zszoo!3*HRw18+kwsUE$fdi0L!(L1U~FUbqu4DJJOLocZjy`x6-jvCQBYD6!|3*HRw z18>`+mvj>~Lw>HrdrLP{L$I7}i(HHMm=GIzj|sTl9uxmwB_~(oJtxFQ-g5$Ox97xr zONfoUw*T2NAIK_!P)Vq`mFV{R_ZdqZ2q z$TwOeUMt7m)En9&Ml%0zrHNdX{<(bth@&WR6{xZpk$0+#gUF?BVyr=;D z+YIsg#cSClJ8GB5HL`qE<}%#?Jgn*%V~^)Kj3bpCYix7FgQKa{<85)Yh1)%z zkhi0ywY90KBi!E7))8uJk3{h}RZDYZthsrtITY>)`|3$xGJ)r0I~{C6fk2PE3`gDG zcrq~&O$DbznHYNnFqlpTGci0On3|qS2ggE*V0vbHI+@BqW9)QjVlpv-aXc9q4bMyj zCllkzARfsJhVX1(G9AP^$0y^_^xkx8Y}3j)*~W4=&fu0u_-+HQjpynVCGmOgAQ6iq z5m?1x2YJlAS$>-=pU%_ol;uz5$&bqNC-UU?%km%C_`&-BQnLI?o_t=GYtdH7eZ5bX z|4*L&$7H$n)?95uz9h>{d49ew%LmcVvi!T=&HBe>`B(D%oR;N3$kTtDEZ28!_H#*=$5@{u=k+03 zZpzdD9GCwr`>q^acKAbCKAI;tv*#Gd&$5m@`CYO+PfXr@4f)ruA^-1d$lKJ+^Ut#R z{%5Ws|I{_)|9TDiudX4l-~sRId7ol(m6k}TEplJGA>Zox*E=EK>iO4)n0$-pUx_=B z%tVPBPx#`=Vd9=h%(!Erbd1RVXYx#nxKqhUC=(*?O!RDqYqRHpA&$mVNd&M+1mTOg z!|60}i=Pp2kEMl$OxY8|cuIP5YGw*Lh{){eY{DK_&V<6i7^V?N0!i@HWCj+K2@;$f z58`JCqCu~>tt03m!EiX3NCtUOZEE(4@kB%!9>ZhDlL`Fbz<4qqPo7~5+icN7g3+_l zv8hn{bTG>7+FHCpxY^{k^=hS#U{jt_`&CLlZ?Ji@QVSNm{s!9GJV8vKJ5vjso{1s0 z2g6a^1cFT-PgBsR$k_T6$tEj!+2qxm@3u{Gk>l2TwO=OMkpm?0H z9|M@4;lH4;ep%$n_~)J{+umoojTN4L{fF&?J@*Y9JbL6%ZZN)yV|M#vDK5Vq0rP*q z)u~+H?1(o_hnUALn3IW#QkI5AK^*1OlRKi&9%b!bd zycd7WUa4P{-;+>&muUU>hZMWK1^8rJep|<}a-UXy_lNZs*-NQ^l=G(-Vmv=2J`=_M zr}BMX`JE!YG9c^cO^<(@lV^|b5ZOzy&)3iAL(t8a<=V63+*iJC2!r`DrmxJOYs-%F zb!TOnCr?qa@5nPysDcl}ZvC^*@!^s4Ppy2RvGdKYMdkb}*Z&2`@=h`L(%#L1{Jwzo Z|10wNmHn!0|A)$jE*s?A|1Ttp@1g(z literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co new file mode 100755 index 0000000000000000000000000000000000000000..bb2ed6cf474fa4ad7be8d6a0880961ef729c860f GIT binary patch literal 11800 zcmeHNZBScRdOlnUA&wD|u!SWdv;qqZ7=$DQNEom-alMX9yaW@6IK&ryA{ilRBw;(- z6yM0iPFxofZ@NvhJ3F4Vo21QdzP3%X-5sFWoo%O^ZcEbHI&CIRJJV_ZFq7R*JN?l{ zea`&=S;ny$cXt2edc<>|_kEx99-VvddyjR_xnCPO#8p%T)I>T|RxdMr!!eToSsYN1K)B6v(ve+FTCmuMk3j0F30CG!Ep4&>0~+-Oupny z$6~o?o_x!hOME4&eZ_e?n4F99Gl^8>J5mTAnv2Dv*;_QAz0(HtciMpIP8+b_X#;I{ z+JNg$8}Qv}1GrsoaUYylZkfYUsJir)c{%hbFWx{KQ6rxf^*!fwHa(Z&bJ6M9Xe!Ug z5@(|ko^{}_N61^^EsdLZ!yQZ}rc>HKxxvl^qkrg322;~>!RhGBU(Q5R_l(%@d1J%G zpN?j8iFE1;N8WbM2G8=bWH8U4NoQx|4zQGJJOruWZ1jV6{)D@CHXY?XxjfqABk41# z(bLCx_HQ&MqKiM~;uDGQJ2TnD>0my((Z|iNKwfd?HXg%0Z%8DUNhFiflSy83<{q65 zrIR0Y-gZ9cOyq6nTq+SuXJ^Ik^PByz_^xv<7tQgJNAV>RVIF-C&%A@Hv(brnSKp9W zX)j)Lo(Z0g#_gG$rYH61(a_j;7VqYQ0@T+L9N z$_`d9>r|*-@_S=qrxdUcU{GmDKrX}q$xJ4golv6GeWl(qR6Z7*a{~Uf=tWzSU$5%; zTGLwlz2-qyFWHB6NQKWkcyr<9S>hT8MeLjCU)E{w+ zU2#q)@&mp|Fy!tH`FdUcptsk9KzFa(@9y<`{4Reu7K{Y_kx!KRz_`2OHL`$#P~LZ2#r-mELWO2k%*Sa~S)T-;_!U%|RTJYtI}1>2R@)c{x)`t2 ztoj%S27rMIZuKybXt-52Bo_#0uOPxoMWttcg>bFsRxhuZ7EDW~qUmeio}M0)U|K4g zOjp>rsZ4aGk1VXHZsuRGELn;cQD+e>OGS%C(pjDt&RZ-Oo&{GeG%Pg~8^o9fp<$`m z&>+P$d|hZLG%Q&fEP}~`tjMdnS$m(+#`%VZ3(ulW)q-`&TC|FJtb%o^Xthdttlto< z1?y6SwLuVb2@Q*uhUWzpDXgfF(*&*pHx(?4S~Xc^&&<7QZWTbpvAIyUSZAOs737H( zLLOf+J!x31G1Le;L04eswT4>ZF5#|%{;P&XtwAeP3)KbvqJFVnUoY$yb{Cd7b!$Pb zcHsKNSJm1$5k@~$ks1|4HK|b-)if?3S5d8wuL7C9lGIc()FSyU zR>@Co_KP;VqqfN!uia^l7u9yjZ*P|TYKQ%zqt#*9+Z;FStcw@PSC?7E*hQXRHj-zS zFICoDsVUSgYRD7syb~&_0y=d2J)dFR93+y(N zgPM1Vc1qPk4YS)}cUT-X4JOfEQ==A?yc*=K67y=VXfA3NHA3AY(NBI{MHYkrL-m5d z_ztbachpF{Rxk0|T@tT0Nxa&+Agq$-e)E+5!m^4yxl$mt4mR#pq_#>^@#!)f`|JI6 z4ni!~s!^ty|7ls<#I#(kqo}D9$FCaS<5C@m>!h()S83uzZMjy3+EUbNM8AspHSlY- zc2%_|9#H(W%ugl1p!n5gezn@Js>FT6{3ztCPwbv0%XygB=MZBeTgh+5{3O)lD5!D4Z3n|6_RZt~JNm0m3n!{o=RUAXod zj32(681LGQ7rA$nm+Cp+rd^C1+~lQkDwCYYx|t`BYtF{dPX6P4dj)540Na3Vj2k#h zC*wdD<4v5!%Q(=-xSO*KG7e;VM|Hu<^qtjLtW57PzhGthPV<{qrgyZzV`ciz_V=tz z@9_VNmFYYEzpyfWbETy3sFd{0MoHgcl=RIllD?xw(l_@?`i@>n->Q@JPMxH0ZItxR zMoHh=CFz}AlD>6F(mRKs*9B;k?rz$wJ4pBF4pDm(_#W^U@V$qqLwA_A=#J1<-3Z;Q zJ4zkx;QPS4z`KvqHp3`wH;mB^!#Le%I8NJ|!1sW+fbTs{I}H=G%W#4^4U@FnaFTYm zgYN_H0`ERaUB)TuHioFj7@_yDq-OajgPZA1~C`JD%n&@qz`n3`t(w zK}an_9U+0Mvj26Cix)Uf@!!q-X#QZK7B;nbUKNmVhajPDRl*8(Pu8kz^8r<1{%+NU z`GcxU^8$HxSs=f&Ou5VRJGjO91?|%gfx9w)h-lwCTw$&(lBbr5osh%3{(Y@#r)h&9 z0pC%=N5FL@{1A8rpq*~C%czKL8y}-Rd z2Xu~Kx52-@*7{}J4Yj8Y?X*44nL2 zHqr@QCv;Ar6WEEkPQ-NqyMRuh6W9&x26jT%`LGTC^))9i_h*aR=R$j~CpeP_=mEN* zcR}w4x`7^`2iODb0lLthD{O;*ea$m%BVOpd(DecPfL_FT5!Vas1@-~^fc?OJpcgvt zj1B(vwZ636uaes9Lp#2wIMV=d0O*6x2b~}22Mz!Sfct^_fj+e3%h}*xUmG}MBZJTl zLN^Q?1`Z-_5OG7mA>c4@7td;yb|%J0`p1u=`*a81E$bzYyY3lW%xubij{E@g$3fUtruz$P|l5%_jT7FPh{!RoO_mfvd*H4?LE&$E1k=4>O& z<83cAk`B>+mgO02ztYIs_xfLGB>P4CV=T|0|IJ30$5V+sKfJtBS+If1pKb-_!d-BWrh{S8BIUcX#7q)P_ig&1?o~SO2j34q1b*OE zI$?O7K4f^4o-q6|ec138op6KqfcJv;zeOhvZ_`H%e@0Ilen5G{GM)5;?*|_OKd?-v zjH@(ge20dNKcZpdkLifmKX)a&o|W8Y{|-GS^>50J_#VXfBEJ8}G-CWq8a4hEjTzsi z)5f3Dh#!1E_z?JkpVGMbZ)w8(9zA9LJ33?j8I8Nad%%0a`+r80=4*7;{0}r`{*b24 zAJL>Ad_VXQ_<@fotJgbmOns;9p-m{+j%3mdvvz z(9Q&E;R)ag-~@CN&^-iv2zUZ`0{AfSVc-PXnfR^^{`IvJe`+I>&`m;j5_l3giMUC` zJpz0LcoKLL$OC!cBy^MCx52-@c5+4T&ldB%DYQ3*S{MR`fK$*+_>Lze(1fTw_`fHS}uU>xnl-?zcPzIN*GZ6pa@61o&H1xz9?iMUzd zEHDL30n@-VFbQ4qA8qijuciJ;?$1{9Tvki*pLwySj#hO|&KPIe(PO*2NG!GVQYJ|PBr1`K|Q{(I%B+ZM(nwn(qkgey(lYAVZjC-duD)l}C0|Hu6MRyFlkJwLxyP5ouh z-<6vBcy+Zf_P>P{fe6d&_ZvJ3pMO3dgU|1Q&+mcHyazNyINim@UuQ<;5bq$K!N4U04UPx5e}MOm3jN z8zep#a)#5h-DAN>G@TNc@zS%IxqP%cnLIt)mC2@0MZ@Na!F-T7^U;`fB>bsF9*RVY@QE0YZ8=fi?e6#UF2aXGd@9Y0MFtkhpNc5% zFqYvHDQv)rrIX3@8P>7u6h05(qi3Vx*wwmhqt|iXQKAxjZ~tmuxJ*VkvFfkN({54hm;#ZkA_T6t*d0JDj#B2}QGP zbIG=K;(UD{kaC9S(0-B5rHRg|oqYIKB-n9vZf7MC>Jo zh;1C%raBu9&xt#1wyDE5vjntxyknC|!uFbE=EMy;8@EN7O}y@9w(VWhZS?RKdpY=n z4;}sDefQq8S=sLFjJ*cf6iO`1!1DcpSCxsDoTiM5TK4W&^ve8B!6{kC=65WYR_4mQ zk8Q*JuVOK+=#}}Mg7|q_JQTm8m(Aidgte{sr{wvdg5`Rp{|bHqT9o)! z`NY?zLV$bmVKycI#ut=3D!^BI1WF0cyYXRqWq!Oduaft@ZtQPWbY&grm9^|UutNxl zu(7W+BT;EDEzXtk)64c@*{0|}2}llQAFN*4Co9L7r6+7&#@gJ_;7n`nJH(S>9mth zTch8-cb5Pm&=Mt2r0j zQQpdB6|?>5SY@BmKDZL&TqJV2vXj*x^?TpP_3tZ&Dk|2QW2X3N&~PLDsr{4Lq?{xb6_Koi?JNW*aQmK~d-@L&a`yGWSV)=( zcus^9v2b+anRiL$tDeAcJmDW2pNu7cbm=8eATd1gnVHu;flw$Bo|y1Y!~;X&mxrU{ z!9esC&v+y<5l)hqJQE|2g^OPGOa-EoVgHGdSm-rw;tx(nBH_e!0uLi*8i)-~28P35zCRw0?dWsv zcw<@Rp9&`?M#f{8HRRi#vA{`xBpOKi9~n=aPzFG|)Zr$G1;)bfH29A;w~dX5{Vfwo z#Pf&7ABpu(J>aMR^+yd( zlf2@Ycyuf{9(||rhWqoJ2z}c#85@a=C&t+D{h9F>e8V$25uWge9>tFs8S-Q7Lnq$C z*;yXMj|&WMa4~xrU-di^m2t>{MYTl)&{#me( zZVkk3kf|=}Us|{6IoIN8S2#iQCjFzsqHS9I<=t-h@RJ+1#EBw)zcdo}4^57Q{B5nl zp{*Sq&0AYqL&4@Pk-WpWQKe^iB-z;>3Iv*5LuZ&R~m8Y4uGrRt7HZTPn|FG*{ww<7w8w5y++TL8ZwX# zL91Udp+N>Zsew+vP)}u`m&%3ug?1_feL!DTMZ$)>rq$#j|iZmmg}-$G{QwX3zySmrEg3-hzcmbtXW!u>4I z%V#Z?bI(H7&RFNHX)Cj_%GSBG)yi$GpOLL8>zu`Ekxdr#ihi}L#ru#LXRX$A&myLF z#x`e5+gLv~**2H9*?2#;&&sxxZO&@5%B(NhI%}~$FKbC^UW-0WkhPFaDa&k;jx5l7 zD{DqO`73l3BY>T2R`HNl7tn*N_rT zy!_&PTEC$-olcYVhAIPH0AA6XoN1HAK>J&7t&(Om7Uzt`;->rQA{)20Ik;_+)0uWw zGu!eShg5ELNNJ&(+f=)`jZo`M*VZ$e$_BSoX?08Kl6r1azlqzFG&<9b9%f?{H%Uh8 zCMjL!;Wi#Gw<&9JrdwK>jj7EmnXFzZZD{2-ty{Q_q1~Bo?_@UCyS7MH>lP_p*~x7> zw{e@wE@!&S$7~$iw@D7`HYu(1aT{MZx6$=D(>=SGjq9Fn$z`=jY4VAyG#hRpm#!Mg zGgk}GmduuB_+^R+uW&Ir@PE$ zW4f|3=<1#BR3i>ljg#pN2IxH1?o>;yyP~d|=_)FqYpr#s+Uwo7HPkZQZMQ+!S?^AD zHM%!6)id3O4bb@--KidryGm?ix~eLjENeW|^nBk=s%}?*gC4(O9s3RX*|c7NnVxIC zTscb&1MjUTGqR7OZbp_V7R|_1UtYrXy2)%H`a5#v61Ud zUamK_a=odY>#f_k-sCceWW>ET5+bXl#DwY1!HeFVmPUqAXA|KE;Oz!kI(|ktAh55p>g|lgL?kd4SOp_ep$s>k3WN#f6sXk}G9%W#{$t4&BVWdL0$vIx5C>lqQ#PZji{o}5bjrxZ=F59SoyyHb1^$+d)x{dnB_q}DK{!#_^PgZb$ zse$_^8@Ru;o%<)ZbARaw_fH<-{=q`-A1mbkK|A-4*|~qPjr+&ixPNdD_mA!2{*hAd zpD5-2k&WCxv61^n?&ki9ySablAoouken_YnA&Zbj$l@WP(REm8avc>st^q-G z9Tyr!$RcDBvUptZx(0=2S5Rnig@nznh~O0=i;zXgVnmoDxX1Mdg8}pW45@UUt*mkv z-EP4ASH*RDK8Md~EVZ)5A*CD+_a%#4O1a&*+oEsdF;XkVN# zn97>-(*(a?)?7eeb1dJNUlQiu*UZc(ai8jZZ~RO*qA}@#dZ3Q9F6!`u3W0?{Jx~u^ z2V4i#k)=i5dI#E<7WL&0VuGIueionwXo8&yc4nX%XaQP)R-hGVf}d%#1MN$TmaPt+ z)3S1*4RLH^8j}O)0NUVZgP$E}2ReWb;2pp_fHuUj-Q__0(xT&c9K;DfC;X~`)j%ig zoUn5NUBGH!HLwO)19ZaAxx<0>rN!!dl<{QcVmIQrAJ&*^fwe$4{M_){2;2y)1=a%V zfOSAO;<)c~pnYkv_I?MchhIJXHUT#Q>tR<9y9Qtba1(G7@J`^Jze-2v(cJDJM$1e_jzm^;W_kQ)gZpzV-lDtieT zr0tkXLR1f=<2qGbZCB@aQuM()SmsD{Jm-sB>~tKbT-|nJX8I`Y@4Rcjo%VC8BWNd^ zm_APXIp2}6)A*5Yz8=NvFvMtS@MP z7b|#waznSB_H(g;_ako)+G+fY+j)HXNXky*UOd9%K2}(5??Qg)DfkcF>9-Kyk%{k% zX5ybKh9{;gDJpQ>oJpOcPwcW?!7ZjfUQ^a>?;&;b0@h_C} z_|qGC{0kd-{OP-S{0n#U_|pe@{0j$p{FJfU-p%3{7t-@kSec2xEt!cwYvl1$O+5Z= z6OW(T$>Y!NLL8h;O_k+-tln>^43t+-H1Fpn0?hd9+uAUW8tRUVKjIGhPw) z7@rsR8b2%CZ~VN_Cqfn>i;%_73;pI7gni~0h5hC)2@jZG7WzfVB4iP=__A=o{AJ;w z`45Cc=06l3G`}t!5Fv|@Mabgo!eRT@gd_H^3rFpLBs^sQmT*{vEJ79`i{BCk?B5oS z+5bd1ZvU>}w_g)) zyY~|gv@b30{*;6C!LJW~dx3j_eX#3;-5%f`;9lTf;Qhe+fqn4n`-}taON)DdU&#?! z>&||}=|@i958MyzhhIPZ_5t?+_XGC>9{@f8>_?pbFFMe^w7CBj2RQ)01MoWpJOn%d zy92O02s{Wp1Uv+M5cnYQ0Q?TT>OlL_;-Rl9BX;CCE&95?{G0oWY_9s?c+9tZk?e&7K72EOA! z`_kg^d1X9#*4=}MH;9}Z1O|bF@E?SK02lxUfkEI9a0obvc!S?}pnYjE_~#B1f?o)J z5nu!uf?Wu9VPF^-0Y-qsz+qqrexbi~pnYjE@>j~Zve*4Ir}rwky^pQ|_`3cYK0tGM zKb_wadRhc6JC%Ma7JnDYXb+Xv`e%J~7y?Sph~hkOf2Y_p*3v5_b0v5NVb;=T5a0?#gXiiO@eJZ=JcF2pOrJp*mY+cw z*fWSr$}@fjtWw2yP(&@;5xc9|~<_n2pe9ucw# zS%fUk3cYslO*{Ce9emUNoNx{Qbb)WWz&Bmso31?kvj%*#27I#ye6uDG|EvSwtOMVy z1K+H>mVeS*;=}sihaA!k><0R<-uEFHZU=4$b_2VCJAgZYKCJ(JXB}u?TI_yO;Q?85 zeGlUFAcyP%?gI9}uLpiRfjfb_fV+VA0Pg|zAWqNkInchexNA;{m$g=4{1iEa@l)Vh z`6(xxY=tgHJ8wM*=sE|mr$ORz2;gim(aMkvKG5u zE}`SQm9^USatZyNAH`Zu%_Z0HmH*>wx-%6O@-Gy=u{y^nTILuss3@Cb ztdym*%QCaeaExWW!ZB9L$!q;?mSZ&H{0=_D>yCq8Sv%*T{-ICvx?AXtx7N%-sK4|D z?k~N`{nx@tsDJR2+&}ml_g@P~q5hH2asS9Sx&K-?3-up;iTjU!5B|U99Hy}Wb;btN z85>Y%Y{*l0Y(kx}33bLM)ES#rt2?fR18Hhy4X-O+((pQ@QdeaB1$D?RFDUI_TiM=D z?aOFKR7)261owdcTZE?uP@M?ej7J?f7$RTsgXW~Y0w zrrK3dQJ5xAG48a>0B&@fVOj2lT$&~ChFqQ{_dqV7vgQigfg7+|C-5e@)N&qUTnf*PS*;UzlU!;!o|-=ym%?*jRs#jz6qkBb;Znex z;!=+*Tne~WF2!n_XO!9|@TO{;Pbjrb;F@cjIb4o$u`DjfIGMWsa`GBvHkYHdyKF8u zNM)?QNEg;`c&ZViGB_Q%2eKE%M=E3fiBiP;L;H&tcjt1wIOA?gEiz7Pk(}!#X@Bv3 zycQN8NvT}#B<&}D=CYl}kAM0_JLzH^@D%MQ{>Il?tutAHerS!7ai|N71MWhNPXGs` z{Uw!}WKyX~CXeubk`MEmTkGwH$B#YE&MpUlKhC^gB1Qj<&^ zS-Q->Lnh3>U)T91h+GS+Zn;_j4ul`ANC<1e&jVq7ix+w)D&H)DZ2917;8{dtU*n& z1~tW+JT=BT)D-JbQ>;TxvF_R$qtx)%!brnUh0%te3nv0KczLKY#5ki~adjq+Bx7R%AmOpg9YYJ~Ajyj!givhn3vtFqc5ax<$P z0@qqQWHmzMW>zBv-c*ecxtY}nfgy1AwbTeDJOhxBn@548Knea5IR6N61UL#D1)cz& z07`fUApOjN_NB$qzj2T#{G!OsF<=ZBgX8B1IK|;_(lKDf%c`v*x%z> z0?pBMp+ldyf#d3sd+5J4WY61BOI*Q`QA=FGkq2`)awx-*S&o)?t&!FrSdNY=wZ=Rg zA%{c9SK-jfRXFrXl|$1tH02O#j;6K7Y%Z}{j=oWDP2=a|7}v|uH2#g$MQHqd9OQaA zn#RA8Itq=SkE2{KN7MK>Qg@;8Ka9huIr_JayWc98Ia6FLZvyw1iHus}8Ppu*kN-YC z1|MGoA72B@*MR!Y`g_U_{yinzPzGZgd)FU(m*@-U*aQwFso=kTkRGSIKBZl!=p{u~ z#0=nNb>|R$JJ+eaD(Y-$@pwI6yvOLQmy*f&M5ic1kS2qkq46rWPK`Ci6XT=dp`?c-ivHOg{V_$~j^$qVmQeJKIr@~MAIQ;vO40Y{=wDIwcd(u1|9w}{M|13dr0D-k zTsCDpzf|-yIdT3)(R=ZPZNC;z1AFXiaX z8mjnp+Mc7|r|5IkMq95n>hPnl z6@7PHW7O8{-@MYO8IhNNz}Br^e``)V>>rVk+x@{XCINqo*W2R95V?+yPt|01-3*Hr zQJZtawyPns!cwoCxKx!LxCO`en&GtIhv)cF)e1lKf1_Wx@Wdv^{PYtNYjmNXpjn$s zFr1(t|5&phySPIBU?7ng39qHktWS5a4_~Zdxa>n4Q$s^*bzz?!nHWk0l0(uOd~ui4 z&w8*?;t%Ulut!cHVPzYdfcF z7~zQz|Hy{F=g8jed+yz_G9BNvM5jNx#AVYFF!%dSU!wUxYs(RtFI}6d{_6T$#X-fF zuD@wt_&1MiQ`hCF89i~1cI^}A%t-yc)` zat6?*__Apo`|3O``?y1#1OL_;_SNHOQ1o4jPW69(kTP2 zL)AamK3Cok|6CpIPW7kz*;}q{0FAjiYJa2tQ;NTrI%RFDe@9MmRaEgbeAIKF})f;-6UxkXJgW(L@N3n9wLQjCnsZ>dn7PCXaduNCa^we0{4R^Xn4>BEf1QY z>p>Iv@0sALcFOegP+j3E^LF?-NjZTwq5{7t>IdGbOnP=k%EqRqW2u}pnRqr9mDq{+ z=K=Dea!TXm-EfDJiK&#~_wKMWLGZV|$xv!)HZ&D``-z!Y>O{bO;)4y7bUBvECeo>! z0{O^09eP%pOonpOGwIAFwF9h49WH`YXgc=EL20bDV>%s^eAyiGNzwE(so>?a68j5I zD%c{OZIKd*A9`mpiOZo}Y@?5xUxB>i&2C(V`##_#JCjHz`ISlD@@Ai#4yTi!G~Rc6 z-b>MsytApqWI8jgbYI%+f64c}v)Ncyiav)gk%&m>d*srOaCA00@n?VW0cVB1c+dMx z=yGf_lg9Y7f%JbzEhsq*eJu9_u&GF@3BW$6G)+xYlh1ay`?SKN4Q+{9%}|_*HdZgI zOHsYl`^LZ3@BnsuGneX#UoFIbZYHD5PAJjpy;g4-Y9BlH1#*!Pk4dp5rS%M3Ul(87VTGh)Qjt!)>9quC`b>7VYQ?`;=~Y zrxLj%UC~guwIkfs(b64i@9_CLyIMP1yIVWDecdhHk;zar)Ey0n+YfKo@c=glRo|+5 z|JyE9gwa;=9$CO3X!pDA;{Gpfa*2LbX;09vvOdW`D~Mb_9c7XuITKteJO9ZbDjM)`KsN1?G^CS1#wBti%N_ri%WS? zz-j3tzvSZ${ zBsxS{X-gIt?c!^4DVbj>MVnUeQgG|MebJyJtL&ONr4v>GL>ZfF)r-|;x>7=(Um@fx zE7ljyiS_<$&l_rZY)vhXHCWwwt6hoRCDz7w?Wv9D zsh!8z9Xy7*+o_uYYRgoHU zW7$GpUM{;@xmY=0z4&}(rDxt`^61T#abgO7s@Ln}=W9%!d25ZwU@<8+g8?>sjc49v z^-x=lVxtr`x79Q6v3n{-t75CH)XB17pJU@;B(+AZO+DHySK8Ds=JooUY%KJ0^&&Bi ze_Bo!WIsdQf-EyMEXd5htCHJynYevd4YyZWxxLE4?Nx4WukdhtMFY22G%m=iL!J-!4Zj${Y0Dr`LOK84T{y3VqzK`DlTUa-XdEbOk=0&RtrD^NRUU>{%b! zc}>VmD`i*97W3rVGLg|1d0|;!E|liUl5#u^vAxJ|G-7zer;Zx!qw#(@otpAzhij02gz zy`sm#^!|z=2h+Fj4Lg|Lzc=Gx`u6U52h;nzZ#tO1{meHVOz%JQU0k3)`?$-@^}S}U z?>fl!y$89z>p0i<9_RY5^IYG1p6h*OTz{;L>wQkHKj!3mUkBG8>)?9dX|6wZn(I4_ zT;F5l`p*4a-?N|VJ0Ioxo=3U9^BmXroa6c<7OwBJaQ%@cuJ3E&`XeW~zV9U0AGyHw zeHWlNx~bV%N2`qm8Y8Amna1mVWqz-czb()V*m-#4t*nEsSL~s#Y1Q(A{x5ZEQT6$=mrI+rr z^ij76E`p2TVjuNb`ssel0IjzK=mE2rUKXn3^Qa{^gxdIgZ7r_0WPw@XOPE$C-FvXGhU zB!<82D)~g0CtqDAZamMM{$Z_b&+QF<1iYev9|hl4z&pW97#Dtdj!jf$VGeD5RjIQT zH|Wa03JWVaT&E`g2Opc9g0&i04K$JVHItFek=xhIz-nMMa1U?~&_vePO#58$udh`f zaFH75YM`qH)&gq~SA#eU&;qOl)&d^_J_M|RuI3RJ{OfDA%`RTk3TmMhd8{G9Y6seZ zR_LtI*?=~n9cTxNKoMv~9&4Km{`EEcVHa^g=YY-ybO9ZRb0E$MbOK#K7w}==!$1dg zjxV|3Ute>5S?y0jEp{W1J0@7`fOSAObZ+SO0`~&zfOWuqzd(1^V z(0QP%2i5~Ui1Q$BKX5;=9#{`N06YNnK<9bf1^@b5{S#_`wz(HHAYVg5upR;)0yaS3 z0R2JWLEs_aA>bpxM}Q5;*KpPa|N7dYCtaiwx<=@{KrgTnagB&;0yY7?KrgTv*bHoh zuJI`s{OfC8N$pqRy~yy|Z5<}sGDAFr`B8iOQS-VM7$UNVfban9fb6=c-#|| z`0egthY*MBAi#8plNrZV7-Tx!JII&78weR@esG!bMuO)(_;D{8XLca#SD-?0YWH$t z`1})0F)p(Hj8;@TSw8}M>zu@<*rTky(Y-z=YbVg%<0OX^dz`g1+CAiC`2%OdPVyzi zo@DKep2;{_J3%wr`Qz&=Wt7c(Uc!7~?k8OruMmp41Z!{H%-ahdtaGw<#t-s#g2#KD zEdTg%o|}Yz&-1(kWrDK{_2F2_Pjy#*g#6W;`Hw_5^N*DA`~yy&f5gf24|MSS zBON^dz-gX;GmwyYffKU%Q$AXnZsOn33ln+|Tom?dSOiALaSS9_9H5 z&++_Y=Xm}hi{Lz}viuf3Yk89fMQ{;Z1Q*|=!8R~}dfxV3 zI%fMmeaiMB9TmYva1mVmkdE6vq8Ds`KqqW}NG01c9T&kxa1mTwrmVL6ac%oilY4+Y zKtHZ!Kd$BDz~jIkU=Q#F@C4A0YukU-1^@b5&kHWn3tcaCeZW3oFXDO;cM^CK*az$b zJ_dXY*b80nue#u0U+Y^^Yed1kvmbf-QIiLN1HgXh`k^}oJOvy84genqJ`U_hp8l`7 z;9p-G_;nWvKo@{+5I6`7ATEHo)4A z$wR;)U=X?>bZ3BPfJ49`;91~VU=VqN-*UmfzBcrhiwr|I4BZHD1UQViVZ@ySo&$~m zM}SWPp9Bs=H~fwZ{`Iwy_tgGuGw&WnzERZVG2j?*6#7x<&jZf`$ADwNr+`lZN0D#z zdoK9b*T#O|MaH2Uhi(Em0USr%IN~km|Dg;1^|gr=wO@tvepb`{ zYHjaja{xZCzvFp;)$)FJkLUA!rM3syJ(|z^mH2I*7YM86_~C@j>)CUGQrm~wy_7#6 zD7Af*-5dFHf>PVZ**$Ih=Y`_h&iYgM{Gio#)?VRrgjU;GJB7~^T5T_WuDDxmXYD=6 z^TxeuJIntd&m;G$?JWO;Jg?lVwzK^I8_zRZZU6uFT=SKR3c2dP=2v7QFSFm)aAkh+ z`C<&dI0wEs2UhSCx3b+PtVNe zV$I3q<>{uGO#10qBj z*IDgt(MVfcq%9Qg4tE?RfkX=H6ppxAhXR3KwHxk+=43iG70XC7pi57 zrfb;B$ty+nr&Rm*@vL6B&8YUbi|kia`(G8=Us3HpF0y}1wU0WLosEAVsrJ7sivLs9 z?y1`dRd)VHwf}R|R{OeYZ!C)cPsRROo_~)`O|x)H})7 z(&CfaH5==*W+@PLqMiF)yW&mFPD^ZY$POiJZQ0Hw;aG+(>)EkRIZ9C}9Li)8v7NM8 zuqa1a)3Sr^DobcCM*sLsS9v&!109qO>a zF9B^1ch_W+uvI=Yv&zz_jnksYrX2So+s=;Z4tjWxRWwrX`N89-Po3CO*8azYm47yM zQpz!~cwOcj+5}5&p@=G4cF))J+PqG~i>i*z>sVW;$jYY8_jZH*JfBZ%dTm~(A$~Gf zHqEc;RkJevV7WE_MRgviVX_<(5#RxQm`!Vc;|r=C zmEkMg{DlPj&G<0AHZR_oPZh5>)^tT3=uoxnJ1AR^igg^dd4o2u*2d4Ih8+Vko2H*p z_1gO58~FXLa4Tvazg-sTwe`ws`^E9aoP`(LSbJLgY`<_Tjt#+CY-92FtG}%3TbQU| z)AZd%$u+7Wn@0)-I&Js;eSFWV4lEmQ{BpT*dT8hW3$STp&lUXtkLx-9Mxhb=TDzM0 TW$bLH53v&y+dmaKi}n8vs$-Lg literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=False-STAGES=3.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=False-STAGES=3.co new file mode 100755 index 0000000000000000000000000000000000000000..312e8cb4ba0e7685cf17477aa117f96870721829 GIT binary patch literal 13624 zcmeHOZ*1Gf6+cljExW2M8>#F3YnW6?Xp&7={5|Gh}WXbQ@X>u-$#%hb~`Mpct|b`#7Neu%QEna_^CM zvLaJWG;bY>=?wARy?gI>kN1vu)YJ3%r=~v7`FzO$af_eaB|ch5lBD&IVs2%s4S5r! z1Fk)!oAAI!-K5*Sfqq267!6W6WV10T>mHJ%f{c=|+l_i)0s#Zuvh`kS32vLwT0OMY z8le3r?xkh7Nh}n?QZpX1KA?IZoN;;yqi&0=&l%O@pyNTtW8<=UIsF$igY)vBcH(?s z`3mSU(_PMkU2!b@OrXIY^a<)DMro6@gKQKU0??x;C%i&oXO~lVMs<9y zD66@OoKtQ-T~^fNQ^N7vRhBfP=te z^zUHqt7c5zb{n;3ye^)VXOyh2feUahlgspBH(~Y_^KEG5(&`Wp{J<_SG?dHECPw3I zQ(&{}qdN_393BPtvtB>!E_7dic84o81f<6gI~+R~sP41KC!N*(nC*v1*=HNnlK~$I zawLho#l>^4{H*sUQM>r&QQXh)jBd12jQWImbggiPHkB?7`jEy8l#uD*FERoL2899+j zrA7~X0nqbmlJ)hl%0`N!LEr0S0Va!`R~w~1TO7;BuQ?WT{2Jbq1=N`mKfl(GZJ+`Q z1o*Wmwt>d5-Nvt>calDCEeUjyaDtCmtG>3uxmCjLJHK{i)j$8}QZPW)9){}Z)~WY767t*R$18+z@GfzE zrJdYZvD>y?gXK>Ai~Ld--(_`M-Se(&SD@T|Wi{aMoU{4BKHIj*b)FaOKq!pzm-(&; z-yO;?a9zU1u5g6!3FUczXwD*wDomR1%xE)rhim$)mg z>=u5D^@#Pz{5%Oo`Y!WJ7L3o6f36Vg=KDO^LTtX>!iTgScy=h3$2#`)vp#TKwa;;4 zO<$;{59;fByoK+0b|{oby3iGiY$d(#E)EjXe<+l<$qya7JIN1L9wJv^-g~d%?$Hl2 zFT(gJd(n}NMeh zm&Lh$z75{y|N8FebRS6o9oPEE)@2j=X6Lp--;=;Qa<-3r4(Jo#?;}a_=Q|TU9EbGD zN4U0a6la?U&UT8ky%Ek%1 z*Pl$jaVOca4cc$qIR?}wm!0(xgSkvz*h5Bu9=o&$o-^~VKNhg)+Hn1~W8GgB@RvaE zOYpd_-Sob8H@dGHf9;_-dpvL+r#O!{!m06BgyKXzaQ0H1y^U~c{I!qb?DN3cPjU7) z!m07sf5JbX=of)LN0D{E%z$5_)c&Z4{R7ngfkyV%_~ih_IpBeFkm4L{gj3^}L5efz zfipyLh8p42_(h~Rq6f|}#Tjmd^J)6UV}C=xwDvcTef!h&OFw;HYW6YoOKTtdAN!@X zk5%7uTKky$zV}&vPy4T*`>lPf`n$K2>| zuOzY7_sRxwr%&_yOKTr%?PCz1tBs5PzsJV}8n@hC8@Ci7Zt2FA1&k|4m%j+H2*;FP`fohk@c8V;nd}<6+|vS3W>-FdkNqbCBX- zJj@e^#>2+pJQ<`o7!Rw*8KO8C5A(#K@vw1-D~l8d<6-qU!xRVOVV*d2-x`Ow@&ljq zM`;}Ssf}^u`}yI!PW;RD1IG8G)DHtd(GOdfp8-F7!-?;Uz&r5uJ~9k+@LR4QqSOzf z6W@1nFuq@pvz_8#eBTp?`ayK!`z{W~_v>+XQXGu$d*V<(h)#Up#liS~Joxl>Y5wTf^!fsXqKP?G`G}npFzPdO{krpahzbBgErk zOpHMui4e`3X4yD0JPeI|B_*b{((q|HqiCv=Ev1#p6;l~57H3LBWnFttNt=SMrH2lS zfFQ)jVL7Rdn|VvO_*DqK<^6t3=9P(o#mraG1s5% zWTB4#7?ro`_@cxA(4M6FjHD@_wHdum`7}+6vB@7OzsjJMuw&?lc0K=O1O9vi{#pY* zq^dyKh<~I3f3^YtN(25+8}JYMVOTCmESlHL?(bJ1@0Cw>7x-TJWZi({l}|=QRWlVL zLedZt)N-0Ihtj>Q!)492fl#+G5pfopbO;)Cp z)7X^JP7dWvI^DdIlUHh_b=geko6?2_hgqABtuR@bx7oD5uB0nYnrqXVkcM3VZ4>t3 zVv%6JXSw3!$yOIi9nV<@b-ZRh+?18~$)%Jg&wTml$rH!f-mn>p+aDgW&UOT}K7XF& z%d;`6=Q3rWE}jicpPm1Veuv88`Hy2^S{oMjUeFEozuUIP^x65(DCR0+Vf{?s<+{fd zVzYkdX0VgKH`MDh`x)H@qRy6!I#AgA38U=&#O>XfTe}}R@V?miYjj*jEv86mvaUT0 zZPaJ)Pt|j`!xgjymsYPRTvNuNj=FdcSlGG2{_mpfva9c3Z2lls&?!&C6?shm1A6YT z{}TyU4Rht|h9@WM_0PC+Out^fzI__R>v@R4^j);|YrU)tUG+Saeo*}xsvkq8n!@xG zb>OVY=y}j#zOPXyv4d`l{l5^-cQ1U{{Im6c33zPoT@Cjd#YAx^H(XM^vT+&XDpb7m N-=X@Wb)EJ4{{oCrZzKQ! literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=True-STAGES=3.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=True-STAGES=3.co new file mode 100755 index 0000000000000000000000000000000000000000..25a74d4649864049483ce5c28db5e71216064bd5 GIT binary patch literal 13944 zcmeHOU2Gf25ndk2BPBEXgOkXP9iJV=p)Kh|()yzjq>Q4pNh3FInxa1mdOaSGKM{FH z+#O}PMU{)J$U)*LwI2!x1=6BGniQ#$82v$lq7vIa7ij9221pC{p^tsYLyHzD!p_|7 zQWSNJ&8R@qmMa`*cXsC6pV{48?an=U|JT#O?+3mFKCP~TxheShps23Y{1!9kFr*4XD^%>RUAmc{LW971bdE_rv2K(hk>q)zp zd>-_$ymkYYh0ior*n{3pjl?KTk`Ty7At3<0_tflXPJfY*bFa&#Ox~2mdDYO=qIvO8 zWZ*SX&Xo;G(JPuoUKV8|XTG@dx+rHdhH9FUS(X*`hq8jPxXwp=XawD8Y%@KIs*M`BUd zauqqJzWQ)k)$W@U?z>WFNeilB7If_rN8S=k@}iV2%9eCSH|A*tu*d+6PE9VU?+!~} zO~gyODkV${@T81>Mw5!VUe-=8d_ls$(^-ecq{1)6vQb!&Ewx_Dtwxa7M6*6aj$CmQ zX1P!-(n<4*Xr3*l_2Ro?8zqYKnYYA>R>itSwjaCU|*)z>14N{_f_j{NaYl56bL@B3XG2CvWv;_1X~nX z@A~LU0}F>o!1Zjv5337Z*WWtm3XKBk_rnUu-V0RsapaS3!q#QB9wKGWeu&S)S`}V) zd8?~uU;e?sOVQuF6#ONwXZT>HP~H!ZXK17INgJ|YC?ZMuLRnHOg^V;lbt0~$<4Sxg zJ2er{rai0D$P{x0>*RDsPRA$G(-X0zJU)>~Oisrq;>q|#GLeiWm8_hRlbLjSe5z5A z#p(NMmZkeyWG6+@u|VQf*_G`HlzT612K?PiH9l~(R;!WAJTKG&kv^1vhUbU)5T9S+ zBEr*=o*{oPpZELuC7Y~nkiN&S`NKTlhcv?THJ-O^tSqMcD1Yx3zpobh4vv7n8h;6C zm=D`sp{`}_BENRabD=dT$98yj*K&=#;M6}%E^Ks>=Qn;u);3l+Zm4gKU*j&gvU~Zx z_RaRq%gclxy5$*u&4%)M@~;hIzxtk^>?JimWaA_a0k6jQUzSKoTE3imG;te zWTJkXJPKuw03{#13sW;@L45|-^GpByadByo55jK!ukVb{&yXb0NqvUwTel!@acLjq z-2=RnXJ*K!fZqMh8ImG@zB=8{aY*mEnG5cxIQu#tV+{p!Rr#QEJ;2fejhg#trqd22n@LVJ^^n9RaD0EXKZ%2CaweD_b%-egw z&K#FV8#!ac=gwK2+o{x*t0|cqoB=wca)YVVn^#j^`yu`2)fu2Q@{BVcV$heVb4N%V z=*;;eBpA5-ttW&_8}|1e_+u#6O;(V`yUBp>12$e<8y?uQ;R4uDpluaA+IpC_^>C}U zHrVh8#W~`Ea|gw_qZLkr4TmVskO$746z9%XI1M&DN^y>Q;M_%V?rMe8V8j2E-P*R7 zK&ItnK}LXMb3qU+w%4-B}Mk_G1nIe|7#}rso@-{b7B6 zb@qpy{o(b_5!-%$b@qq#`So$^4+VPOdwAPI}{fO?@*d?ZlgGOhvJDt?ISvOC@v1(p)}(hq&Rqo;)z4;BRY2|E)L$I zG~*niICzKRi9_upI`)a&_`S)8*vAL`hW90Ypj-DPtIqu4eeV3>{oRCz`C}6s=Fn4c z-)487P9Jo}!2kIeU_OtFG_r@d0ev?xFn7*q$W4PQgcBAi{o4mM9ixAs`siBV1S-#x zbqdaof`VRT0(EE-5(zOT#=s9ph~_P;Y@Qq&gG9cP78SiT_K2KObz6`GryrPUCkV?|xdsfJXRtvq_FNv1AYdGKU2$|X}$WKA+F z<+5&AAfe35xq_C1d~je>)0Lc5(6YJ&USpCB4rsb5K|QmDqH2zrhO({ZPNI$K);rKU z0@v&ipwpQ@FBNm`86qo9{M)I#-NY9i{`<9As?Uf*`7F)oWy(Lm7?l2=@|)PC&#CD9 zwPyaQ7X0NF{Avrn_}_2A9|p$}!i7nd`bD$${S@TA{PC^< z-^(Ab7jV4%@rbDDmP$l$Is|t{BIdM;n3v5wp})wt3?dqOMz&-kTI!Q&^#35##~CICkhs5 z6f~lMZ(~6NkK?RfEb3>lBq^PiG+lD$QG8;uEzdr!Qj*Z8vTY^wm~K}ots3YR`w=vq z#$}|mY#0T#y>c78S{quBjNZrR3Yvh5?6OuTEW|v(WFgZGA&k zDvq0Q+nV6UUI1+i*5G21pucRn;`kZY2TBvq83#?gb}igiEwAH?D$Rc7bN8Nl@IJOS z?1tj*lfR#~yFH;8U;=6(Av-dxvU#D{T{>QS=Ef53S7xY5>@3orF z^x6BLQS|l1!19^C%XMEyX8`fUGHfA7-oYU@`y!))K6PRiiFih+G^h5bDM59@z6 n{?7xC^*u!EA@6RW$cuH38)4-#_C<(z>HBbCK^Sk!Y}WrbLcbai literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co new file mode 100755 index 0000000000000000000000000000000000000000..bb21deb0997861220cb75510125be0f8c87d8e1e GIT binary patch literal 18136 zcmeHPdu$uWnIDlWa=D^NilQirrsNg9TFbOdQ7`LZCy`}2RaK|wx;fWpxM3)X(z4MD z6y>-%S6xZ883&FOr*&Z)H7=J+o8AR|-t~~;aFWtPiUw%@aEky@+Z=)d2OJJ4aP2)% z9B}4+-_8uZqWqxG`R8z1gZk}$JM)|0&Nq*pj~svb$kE3wWo7*Wp@)@xNXpnbLO(md zIzUg3mZPnQIN)y;aT6PuF>gP;y$%1s$T*5D9h!NW%(MmiSwfD)LvLd~kU&5HJ?e@u z%C0$@@}XAJ&F+70E6%dZ0&h{iS(a~hf+pn1vlZ%$<+H$zQp)4y@_IS`Pnj{JUP{!- z%l!=GGs@$FN<=IRKRqb;4)`>45=VAP?2z`8a3KR8dSdVs$Nrd*#K)Y;@% z99iGh$=@0+653wjc49I$F)jXZn;kO@{bh7AF+DM#m`Hx>k7kq8hmXpKzh5xL&m_}x zshR1QE#%GURN`!Wd@_-VKRc6th6%tT1!%+5iK*mfbNp0W_tZ=>-aeNBzW78UlboMD zcILBj{_oiMXJheH>hGen>C_pxTabAB3CJ7Kxk4w3exF{<&88+NX}=<`N9VpcH8L}~ zxwq8fqR))I8J(X_jnAZ~4Ds>X>dXE*IzN}3i;sN~9w9Xv2j!#B{1|*w(D1X3@6#)@ zi0?+9O`J)Nr)Qw{u^Rio%|?R^RD3e?BeH&*eJ?YI+6pDX(=$wEc&`D%Nr|63Nl%(h`Dw%9iCnRGy73b)!e0(cE5V*d*VX*4fd~ z;n)4EdB6YVW02P3m)7@_?7HP{_p^c3Kt5pP3+RE>d>}yc1+MDLfxv~Y!nGw^w_2C4 zGt$)Qb*uThI+~{LnqIe5w;HGm=>7ndy69iE+>P%Z9Lsfe7rqLdmTYh}m=7A|1ohx* zJ{Y9s1h4DCrQm8^uueD1TC7_M)LqrBWNF<3W%}XT0@wbfx|N)jY+$$jsnxOpK#aP{ zc~(4bX}ye`UngX7-G9-&;&eOpO1*OFCzuX87p7Y$g1Rc$)T|)L7l>%ou~05A1aeg? zQkA4v=oL$Oa(T^Ww^-d*-K*{wZ0o241MPtJS@)&&ydHjvzMj^))-Wu~vSmVI@a=YS!mhE=iJ7Q6XuUKv#uB z)Ru`~28U_6Kv2;G0ohp-h~xqRH5&*hm30BlX$>sA;`eJgKiv2G<%$|#BtU1V^ujR?BaP1+rx^2A?+M7;ZTU)jGRox%da>0SU?dj|s@Y&raqS4gy>9|<8KHWB2lX_{ zk(?~6Sy@(GjWVQp9rBlvik<#Q&hJ;Ve!t?X^=q|@w)I`0B@bG#ZdNTJbtBZEUwyqTPNQRy~=)tk0w0e^6p{c^Cr`h|~Q^hu2=qoqxFHrZcoediJ>^{DW zXf@Wz!gjwkXl`r6f|pbq{qF+!$ki0mz;94r&=w5;HAiyI&1$x}S@G^^)_m6H1+(wG z^&!n=4K3(ob(tq{-Y56c5~KcWz_`?D5BsfKajWf;7@yA@5T3+i~!D~rJ< zey)f?m7h0@!QK44MGTVu&0pw*1j-m%7+M($49gkX7}^<%3>^$5h7}B*3@aJB7`hpH z7*;X#GOT9kV_3t`&#;zZfMFfOAj2IDLk#N~HZa`DFz+Ye}<#2T9C82wfOBi1~DKeJ%Kkw0;r$Skd!Q-dme)LWYR`Z@=hCSOnyT z>!Si(A53@!5wJfI7O*TAA;+9>J(y@0JeWQ;AYj>Gf6|BPQ^$n>rcaFss0HjNLzq65 z7LdO$3H+c9?5C0c(In*`NQMRejL83ZvRyFhg4I2i)i*UDU>T229T&JSu&jZp5dq71 ze2SKF?6O5b?k6u(?vt0p0&pW1g>oLhK{-#{pq!^}P)$$WNKH!y_By~lUMcVLNeBMw z&2{OJZvo#^fA8~Iq$BiPa{DAljaPD5T?--7dDFg=@|7{Zax>o!;M-d|U%iRX!uV`v zzIx!>S2|yViO}I~5z}H$j-%iFSk>B179eA8n05&}KIJsbRDGr!7UV!n-UhUF! zpq^%?_VNsCyB#gaMs^jgA3{kn8Wu@Hgxj zlo}fcq;T7ylpnZpaE(urtOJHG8o?J5@Hv350bQhhLI*0e1MMyl=qp zZR4Q<>f3<?c<}9`M}t_XRkA-s4dkt32AWg}!@>wH~dJzjuHi#ls$L$ucPGgM%8>ePo%v zS6>($RG^P&QngpZu>(TSAskD}U01*K1^GJkyK}!5LlOy=-CD!`=YeMz0eyh>3YR2S zc#Qrh?(y8W|A{poNi6DrRgAC3%m@8X+*>-Ix&L_?pWn;}{ZHIiI-j}!RWrU?GavLn zv9)wQ-v6A;=asDgVSNS|e~9rLeaQ(i2&~seD5tITa?Ime9pkMx>)r)?9i{V`>ov&u z8q9o6z}HnepSfOlFut8;zTLprjeL^Pm&y-Gv@c1LrThpzFLz7O<|L^c+R@$WHSc8W zRmnyBiPK5@3DoQJPEovZRe=)BiD9oMp9W1U`w>vV_KWNH@fd}=p#Ui;_OwGn(ioY#ZSo>GC-i|5w zA*Qf>?cbtqv_!RBv{TWeQP8tr*7x^lwY`0jTwkA>?dwy52e=+s`|sbc)pkcCxoA|) zMx#owmFt1Ee@hFkqn4b}F@9T$7dfyIUf;mKHVJ(xTMv15Ml3 z{w?ZDC2NAU|6Mh~+8;FK?y1MIUC*|)e~X$2J;w;F{Y$DRSo_~q6RiC~(=2k`Q&+_{ zU8|NBwU+l;sC$2FR0E&g$9H{Ny|r&)yPl|4YmF`h`5X*B@^`gp;5Vo*XbXn_dLp@= z9yQz3qXc_z81}IZGpA_g*6+=t=Y07UHi*+2)}%y1V&g<%sz zmEmrN%?$T2j4<5Gu!Z41hEayC3}X!27`8L)VA#p9i{XBT-3)se_A=~ac!1$ShL13O zl;L9x^ER6i3*qhUSr!ZN!VGGxzoG4MTQ$Ttcst}aAAXSlZ4=uhw|OA0fi`TPhOK{X zvn~cP6W;EiZE|}}C&Wy+9oi|k2jB|{bDQP%s)GR$G{;H zFz_hpzqJX={$pgwz$eKu15c8Z2A)P9om^aVKn&$q7(*$l7h2A(&00d9#u&%3MTi+d zS#UjO#32@09AYsHu>`}e8S#ik7LT|%AfOgVKZ)^;Coi(N#KjQ-wfPB|HDVGM(}Ilr zmzM<83igbG=`X)5Gy|Tz{HlQ3!G5*{u1{b7fe^#=*M2PEvw;0vC#JvlbD;;*U;9lV z=5i3zU;Dja)Fs4R3?GB-_<0&jIdR@=#6ilz=NJz-X$iRMWY)sHWL>sHV(2#|vd)o5YyW*>|YEbMK^$vapR}-Ok>mIx{z^&a*dP z6;Ru`n^fEE=8px`l-Z=3&Tdjo=QgRP^xKv~S=iRm|7YH&`ks3`Y?OsGB0 z!uan*|3CKu)xPin^E>d+KF88^ica3QXrE*Dp4)cX=a~JV==`Yq+@iyw@48nkDtL!j zRA_rFD&IighmG&KAULh@8qZ9-P`BE*c6~I?}at91;#dd9bonljagkVVpbTd!kAShAjYgPURCJ` zN{&XrwxE>vhK#k2>rjZUbwVMF>j*t}RfnX?`k+*4^)AQ|H*i$KPs4x*V^+wAF{=XK zF5v4Zov+cvhcPSU!FGT@a+b^?$Y^oF+LZpmF2^aLu~%H zH^ZO6n3Y$lWHBqp4ll&4hP1_@VY$gZ99bS7RzBLT3NL>vh2llUgd(tqByKz+hNf#W(8lP%Qyy=+i?B3Y!Gql}pw}z&26J+q{O49EZHrM_#YRafF^b>|P1QSa9rcMF=mCl7_%zq-V1yO zO6N1zE5@vluffc>5BMG_ozGma7_&k?j9C@*wF2K`$ajyJm0dF09JV7~uL7&ra#pW- zKkp}A+E3v321Y*-pr80f#ep#^=pWc8N@?>l)2g7>O3+woZu6ySHT$NK@im$G3T?hL zKC^Gaj88T57214hd}iP5Vtl*Je1$e&8sFV~gKfSUcyaD+Vf;p$2j4gI3br;%vX}Znz8ex1Cph3m-Z;_>|KnDE22cOM^24-j9PoqUJ0GxF zN*=}iB{JLZAh)+iW6>D=j$dxpGMU-AzSdT_(B?;?qcc;j#}i}8nQ3Dm%FNX4d?wjC zIeBJk-)ws33(3)poSqro*Bu20xqW}EtyAtE>50V>ZM~iQ_qVtAj`VbP_a?e~#*(ld zptECaw4-CRBQerD(zS;iO-;i-tv(qA9X&e8WXR2}lQYv3$#i@+kEiFj&yd?pUNO5zFFoiZ~Q2c6@o z$>dz?TzYhia^vCw>n1T{}QvGg@C*Hc#+xvoeD18y20#D2;7^GZ!^1I z#Pj#e{`I2t?=riu$o>JdcQzO&g+IS#_ScHi|2MOf#zIo#1mX&C*h$N-t#()|8FsNK zJ-kmE_6&X>Vmb~n`@d2L74Ce7^Xw~_jFS;&KUc()X7;N^>6e+^Q)K@Nv&W)a`Cnu9 zSBldA4YLonZ%zMGW`DD5tNoYEezYk4zZmx4<@K$}NeE-9+)4-LGs*^=p4<#E^)IRY%fer%}+rZm$#`2^cW*pK;v*yR>6y4On77B;=>6$2etQ&l z?4_n*qwV<2_^apK0k@ja{v0Mn@kohq1wTZZw_9jB2H4g+T##&>9k5#t)VaHp{0VYX;ubCMOAQx}BXj_9+)yLy^t!caiOZoLRcQ`|N0q5B|xg4?Xe8 z!&^u0|J@Z%pZE%HOlN?_|G&Z0I8T_4qDvz$zB}^#e0{+2X_gn)2Ur%&IL5)(4X{sy z{AWHt!}Ig?0Y^Sh^ZPtMqcOHOBR9VkF^u^c5A$hS48F+rm zcKdH}+63Gxwqfd0^Pge);U`2}4xS(4i(3wU$??l@m;2tF@O$tZBRq80(6GMo6_{5i qZ~r;4@w$i8!V!Cfo?a_h=m{^En=ix3o$|lK^8buCbe5qw|Nj8Z`UcAY literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co new file mode 100755 index 0000000000000000000000000000000000000000..b0810dd6b65dd8e332fb5439617011f8e5833838 GIT binary patch literal 11272 zcmeHNZHybod7fJ?xyvPQNu78UNm0DL(~3G#7ImMxJ0*1~N^%rHb}q7OxQ3F|lH9wc z#V_zfwu|JvcgGwZL%p*MJCKYBand?Y>o~Cyq;OF5c<7HJK$`-B|5$)q=ugq2KmoUD zi~K0WzOy?+@u)9e^{asAu=mWoGw(C+zVGbJEN5Rief9~auP?zOE7{O3)JO6d5+wiY zgI4CO9m{5s6aMc)9^?Qq)=gN|cj4~{8AYD7!(!Se5p9D6X-E-yt!k_XCLl1tO2O@z z7_`b1JN2-wi3q8Gd^?t`BHE%i-_ak|`;h2;_>|R1_;6YdLNv)JH!-~|#Silm4*X!}(tNPs7vJoT0NKR{^pXKFs9SJe1L ztz6J@l`H><2HuaWi>0!XF4hWF>D{ZDgn)lk0`H>|1Rj+DT=i%^7=2WN zu}3ACeJH^yjd;Rp%63NlC#jbeGyd&XrykVmSMkMiu~t$l+G1WSRF#G7OIk+3KK#uE z^cUucgt6DIR&&|K0{_dqwzYCri}Nh_Q@E1mqPEmAIN1l=j%`ZGX zrew2!884Nym!P(7;+;D{@5d|chbaD$Rj8D*xt#S}q4(mIm-DG&?&jfMyNh4Ud>pS8 zvJ1s>-ZWmhV}0K*;dubxM}HiQr%MbWb*C0)yAl+^U(%+%EMOlEc>eN0`LC04{2v(@9X88tON zotm8Z7u1Y8mr13jX6~{a1Q#sf8CpmDNsFDh2MS;7 z`#GwEGw6ENS^xhrZ}d5~OuHG!7QQC~&|Jc8j;#PL14_67>)0B_Wxz39wmY_P3=)0J zRsztlGh3$s*AOH1A*0!6k1sV56Mbdta?@Y;Z}?mO?@dikPWlc1M$7MiV;g$1xKr3MC1&qK(D-f{s0#}XIKw#}nC^PloMz9q$+XRi^Mk^S!+61o|!Ij`f zAQ&+G0qAwvzroy(?*SaE!Qk4Pz{%7@8=+Rn>?dS|Hd>*O)lcYcBeW9Q2!?`&+1GM# zJrKNV*w9Lofj<3EW}xg}39dJ6XbV5KzqT=30Lb*sns?pn5t@DIJ57X^oBmfl>u!(R z=r{UTnuv{Vu#RY*Wu-O5n|@{Ok)=SxyDoSI!)3TuTIlsn2gle}8D6rZfg7uF2$hT1 zS9o4V#{6e2a@!3H*%ihuamagP8QOeD6lI6Ut*^M7VzPQwYd{9qHJ3j4nAZ*u7Rru}kcy@7nuy3a3P z76jSl67`wVDv^HBpZQ%ivGn zHRy{qe3;88_YeAX!RA|Pp+7PEhytJ~ASU`$zN^dUq4C2-PDIy?-baYlK44dMG4|dqYt7 zW8m#WmOa3O?HQJ04M~dDB}o>CC1~>===UMlkU!S&`=fQgUl#ZH^*zgu=62mB4AHcGNPlp=TI!xh{dAySWF%`OzmM%JsQtblM{UiHa*xL=iTKJy1@_=MqBlPY53tE)`l<9#Ke_I*3&+Ys zpBM1GI26$#W)F#7OdqtxmhO&QMEBStO9N=P`TVUxjKl+B9pVQ5g0W!6Z#32z9gWsU zN9BP-qq@&Fy3`rFfqh|Jw1t-pw6^I$jZH_LUq{~ax9kB99_#bB*f7V{8Cl*9^S~Q0 zNe>*9b(wb>xNbn=od;+hgi)Fw<(&s<{xKe2Yj1vHAQb2$$Plyyopmtx_#3Judis-#4n>U6X7Qc6Hj411ATYX`Hs;1E`G+k z?$6@Aeba-`8D#wSlUbEvF*lUYvQYlAI>7RP33Y_UeTfLoyPrV*|K-HpOCFK>l19x6hVY%zs=1ey|PX=Q01|n#G^gMp$}}nEwfFf;D|Xe2@D& zo=>v4kH_=pSQ-o5S2CYsaUW0Qtv=?jFf8VN=8DDr+?5d)bDqCq**t&E;(6wpW$Sa- zQtiHQOflE_YZlkJx4>TVY+=r4-?BKLf6LR$9mvZZr#B-jO9Jt8vLzu%iIRH zV#$0CpD}0*uGAS(s+;?a^RU10xf6zo-j*opuvEgP(k%OkJYd*wr z0>ZKz;GCl1bPfny5b)4|&=P&Zm;U_Yrf|Zyg!iNW;qx)VX)7;ye1dauKycc`r7)Vh z!S!2weT2{6$+s8y4)@NtuY->ve2z}OeZY65cfLpmpN;TwoqR*UceHoDA;Kr1f4dR> zQWCiUBfp;6GAS=764BK}LLNSm&^fq< z!S_7aG6LuZ9CjuJgx>yCg8w7VQ^N3YQWzOOCA5+ch{Z{tFf!;91Y6RK#W2J|06usb z4Y&aN0R^W|a5?>gupe;JFSL9~Gq%Ggl2&Y!NoM%8l^-5V3a*oW!3Fz691DT)34~AV z2aZ1AZR}2@m%qib)!ndcB zZwUB~_Rfc6j``dq=KbXP!}lK`{9(dxK1*)%eNM0qLqFra_k-s(YIl(E?(4LBKk!ZV z&e!SJ5aEk-@*M!a>E8J|{koU%4R!Jz1il%}Cz#{NenPOuksvVk(^lTj3Fer?pjXMacZVOla1 z{7`rA1c!}M8lTTd%Y+_!uskzkjA#^rdfOF2{5;o}U*;B*}Y4m8WSXk7`N=dEi_zXy?6qTwD zXGG;vzM`bnf>NoKO2u*&TBI+ki`l{=)N4vcOVt*YY+<3Oz^RO)!s$-2qJW(X*_>86 zS}CWuS#B0~VD7!W*_Pt75ZL*)&sAuJ6;BSG$A=^xX|I{*P@SH?le)u1u6MORPUNq5 z$)6W*N|~`8JXNXIJ|_CUU(?ex1nc``Q#{<^v-CpRV?Q zMdV?4Kj=)qA@a*zJpVxC{~~p^M(A@Q_wC;0oPjf>d-4C?edJ%ik6gQt{9m!0 z;Zfr8?RW4p$nSN2_#=?t>-_NVvHTwAhbUesRy7odLsB?C~#V}1^TxoN#b_Ky_1^{M?7hd>vm`nEpZuTrG@Rob7qY|;YY zLAbDt_P<5?r)WT3XG(MyAb%7ttWWpT^!pb2on!lZMyl6kz%YlYaL&-_>7N5k1+h|CB{c^}E}5m(M}-ZW(5v`WD-r|J`j> zsOpyC*1hUqBKl)ksUuPSxh`^Ar1%Qx(AamVqtEfcf^>#B7W-R*i~66A{|3mY?;+bx ev-_>=wYJ2{(0-}-3S{o3e}m}%wyUyR|9=6!oDX9F literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co new file mode 100755 index 0000000000000000000000000000000000000000..ea8c5c45a2370d6c4b65fa36c5704097b80e660e GIT binary patch literal 18056 zcmeHPeQaCTb-yB?$j2v&q$ui%qG|e>ex_xHex|-`N^w&;wKErRH_y#n*YmlV$8a5tU9PW+pDoGTfY+Y8!b|7kdy)`8el2v7Qw2m8G-`C_D_Mf!;1bh zbLZR-iJ~l9HvbxyXHdWQ&b#+_&pYSd`|dg9^{0;>f67)@*Dn!v*vb2(j-Mm+^Ydd* zvXkTWC`%C+{B0s$;s7z$?Ps?);2$`dK$(|Av1pTv)5Kw>}ZR>G) zAG;P__2313dh(BY$+_&j^|SMrLftoXQyI zvib4s=+w1;BmUQo%*bRuJvcEvUg-alk;#ut-70>=$P5kTvr|*)smaV>_RAxq69bvi zuNV`1xof*xIjLYBKAjb;FzGjSO#z&?zBiXP1$z*o?BgeIme0x=tKA+7` z#lI87A5W%pxqmPw^SSeIcU8r;Cm^pIQ>%l>_%^$kn#_%kvT;SeVoW_hHZU=| z^4M0JGtLdYVN8$bh9~l4RQdFp_I2Mirl+z~>7nQ05psiR(0uUR55YI9I{ee}x7k&t z65lkQ%bd>+=O@7WXvY4p^Vy&PhR+s$09PY}NttRW0r$EPBg zSvDRx-}&t@`|x@H#$Hw?4$x$W8Bn}8cvYO{n}|Zhye;NSMDYp1xkhJxn-67Yw*N)z z>F=HT(f@S)%Nd;S@MDJ$-n-X9_1&uF$yN)3i%2?c<|fmF)48Gaa4M4=$PNs2XEWIY znVuw9#2CpH`cgxgfsUSmR8O)u)7{hAc_7u%)6v_})7#mb>>V7=3}t$UpeBEXt$zfI zT^AdKFot+vZ7lVFOJdbImZ`=z$1*;X1xR1vcE@rMuK_hIAUT%X z@fwiYzTUB%g6sY|+j2k3>ut+N051`nR!6L*x_V<~iP&PFUcR!V7S#o{tbU=ptE)@3 z)P=IDzI+18+EisJMT$$d&F&Y23&C=b)(cv}g>o>+>IH9D^TFVy&%?E?*tXDCZlh(| zthR-6TN^9WcGGH`Yg-7m1uZoQtwQ^+zxLW|>a4n8+l==bPU5>W3uRvPcyx!?ZO*yfw48&u<$fp!=&*Y|<{aF|`cC3@p&ZaT>AkX4 zR+Lz|Tqb2%X6W(0Oh}N(UfFV4F3|077NG~g8u5#ZPI7y({z~L(i3DQBfT~|n6uqHA zF|UAb4KCT7C#nWcG)uvdZUuvyyCoPe1%t6-FsL`S1x>d-IP;3Cnk5zPtE$$}5{Q=q z$Q95VTLPwH56qOw-_iDD^2(x%{Ow|yyb9Mr(rP)Dc0wPt$lot6*wmP%hRjkZtXrXw zRLqq=8T)GU)PEfR^4VkBZV?g+(8p-`+C3hAD0A-MNt;H@J} zAK*cKc53mGrp1byrh9g3Q06O8zm7EQP~#<4jTKc@_q3{J>#SpG7wA$3UC=fQws34Y z95G7~UAH0;&?lx@v3AqDHy$s=`*PPJ&$^=6n0UWN(XzehLq zwvdJQ{F)rvEzWg06cgv|a%hh@PskzCzw&bn22mYH8%H}wiDNxS2S+DInWKxN!m)v) zn`0wK4@WOYAIBz+evZu?0~}j8svKK6206BI3~}7XG0bs0#|XzA9Lrv>1skFI}H|m5`+F!_h@B*&%g^)KS2&wc7z8?tr zEPU>+v_R-iWQyHHySoyAP?xk*JVcTdA)P_-2-dU6t;NtQ%F<;VV^z<`c<-)w2{}fr z|N402J*AW|3sGYKbeh6;u(?uscff2ZNY7iZS{>kDPbE=jGdH( zFR-mA#|9*96-(qZJ|$*>+DU& zb@Dps%f}XSp1RIB&s=Am&s=AmCvP*(Q@0uCncGaKXKv>iH}4x8$LBUNKDP0*W|3PaCt)BChYjIePuQjvZl*(OY~jO-{pRImp9|Pd@gde>}T(Ny1ww8 zcwds7R@uYW1at6yCm^e@cUqRsGvk@_UJZByR*Tg#$JP=7wuXSYi&&8A>i@;h;guRx z0O7S6bh-SB;tD7(FQ7M|l>Msm_-k)0H5^uFaP9d!Rkbx7W#y#HC4 ze_T8LrX@wq*{H7hoqBwpuL)*hy)zH%9Y3shT=4OO?PbuV8PE;rcj*ctw|>zCf16z~ z#qZUXW^YU>>jCNu<#2%cA`q}CN7=cuJD@bTpf7al3qSY*ed+?f2Eax@^lO6)dsKrD3!99=ucOt61^p0ABB#bUa$2iC_s z1164tpmXDVQ18F-m!HyZ!PvX_OWCcEP~F`{8T0HEC9Q(O&xMfL*1I4t`# zY6st8joCrdY`&xH+}ETj(9aZ?-ADZ{gP)MEiSxD8z=wVpe16WS*1(5;7ktf}ueAm~ z^gHF_es}RP=!54E0xyoIFrcqNW8(>e@aBe?;`3^3JZU~^SNX7y+12N>$w%3_ywj&N zH~AFVu2H-CxLw=0-rH-?yV9;9&KIeHuhOpDINy$Y@+tIr>kljJc@@Q0f0UiqH^H>u zpeXgwk2bSw+0Ell#lyym+s(!b*tP7I=~!{Xn37uIvs+%@@0GQFFY$hlG58i@W(S$2 zF|^ZS2r;xzbMbiKlHae(%`gUiJ{l{_#I0bg4A@{XbT2cM?ah934$9+u&_&#hSPsyG z=H&uEp_&VYcjFi>cMe;%jj{=v0aV59-sAjF*zhSTUK@ z)sEz99N)BmEkB^byK-Y3pNK6d46|f(>y}}FJ_j`GK+0_G?TeTC`eMbtK0Wlf&fl z#Y94H-3K~th~pEn&uv*Jh~qca3F3IrsdQg`8aL{*A&yVP%An5)0&)D7>;!TAraD0! z4?0aE*M0T!ZqRGNmWZ{AaSOIr_Zueo>;b;(Gq>COW;W_$n5}kWCM4o`@R8b+Fu`wN zFX#)@f2nvWm5LQpDLvGiGQ;-NOr`Ha`x2&VPe2@hY0*JSi;kkq;&{yoDY~=oJ9LK5 zc7{^I=`7m1^=&+^4;}0_b=gJZhEP&=?H1?VvMVOeQ?hH1I6oxASKI%;<9dxh|4xok zj=MPO9Cve!aooePo#S4PagL91OmN)C(crkBW0GSB$4-u29J@L8a6G{AAjcHPUXFbn zALn?87EHB;-Ftzk|Lh^O-@ieO-@rhi#!&&yy${_$iHGfq{=R=ox3ZogglG+ ziW9RC|3h1FJw$T~vplCT8-=_8m76rTFw1ibm!FhS7bri9xr(PR^BlwF0SR?e37Mq% zh0A$KL;h=X66y-_f{x{{y)3l@p1$^~gt~+LLISSOUi*%e#PTZ43K4`v3e}OurZ2iqf`l zZbtvlyv6j)zr*xA{|>X=3-2)9X5L}8%fHKXI{z-y>4kThPBZU*hqcAW5c)d*9@Fpq zdq0yl&i`&P|GUxuFTBTepLvh_9eCK7o=O`6tKKfMYHHuW#2=kSG839seH8KZW!F&p$Qp4MF}XZqCMGE$d9g=lPm- z7QWw_hwry=E#3$p6?DP;Q!60mpHx?Z#>*{`9By?PirSn|TAK}}oCs52cn*h|FT!D) z=O{b(>I)TofzOEfCodr8pU|&f7x)bP>TQlF<#2-XsfQDcFOjgRN7=c$ zJ)wB_MHDZ5&w{@2a6ZgGAs^*D#RkS=YLO^;af!uN>McKDvzX45R2&8(};KiO=$Vh7n?n~C`+NMreE{>g!H_-APT zN%AQ~hWwKqo*(m15+LTE>l|^H0c!`KL$aL%$=R%KhHT#~?mG=AV!s^H0c+K5c=; zLalbi{1ftG{^?QmuCy!WpO7z717D?GG5>^on19-+-+l5=PKEY4>_@_`61QtTw`*Ay zVOX1@liZMsQqgrQd5@Umf*rg#O-G+3&ZeFTQ(3zuL+9cGtjH z>7OX)i`Bqa>7QMkZ%++;oB0R(eLL{t_YMipPy0RiM`;Yu`9x`k@01+(nOB`MBqBG? zCwsZx`)bg;(ynpNXVkz~Y1hX%-~Jl-HnVFvDEhsd_4^=yM_|7Xuzv4&$bO&X`gWtf zUg-DU+WLJ5=j*9~uhKuAobNyle3ky`;(P~d;M>eU*zZ%o>i|5&`9;5n-*?e|?`U>E zu-}6ZH@ruCx!xbDLGMbt_Hn)^YT&E1>*Jj7!!_`2X4kT>s^9xOYxBJae)a&18V9VQ zY}??oUAo77k1@u@U)o@ul^=XQu)zm?;DbJJ-##GrJl)_Y4TK%| z6Er;P8IWVRVF?acb0gzV`u}owPyRAENR73R-8Uku;L< zJAAF(EEFcE`u6XK3v+tF7@Qc}e=;+aofxP4NG8T6rwiHrqoe1?_D$v|K9wCTX!(i3 zeFqI-&^iw!JG!-l1F2*()6v^~;6P_*??9^iU~lGNYA6do=kM+s8tm#C?8*%E4)p9L z$8+Pb&#F&Dg^nLT!qw2)_m57Dk7V=d$xOk-Jt*m^iFCn)Ju3Ohv8nW6W;{JLJvlj% zFF=XGbD5Fc_z2w3riZcv(!fu)AbpDh+xL=SH(r`=|1Q>olhq525b8 zt+^_N*907E`#Qu8=1(UX!c7_#$2NNIBt^0RkKc=P`Gu;8>@5+zp!sT&+}aVo2v4k;c}m`zWhxtx2kx)%H?|J`ttwC z%#BTtK^b@x;I>498}$mA0YDR)VP_0U`ct_A)X0sK zbZ$5e+h?=sj*f%9=_E-H45Y^=(sVLE*vW2>4~e^ju;DH@4*O(>Cq_pno*?-EcdWmCu;O-CvLg-rmCdE9Zg;J_odV@nmBrfK>u+;q>9gQ%(4~Sr1*M-e;q8(h z$iizV-38hm5TYzRAs19usaCgl9b<2`J{{|=*ITVn$9ik@f=26}r(?Z2dRNt3)3^gn z!k$8?jFJkSdqKx7?gBN&r^nK`&+_4F;1=tLsxgqw$=j7sOJXABlRGt-&lCpDtyhJ405`o-6~Q{V?e*a*=d**;baU{-RfGNQ zIjCBI#o6d6!F{rm({yX`YHz5LQGZv-9?F?p*Y5$FP}4^~`NxMp_R){5pR6C@3SXDF zEx+y1vj~;0~yZuU1iC#47?tydv78%L{P~p{rn%Ru0OKAes%fk>yuEtT80dwea5!-x4LWwZdJ>$ z^j7Pi=k?(yLF*1tpRJ$QZi%)8ej08G->;Q|AJD*WhVZa>M?-t#l2@2h^#2mbgze#U j_4v&pcFN+FRq=*ry#}}LRsZsbSe+%&(K&_CTQL4FshjGh literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co new file mode 100755 index 0000000000000000000000000000000000000000..9efb71a67d7bc9d719f1852920b0d50fb42828f1 GIT binary patch literal 11264 zcmeHNZ)_V!b{{R5%jJ?HDT-Q?G)=FSHLdVsT7NCc4FcJ*Q(vJZsxJ-F)XjQHuC$CP zlHec5Ii$);;)!>GVV#XjVdu`M@7i4M65L&a;))y!NJ3Gx_o4k@mQ5fx{NIE8NB}X`og&o_;jbwfLy@+_V%8@WZGtJksRFO!Yo~fiz@D)gJha>Tyu9lm1x0%rEEuxtg{8 zxkEddxxWi~xWC;d+!uZ(u)$m41sWuVw1`~L_N!1(08c)D`nP9(5259s>hoE%qNgqz z<${r~T>D29{3xZ*mC9PCSSwVO_fvX#uJT^}rzt&~EgO}JRw?Njow~zp2^rZ?!u8s8+v7&6SI_l2$S1=8ZyCo6TJ|vKsc`-(Ns~ zVUI`{d#!3cpPMU)KYvJ$70&)RmDdY%HGR(b(TgRc@a(km>?bXecF8DLa>c@R4t<)M z*Dq_cdA+K=Q7m7i2H-Av7{dj9-ncuUy*e~LUo^DgN)`CDIlXGsN;8+frm=rBvtJw3 za=E`umCCtGP}?%`!5g5DQkB*uN_|2Kl~OLBC-Vw@kgB{npDyO_er2cKr7mVaP1Opy z*S?Ue_k}+E@g7>i- zr+z|LgDN=QsQv;z4p;%%zCl%A#&RSfwqCRK5+i$}kiURL*ikrJ_UeH@-1o}QU;Z1{^@SJldWRou zlLMZulFhkO&!e3-gb*Q3Gjk;^Q_E$wk=YSrTp!9z4j(s0#|L$4NNO%uJu#8h(?etF ziLt>+eROPi`1r)o*wEzA*yQl!;ACc2&+3!ebb54rE8{N^;aOHk{mF{$xDN_1cKr<1 z!4Yh~YOnwQn76uw4cl%`*ud{(0oq%*L)Zx8GN6JR@WRFbE&~qYvQyZY0GR6HHl_eA zC%17La0PKn7qZs7oT-I%#3kO|xUw$SuE$ z`g(XZ+zi`#VJp1a42Owc_=dF{4zGL%%3M9N8fixCHW4eb+KfaxfUR@??REn4ic`=IP;mHt<`UFn zeHZe&u^iA<@?TkRN>ZZPY@((p67+emBNRrWSG3%g8|?X$HN1KV=zD7}bZgCdC4RMm zLWz1vRhVbC6|l|Y7R_ZOD-bNR5ss=>IIMX4!pTNBoT!JxYIh`TdL7|~+p=sn zWT=;A#nTr`HbR&yq;~g(Ovw>iXre#0`xDXaH8=YHS`+;M%3-w664npFShLWdt*vr$ z!jhwABN|h!XjG|lam9)&X7{1~WTU@7QSa|peNXh8P4xa6;t{IH<7W54XtEKFChF0s z>f0ZMx*q~>7b5lm54Pu!l58kSqOK^a?~np*J^=kL375zf~eP?gDVgt-{F`EaRYzBSg_+a zkZcSLB)iVlTxI#XgEM#eEdR6eAR)6!%k%Q9M8~PVpec zrXX1G{;-A-|JGpLLul=;7jKZ%*GwkGhz$GPmMLp@dlZD)ea__FDnjl4XYv4SwC%BA za?eu;wZ{eT7YMxr&;7gbvwskYHjX0Y{@oD#q(uWZK8*%#gg;x@cpB?j=v!;<+tT{C z@Eq%SKf-(et{gQGeXh)7{u2i9gKZ!`kNKZ52>&UgpJ(rg`JXd}dD|D% z_qeYU^QU;+$20TicoqxX*Qxn5kNbFTp7b$ujpH%*OVrte%$R?S#-OtT61aL-s{nK?foi< zZmjj9n`_Rc=+$PB?ECilA~180=LM_jBWHpo*xwPzoKC^AINySANw^vkLRO#Ew?xho zA##R*wF_DBC#|W^KZ7kbECIr{8FssalH?9aZa<(uBsGJw^jjZ(y6!nCFW@ugf5#eb zr;rEGckmQi5rS#}*8O+l;EO{+)8hNh#eP)@y42({JrgX#d1o2UJ3%<_xZw$c??tes z7tjkBbgL3V?|t3_|M$8RQqZqTz5ayMR6}+wq>~{Mi%^J@&XBxxI3)G<;aI44EP@aV z98)*&c>ucsaa=uah#ADy+c@-;z(X?c-01#6D5v_%kHW_lfW>qQrxJI2JZvM=apXZZ7ho!n=gzvH)jz zK~NWYKC#U6s>8{fZg`pjft;ZQF@U@X-3SC==XmdP4t(DX1OgWP(B^yKoXiCh^*~VN zdqq=XX9qX~gB|7)$0-)anO!}BfcPO0;KVZ|FS-H}*av

v_U(ZW#~t$pY21719K`P*ru;F=Z_lTo z{kv>WevMGx1ADN$-LFx~7vBS4yI=QHzJpuxN%lB$o|MQqk|fS~hUA?t$sTif95KK0 z)UQtJ*QS@v6(5-^UN4y|Jj@j@97AA`yI@Xnz3}w$Ta0_W=e+d=2jgCVac?0D_O?Yh zOIcq&U;5z7G4SOW*wGkZcaAq`#)C-0KMirR?`i?h;tNAau$9}XO!2?&H;Xv?YpM^+ zm!04bb6Y34Y*aG%enz491`c01q{mY@xh@{WjsAGIH(UzjvhZgJUp467#*L~ z$0xD|{5gGeB%2u-$&Bdf$@JKhXgXJbd(IOIHfVbKG&MsRIF>IK=8Upd(yJ!E1JWu* zt!ly@QMoi<(K32LtJF%RVz~+}G8gr^TwxCC4J~V=YjawzFk95%Rz}m|cBfd;z|PrR z-l!a_lrx(ww+p*4cWZC9q_AB;dT_78Do8wCD2p#i+R}bI??QEU|4!?YRDQXm{Unv& z?vPJY`S&{He?a9`{P%>mbkWYc(65LAt(vb>xp<%@vNP{c`CA=4H>vzaNBbXBd7wl7 zDU}bxZ;AHwODezJ(f%K({C|}8)(HIzl@C0zS^gU;=MQa`!x+sJ>jjr=&=Ep65B?`$J~Z5#PNVL2zF zsb@C7K^f#*-5-7z?fiNXyM?@EL}o4GoP?YJ*5ir?o;+vlpe25mH^qvf2z> z+~x}Ky~S)XpD(_F?b@Vpyp3;Ua7Y{4Ts_fIJ-n%U6nbx2gSDaiHfu(q^``34&DBtR z!-Qo}OB=9WY9pg#+87fV@CsDW&N#mFi`GJIUc;B$yVJnmwd_hGZItmR3A@PBQrv@Jt+xM88f9>?|zi{&TuRgnZ zW&6LC@bZt3TJ~}b*!dmU1$H2#^c_W87oQQBK0608yg>EvIS}_H(lR?A!r#=a;& zrq9lS4B0-$>X|;Ju|EUB+^n9R8yR-$Gy54H1YKC-Tl(yr%8;E?S%3C+$qIl+;K4H1 z{{}s`GPIzsJx#S2Ab$)Vtk2HT?7J5BePio8MyA(cz!=rV?|>8A@7Q;a_>K=Jb`EF$ z3lK5F+as1S{g3H>$-Z}->FBSch@S7%e}ND){m%BC<#W)yQ-&FsK4E+Ce`i}Asyb!3 z^-lFKQGK`?+?1I9WCuAbGJG4fSnS&geuW1Xq_fnq*xw2~%>QisH$cXG57}0lc!Fff aIkhEz=v}Wt)mHjNpm@p$DfGg zv3Ez=>5qK2V(zrm8ycH>gJM`TFjWFQB6BCb2JweFa>vAPm)Mas6(W**Wf8l)+cukZG zHBHK@^{U~2S(LSc{?h#GqMXZVimpp~P0lJmEtJ)aTz*AV^Lbq{$jhQ$`ljM}O`MU- zbw!#kRdZMEE>fnR&nwzz81QV=064la4?-I?5ZP9EXfk~0+OkNRQ8c}zR>DQjPAdDyQfThx z&rc@4{tfB7xQ}5)x7<)$70c!tLr>P&52lDnQn6H%vh`9l%*mPIL?)StrsQ}c78^|tCx%mCFP4g?vUxctr*fH0eC$&!O)y;Fq)x`c-1F()+eo=iaN2zLvO5Virs*nt-s5o`lSvE3##k^pHJ z*GL1ph`Ds8ae2w)JhLX}h3bRa1=SBV05u4;7itJTv@jfHP3Nj^y3`Q z4-j+eJ&yR??hC$!E?<|q+1z|CNCMX{@Pu59LxY(K9`b|5 z4)VfclU#wehnVETqDd}7`_iJ&&bf*I$(wF6*X~E;=i15CQ#XTQUO?;_u(3B{BYgFt+5i0F^lVsYBL~O_*LOVJ zPaXz*=v)2dzN;qa=4bB%-2}+ebN%En;P}OUGC+9)pgTf&k5JzIly^Vqj#Az+%8O85 z1a!wK?+cW-hw}D-Zj$m&P~JYu+XuQwDep1L8>GBJ&`nX^Ny_8N-)|gn1%ikZZGoT% z@QAD5!lP~dK_9e_yJ8kjw#9;eXg}&2w{WU$JlIdi?FZY=`0S+P?gX7PKD#Jy7wDYv z*-d%7LFbImUdr1GI%j+ypu7h_=Zud?c_QeX@fo7LA<(sqPX~=pCyh@BjZY_yPX~=p zCyh@BjZf!zkeE-t*GGb3#Cu2(p7$r;+eU&B#O)-w&46_py&8vg^yS$oj4#5x^o{kC zA;5iK>nFokPlN8;v%{cEgACW*LBIoN`bnJf;-DL+yhD_ipu7a=F!sp9wTZkWbk6vUQr;-&obef>yfM%@ z)10?IhSA&XTSWJwHQTRVH94Q(`P2WE>1dU=nBWBgg(BpDWQL9$I zM6J~7hB8zx&r}9$n)-~AHNu*j9UK$ELO3=W9gc^`GRbIE9!|waM`N*6CK(?~$z#c! z0{@Q2M{?Pbk?e?^No5jy$YiMsdBXi+bZByNg1Qlo43*VtLD8g|Y!orqBI&AR6d@O* z)hfD_m8+6ouhmq|fF9XtxlpPWK(9zSB~vd*rD|T4AparBkVa8;3H;2L%8EXuYuVd8 zw;JnkcR7HqFRWYvW<^;(ns&42vz@%{kngbc?c|pZ`Jg5LsQI{Kn5U@R`rmYg?s+QT zvO3iN>|5R;w+Gl7Pj z0W88W_LYr9dZuImqf{kX$kLUnkTK7z<+6GTN0Krbsj5oWb2L1%<|I|MUXd^{xGn-F z`qx9qC>kbm*U7T(B`0NMO)DvDA8$n(*3KQTHD6~XO=q&%HQ$*n`)z`vyby zzQM+`wt3b7+zl0FZ2SfW1PVjbYTf!t-@3G}hTs?Z%dai$T%_N5m%rySp2Gp$_K}PC zfQ6j{>^my+pS@49__6c!0B}&o_#e`9gnjq*vllNk4#hD%KSI~RODlzN&BkTo7vQ25|F@Lyb|{_v{{dc0)R+JO literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_splitk-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=64-quant_type_str=per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_splitk-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=64-quant_type_str=per_Tensor.co new file mode 100755 index 0000000000000000000000000000000000000000..f7d0da2761402b32324481d3634a5472c2c86bcf GIT binary patch literal 9992 zcmeHNU2Gf25nhrf@^qpoik1}FwH=0GWSW*v6h%s;>^O|0ICi2qiu2PS6uBJ96Umq& zS^mgQfhv@&*hb=5jsp})`_KYynzX5t8fgm$1thgVfj+cAjl4KOo4N)1SRi?5k@g|% z%-yUcpCw0v`%=IwEN6FSznQ(+z1h>vz3|}JL94}*5SUSH>>9IB8%cuNzw;T5u_olj zSTp>#vo>Z2HewTo_y+nJDdSY6ameO2iFF-G&_GU^+YloTrodnVM%j8jt$U0k)| z*ZP-8zV~`sGP(#}gqButef}UOeyQ&@*j3A7HJ{5A#9!RN zN5JTh<-A(R)YOdj_G2Zj@W7b&flJFQWlAeoa>c?0E4wUDswb3*yjoR`7t6UivGJ)V6#N^X&}m4?{YoyCb5m*+M%-`#>>atX ze0S`)WOOQ}Tt08yFzhY4^8947nExoaL2L4{^kunL$W0W>lls(^mAPBqmunTRqNJaP z3&^Dun0M;fZ@}lvQ^;O2I;*Gfo_t)L(k9A9a6S4kaglEFDtKM1egM5ps?Y}nE8P8k zeVK_9@kp2l1MXD|MHmPeYZQ!Q9vegzisz#)W6*9O?KX&P{5&A+lgMWh<8heBL8QEH z!?9k*4ioQhi`^%${p9iJt{={Q7vmUKoD`0?L0z+K(D7`8<6wc9qGWR=B~{C%mBG}| zP;wwVsK%m^!MGM)a|$nNAC^Tv&|6Ij1z@3A>>7C*0U9QFUjsaxzz`i!miCA_9OQ2hwE*_XPp z57dhT1p86|`#?k3Z?Z4NfFvx|r36q5t1rH=bat_B6{qV~391vS3#uEc2Wkh@E~wp5 zw?W+wbthCWR3B77)ShYEY}YwaU`t(~^RQrD0>V(QZO(Jf(=IJq*o%vd%`DnZwV!jg zJL|3W*4ag7?U}axjb1s+oX8Jl8C-OuH=JdD7@+HLZL@+7=Zy25#H4}q&FzBws&9;3{FgwR$-WP$_ z84qSVZNX{xto@wb8k=@E9N%W3%-yhGKq}hBdb6V$Y`&)36xsQOX7=hrgI$2WlhxU& zg*rP8{WA-8ht-&5uV6=dhu@&x0fTPs73>YWz0S}l4oAHX zmpld6pz9Dxhm&-KNJoTp#7M`!wRLze)~Emd)46G{y@`#oe_TnN^sxtl9ysk|cU?RS zvh&k-f$U-69h~>EM}QuB&By$N^Mh=RaE}u19>U!NvLl3hjBo+M1wb}VxW@^%n{c~9 zc9d{W5U!VSy&!v%a8D7gk8ph;dzx@xCY-?j`r)X>(}DEiCQpY5bj;$@>5(R1hZFka zmatBbHibLf(0|f$K&MYP9q90pT_5N+`{!=5dpF3;{^=oH56H~^*+sZrAT#?XNVp)# z%>LO!xIG{<`$s063^KES`U%$$vNiqFO#W#l|1^_-TFF1nEf7+z`mj{uw6RFv!gQ86n&V$jtuPOSrutGy7*h;r4^f?4SDxcOS^s^v_oE&kpj> zR`Smd^3PWC&kpj>R`Smd*n<%Nx3i9}Z$M1+UJSv0iRtldC#}t$K)3J3c=jyF(t13L z056pCu_(}R2IHB5!+17CxM9NGLpY3QF~Y?O7a$zQvk}7WCERYpVLaPMxO)lLOE`>Y z`w4d+;ra-N@eJ1+=ZW_N&J*|2PKsv{if2O<&teqMMkt=`qj} zZnBH<%cCf0kt@x>lcF6knaC9h+AYy8sRi0CXxA-y z8y=@_r>WH$INt&NF1OKdcLnCwvFD6-OHRZH7HNw1L+OJXdtP&x*aXc6)Z-Ztt-=gy5_o_-CLBO+ps^XNBv*=Um2KT71zec)w@0 z_M8^%unuB&#wvM*7BM(4iaoQU=$`=@RKH*Y8T4(?=*nM($2ozC`K$2X zqyu%n;&S_EC6B+^=E=@@JbrPDJKN-X{aep@aW5VEvnWW+Zuxv+7HqAyvs(Wdr#s6m zE4tx-g=O9A>6^QWzVoQhx;cHPZldpX)MwkAzB4z`_kGkSY);=xH_`Vt>TBAZzOy&c z_cPRI-<-a4a8FxXgy6g&^x*!p+N`j*tbQBN%TCybj{H^l_e9@2*5&oJ`Yl_m*XM!~Y;NuxIPH>un$0-UGMyz^3;A9D8~xQKNR4zWJrK;X?G1WILg9dMSC^Qi94ku%wkU9{L3=M_D z@nkGA99M^9X$@XSMh4TV!NJs^nv5r-yV+Q-0C}PjFHUG|Y?P+q4fN-Wg^X5KN@_KW zZ`zegQK@F(O?;@`1~ht1y--^6B(b@(y;0zNgwt3bJR_&->O|Ce?6op7KscBB5s z*5Rkt;U8az{~zRAMOLT0F297AfxlK>;eFt*l~?!}`D^4Am|Q4UH6}yy1X40g&J=2L zR;^?i{UX0wW^%chR;wzLtJ;aGJ{nUkz}E8RB5c@n8n!Ewla&gSjkKwpsu&3BVloDj zPr1q3B#ePO5R+aE^FUQK2{a34jFv%}@?5S8h+KiCAV-rcK*n^Un9moFqa`JoR0>5! zzmEn(8%|ObYLg14g*FAiMDJz*Nv({DyiKC?Ye_3fwOr0=8@IPA38$yAHat;1Wj2*c zZ4RKP4lAj$T1{m)jD-D!$vJ&~;22ELZ3fZLTJtB8K(0}H9h^g zya`M^eVt9b&8*^vXZHoEGG+9uNA7?0@B{o-za@%c3y)X~BUsGeL-6k(3@QGTaD726 z#(IwD=Q*d((r`S_(H8vws26_Dw?Xe;jYg5<`FYN1fW~t<#}ivmo1$(m=jT7CW;~xi zr*{Gt(t68y{=C5{f8OBs^f1pmKzBh!9=E@Q4uQgHUGJ^@BvzVws~>(5zx>$3&qeyq zXZd>`jx$XlN?5!GEc_hc-_h{-^XDn2MqE!{u4S!_;){U?kH3a#bh4dbDQw| zJ0bdR$UNTMKe~oaw8QOVdv!76RS=qaIDVt}DZ+;kw5o7?+@zd$Ieh^zeC=1kS!jpR x!j{I3abGYQT3{{p6uUvU5c literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co new file mode 100755 index 0000000000000000000000000000000000000000..f97220014f49c0db625c35b736378bc18c3d6c03 GIT binary patch literal 9160 zcmeHNU2q#$6~0=@Ygv&U$JmYh6MLfw$93biEZec&GN90uhN(l{ABF)sY*wq)Pc3PM zR&r8)9NBe#U^1r7Q#%dAFhgl*o3;VUqyuAl`qT$_Nc++b9VRdXPw-F%7SkFR z{NBTMGZEB?joRHGk^d%VTzbh4)qG4+Z4qTMZPtBTN2QE*z{Qb5emmF4`Gr|eb(ad;!Ts|)k|G5~q=B3Fv`M3`O&Ki#g)TG3* z@S4Sd8_3J_khr8KYX#ej(2zhLIezs1g)cMqmz z%_!xHozKf^A!%M(dqq~`@q%ueikVk6{in%{5mPfS$VMVz>P7avY^J}ZdtZ{5)J#cN z7Sp--728CKl@bZP@G%a&x9R`_y|oAex9UK;RR<%t>L7fp4rV{r!IS)nM{QG2{S&_& zdqT0Ef6eM>Q&z{CvYP*2R&U73f>FvVrk>2|xuTLtpVH$BGU9g<`<2CR;Bl>6&7_k# z@9Q7YBVgfYaz@Q1OKMWT{9srMj>m3uH1~>^$WRV>ZTHZ z0&XC!DP~dC7T<)YTnpjoRl89i!mIL0bxBVY448VH!{`-SR*Ep|dhrctCAHib2)tmu z85>I`PR&h4`0~SN)kSL>EKwgRxNdg2VHKnO<1N}x?gafFx5%QjAV&%LDC(Jy@pYfC zyD0h92lc4Lm>*QA6YuH#Q}AT&{DGMh)3|Qpg(11&YE%%znw<7#?cnh+MMfb~-$#PEAK5hi50IC*~%m=OT0AIW3{a z)wy^qHpREhEwM{>Yk`!TwEu0fp$5zg^SWMTYcK(PziVXo|J0RT;-)p8AZ}tzWsue_ zxW&x?wn2`dfk)g7V;kfw$f!%$jDln>!sbzIgFFVZ400Z%ikIk&1F3e_d3^?}F|=)c2)?HUGMQ#lP00xea8k$e!6~WzTN( ztcdF^V&BF5?hBRf9`T}B7S}sO@uFAsmfOm0VE-e_-pej+w6Py<^mL=m;CfXY7_L^U zaANGbz5;P|z_=Yl0$XE)(wV`*P{+_fs$*4L7X@j#%+79<*}07>JHH{e3U1c<*o_W$ zwzU(b=WHum_{I%?cdLl9J#8zhC+vqv(f6BjlVWrLQsl3eA zDwZEF`SFvV0rDd?_S1bOwEUl!k1cnLEo_+m?RwY2AUg;$^gxjHTv>p=N0)n`?-1xl z9tpBBkfUD-vH;-%&?ggaoN&E_>xI5B;U);@VSjvQ*wxjI@=!}xw-==B3R*JU670T% z{NB+Zp1tIEZ-aRH2-nvjo_@mhZx>G+#nVplv{5|m!R}A;Yz8Qnfd==mk7C)^;2tEx zNe%8{KjHRocMp5$9(w2=_Ru}_1iJ^x?_h&?hRE+wgLs033pR-70O1a77f&z6vybBG zrFixQyI0%VRu-zO70%xkzaKcvDnG`x0N;Tfb2tDz9k4ju1w8Zj2id1Ub_^khk3rwO z#o-aqA+PTQIs71U*v27;M+tWq;d%*&97gUV4!MswThJjh{ZJpF`24mXU4I6Oie9wiPRLJq^)*vY#5--LBz;K~5ZM`#(j8KQgr zG)R2!_g$HXzJ+DvCXSEyit~Z*y+kTni9~a}^?PjuJNy z5jSOvn|3_NO=mnqyVC+-Lh&w>IUcw=_@SYKe&j4}t z+>(e}LE=`(;+E}~bE`g{z2q0U<&39~;z4dX3U&)@O13jsWrc%km?5dxNO55(;ieD}O~ zt>QW&EW6+w@^?WHT=&^^7}wY07JOCFYwi8dhHnBAf+U#xP%U z=4}`U&c|oocJG2SN2{1x@pwY+7Eh|h`^^$%<^SnXR^=S&e-qrQe~$FObm(8S^dD3sTg}<;Nsn87 zeeqip_d!*vS3>>sq<5&K`E*^*1jma_7WU~F)Mb=CkK}}4pLMl0(EoH$5RuD{K zGtBB1)fmVW3}e@9$&|;_ML?u;OoNPTItO1|5=JItoWzloSWL+oinR_*%rWI;3SMR< zro&sWOio6WFjwihI9Jp`A`xc9mA2hU^Fhp9a%_EW*B&$uRsfkS|$ZQfee{ z=&a}L(Cy?YHXYr^U2pFh5ze->;|$MJc>^Z#3TiUZIY6vy!FIOw+E@3+ht z$2;wv?S~uqL=PNqd%OADiC3Y^sYB~#@k@jcBdD%${G7u%Z*qAWFtul2hkuO-evLYD szVWEXr<3#l9O(Gm*TGq;-)=oyuY@`um#d$KN+Y~$z_utmI-U6c0t#1UAOHXW literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co new file mode 100755 index 0000000000000000000000000000000000000000..7a79468fd45e7f13ba8e60a653f12fd37703c0c9 GIT binary patch literal 9160 zcmeHNU2Gf25ndjt6Gc-Ntw=HTf1+7gajX+1QGW?g$gq<*sN5)j=nsNkj^yztB6$Rl zl${opPmC?2O&~DfuxO%bKVS9r+o$R&qu%^BeYXxPnod|W;+kNvHbu;yi#e)4f21)Mb=cW9Rq z$HHq09j+rU(@Wx#s;m+EUV@4Q^4O^p56wNv*mG~HxkTDf<$0}`*RsZoe`2k#%WA4n zRN{I$Ukbb`tHqS@%JQ4Cnn)Bi!%&QZ8rOcB%IYyS`BzV% zFUOu$toOgw>%^{lo!nKg+5gq+Z8=rc%LT>IQaLSOQj(c7T0%ia{6S*BvDghf-fC8} znN;5U>rd$sF!wV#tL9T>HKkpCq@d-G&q~LyY^jt5t!QNQ{3U^1m2>JDC7D%A%4xkg zPY&Ra9{3*eYEJv$u=4ehXinFZl$OgWF|`y=&n-Nz;Llvrk`W~X<+pO7m|0Lu+Lnj6 z9)P_r8(ZJwz!ke<6f)VY{fV;IWaHUfOwWFBWS67K^NFi+IiE@D#hm53a?5wuujI0! z8A{?=cz{e?F-mHD{vCMBTP~cuVpnQicuPL5E@;W34pWbFII~8}N(p9NE4>Z1R6IWf z0xwu^hK5qfGt<#gzWnf6bjVjzlKidFMO1U^y9jT7rNwzt6o72+vT+HwjaD6rYPx5 zL5Y_$2_+sG*T&=8+kBVo+8il2Y5&{Shbqk=y{@;|GE4y9@9OpYf9gz^xM__ih@0q> z3DSB5x40R^I>gZW&VXbM!sZFAgFFe+1UU=R#mtSfn-@1s!MkV* zKDhjF?T0G>S1Vj?aNWJ=t{hxF?`E60-T%%lY=U5jb1w&011-J{7dyAX*wTi3rDfIM z;y0Vk=E?>W2EHqJ29^auT4ADuW4Rl87b_ky>}RdlmOYF$gEWr-7umUY{awI1bMCsE z`Q6K`GwkmQrkh#d+7dk2erRvL7n-}f!~UM`w6N@7^)LBXeSV+0&)iq3vhS}WYen|b zdLw&zy?sesZ4f&z6P(w#W!+@w*Q@Npy4WbVS>UM~Eo`MRfYP(l$mYIr!{62@qTC;B6sw|WV(eIL zY&7xXD}mo8`_6j%1^)~F%EhH7zh~nOw!C54@sb@s+36-bQhhsZYvINJym4}|O>AHT z?4xU~hePZz$ncj!tbJ_`+MZZ!hqj}j8+<&(hCm+qN{9ss7lbyMaKnV_AY2EuMF=-S zI1l^Fdjqc4Hk3yjTHCxJWmm|Ok%my)-DLOfI{q9ay9ev|(@D6_I{tJKu4^ZMn#iAK z@~4UXX%4l0o_EttzI4}lh92^zr_M7-gp=w#LqFmAcY1~c^bGCv3?`%P+O&`X=$!n+2H(L^810qY~#ncPT)7N zXAXA*Ps0|6!@#rseIfP*kjUlswOMGJvp9SNbOVos*u5Z;%fU4pha5(JBM$kExDLW0 zhmqfiLw+L;xf>=9A0ZAOB@W9Lhi$vaVW&R_$u4r(=}#y5gB*7H(?vMsa9w|h!$*k2 zM~TBSa@c(8I;>e?l(41_7GeG9rm>Kt&arydo`N}hdJ#D~3ePMnAvOjw5)UznaLCap z=y1-)LE@bC6An2#LAXi6S>HQxbd)$6C5}#59JS{RIqLMMkL)5xo&JQ#ALOXhpF@O0 zj@I>uI66ukjS@#EfTO1vY*BUQt!*s`bJJ~cEdbBv?+QUoWzR#q^}S44T{&J99@ceLffrSA@QaWx6)ir@`1w)nos z&D`e&Azblz!>DJ0{QD4xda8myZ3SsihrEJV6#~!(!3osCq_-+SUks4!{=T#CpzptY z&(kafF@EvFy|-NmTDk+^w^#7p|MROGu4BR?#w-661i|%yeGlXM8r*`fDtfIL;Uy7c zn&>qb1dQ-ztT<)K=L?GvGc*igUgg}oFb>4g=iYVigV>}|OfPvnVRwTk-Qa!c+s{ZV z>jJv~9L!!9Ts|hcK7p9Qgt%l0Vkz^^b!QIlaNrIHEDrFT=?%Ks2TK|jT-vZ$xn__r zV8RoM&lape@?W1+b-bOSc35_;lBPOUZwW}2N^#5yNtC9LMn~m{9D%>XrNMNmR50!v z9)?P~9FyaEZul`Zq3L-mcckYE<&rj>%`W7I3Pt@HEnbp}dVFY71`E>YL}Vl?O~$4o z5p`rbIx#UiIvtyePEMJ9qd_9Zsl`$C;NRivdjFOfEr*Mgwn`< z;?T>a{}_`wHRYIf-^i}^)F|QhG17axw(Dc0|EjcIpC|qA9d;H;f3bgi|K~~nVQ9Pl zhorym(7$QvZ&oo!QCt3ORc=;s3$HB!YvOKJrCKH2KS+9qnwdN3-`+w0_Z{?M0WIIG zI_*EbgZ_Iv=>NEb{=*&gAMc?57UZOu0FjL&$>j%qy*yzQ^!4(D3hL|R37MSNOB$0Q z6$%MfNW10Ba#}UgjDDlORAh2dPpBo8$tCSf$?A=XQ&4N!q7ENiA_1QplVgU#Wc%1f zjvF?Ds+d}ZRB|R)&OskoK`@2QFsoZqV<6KojD2TIraY4=0V0!Uamcu4@^Ipk)U#Rr zG>)XiVoF|DtaV@{I?j~SY51Czmrf6rhcuqCuv8Ieg$CMFA|FKog1 z%xJqGuO_E&WyN>xsFi_Vh{yK^uudY3co9z-c8vr=W3J!w4<-h){d>X|wRqV&5ZMz7 zjt4V{~D z9Mlg-Y^L8FTap}Z!|GDe~EG)fScpk5D$>TLXp0zFU3dqB7p^lHg^?>B432n7y zrnUn2!*C&<$CLbAY3u(9`2QntoWp@}!s0Vnh3`XnFaU-7&*M=(e|$gxBJ@EW#}^3C z|8L<14nSj29K*9ypxcIj$m+&8-r3(-$2iETLkk>lTf6n!*;j=orw)7X7QaCF2!d(~ z$4@(q^D38T0kie)Yw*7yf?uUZoNo*b`E+vrp9dYE`x-b)Q(X9RtrG6}xLo}r+|#CA`2PaNsA&WM literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Tensor.co new file mode 100755 index 0000000000000000000000000000000000000000..667675008a59b20f7437b736829a6f42199e00b8 GIT binary patch literal 8640 zcmeHMU2GiH6}~&WJNx7H#y@MvcI;$WOls0(yz9U0Qq&qHDHKdGpgvHc-p{@~r6 z?aZu6D%BYqCj<$Z5cLPtKD3o66kr_b&FZ zhK0Q5u4d0Y=bm%!nR~yv_s*H~XBNIK_s-cHV9N4E-N-#%Y-8p_uo{M0*KWUl+0jH3DS`kS(n&l{R)QnRk8+V6@LJ*QM&mGwfw)GYFfY?gnlh2M~ul}baS zOXXUA(^a9lMxme?U*I5os}3O0TZ>@mRvk#U>R{pv9X!LwJLRedc8~XKxpUMR{g&2D zM_QTB(|SiP8hWEnO|4kfY8EY&&uV#!BTfG3U? z0>K| zyB>J|aUJUO!>Yj6;d;c?xC=STzw5n-y zsxs&4v~yN2maTg-c_o*g&1Gg&S!HH+YHBW%o=s=dv)QR^DytThypql5ax*7C$L+nW zZJzBv?Nqx!81lVE)?u*teq;N@9YnbzJzx9h^!VsDuguac2U^yieFk0ks3d z&J55Upc$Y(VsCE~;pq9D=eKPkykZMcxMFa{;p%~_53U4UgDd{E#D?f6I|;D!uXS(S z4|UwOUl0OE*9AdBUw+be;{r@_45)n+=l8;2{lfxTCqv`@VX@S|8sG58<5$GEXou|3 z8tSd1-tbUKxDelnLwhtHwY%)DwHL_9(KxiPLVGv(=^jHrqtb=Z(eduFky4ZVVhfQb zlAmsc$j`Qd{o)mIb#Mdkr4Q@2xDgY@D`7EgN9@R2ll;=r50jU+BIM<*{yx+h*@&%) z8=^3}Vv~zoHhFfdNiJ=PA;C|2p1cYDhI)_&E`-S94{rK1KH6>lH}_^Cmv6d1hY#(+i_-hg4qo)y8&pEncXC_ z3y^=lJMQaCAUziBON4|JH$8IA$F4t!xId{#~6l>I~aETB8I*3^fSMRVQ)NxEFQ$LH=cvc4l&#|9){rw zhT&rj!^aWB_Tt~-q$l zRtDPcb+9FYZ1fvRatF}E-$HD;c8D#+0NNoY(Qbg*A+~VMMLS$`(GIaS!muSVY$X}C z#vN?Aei2*Vc>0-N#FjUnK^6~U%Nx%@W{22n8xO;l#ITiQ*cu0H)d5p6|Hw*U{@S~N zVPQxFDgq4y4Fin=jRI}Lo=0Hw!hL&n+}*E*kU*XhgmJ%LDB)fjAi*1fL-@Wqb0Yv7 zeh~N|@Il~%z>C0(z>C0(z=wbj0UrWB1bi6yFz{jE!@x&?j{qM5J_39c_$csE;G@8I z0pA6D7w}!cUk-~>GdcvPfMyfU5EGYULsByycKBE%JaM@vDmD9J4j=4}PFzmJq~>7U z;l%^7iD2NRpFAa9+=`J)fT0Ke?(;{ zoN}EzW_*i4vQAPldm^zHdueJ)PRS{F50xfLmQ^?JnVbZs)X2%IUY&eQ$!mJe$+GCx zdc)EtE0yKyiMpXbrKy%==<11+GI)@t=2Gby>0~aGN-62=%-r16R5q8HIhj>XX7U=m z&dyBd)#+(7<0DIrX3_MHKvz_JDU?eb*rgS=gy zECPPo<;jR#(=Cn2kPL+MB9V)=hFnt25@G)+w+td1dS0;Jp*5o2Cg9Q?MmeJFBTx z#au#;rn4!U-m5;>QBJScs8Y;0EYk$U7*du8Z>+q>RA{rBoB(Bze&cBl5s%-(xG;r3aNbIx*l z9_;ztW52wl4!X5VtP#f>rd{ZaR0Bf^V`e2g;3}Ha`B5$X=nc^e*A(W Jdm6p={|87`-{AlN literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Token.co new file mode 100755 index 0000000000000000000000000000000000000000..4f8595aa3b1763e34a3fa15e7c8a64752235cf18 GIT binary patch literal 8712 zcmeHMU2Gf25ndjt#yDVb#ymvQYAMNxlg&?>N#IEdV+2@150(d&^sk&O5U z-cfN{RG}2fanV3=j1-O0m!d_Qw25LE{c})2Vv9Z$1q#&7Q-HQ`AN<&d0Bw=Jgq^vY zrNuK9)A6fMIL^-Oe6zEA-`$?(UU+2TQNd=LcQEs?v+K-8Z4~p={^N+*Sm;9AEOWze zFFU|QFe7%}?7l^QM8UXpksq4*m}F`d+>uMa*^M|zfiZ3mns!FA-ec-qvOPG=wLU=o zQ?0UWs%+jv4GS(dXxd|swo3B8xn_hDpre!V__%yJPu49V6 zIg1Imk^i=vb}sofbAtIg^o2n_eEQ^Li(g^v#dnoLI;$ztvRWysdF|DIu)en?B~z}* zsZzD5``?h1N=Cc1@vfw#(-l?IWUZ{E)SqVZrKFO7Qz|VjX{yfNkhI*lRM*?mijuFY z@^Y@2-Zow2WOZptt=uMn>uv);qIZ`;@NNTy?>0d6wgFz`>m4&)9edaNt>p8vvHDG~ zL`Pn;pXBwPl&O@eWm!`*1+}QlOS!XZTE>-Tzcx4n>}Yl>`CO*x`qeFVbbsh)QeG)$ zs!B$E>&dcOJh2cyaiwXJSJaA@D-|ya?5b2y&dN)9MVFr|RhDUZi123zu&k&Q)DP~H zzdkWlD5-KrEfnOWqNlPGiz`pd__w(9bWG0Wa=(?zmE4M=tIh28r2e*~H5necVk%lW zm(QD|XKzZ{^Mzz7|G|+CMv|7(SEXt(w^XVWjFjb@Iop0ARW((U)6YZVxsqm#@t>J97i?;A&jx?tnb<;Tx2iE+7a+$n}4+r32cJ}$LvF5wtvmH zW%v0miaxRCtU1@wZv*{?f>~kHx8;L=kIz%!428yeMuxKu_JbW{oycC>akB64boGlD#kIjLoL3*VYvPty6fe3&SIu2> zuQ%8a4SyGVeaFo%?ezDd&+t~?y6LwegvP{BC|eJWW^3%+PK~{^(_rU!M5kb9{xjDh z2d5vUW7EkNzj58}7e$obfG9RZv4)#-!|ANmV14uhu(s^GJN@Up7rg5)uerUB-FMi= zt`WyYal8~KNO3~#_Qk40GT_5$=RuJ%ITQLr6( zGQvIwGWv9c1qc^_zGH+tPPhTW4M3knxN*Wc*gxMNv-Jf~9_{K2xIiAWMGPr*MFM>k zx35h;{S>#qO+JH!8*G!$LBbu}FCRDM(?j{VDW9H5;L}{2AmtKlGlwC{WvI;@LWB#o znZpR-M)sRSFU?_q=Fm%X7>ERhDeiEad=63ELv8XICERG6d=3-t@P7FWQ9dJ-&k*G^ z5(%uk-D``D^2HLn*q%yxgmHa{hYgTT{ygToQvnfFkH-3Jo6+`D}W`W6fhN5MAs zM1(y661g1MHgU*djX2~t;*h%$;&7BWe3UqR%;2yY7ddRrr=Q{? zhpqVxQa;FGYd!}Fha7I34{4xGBOJ_d9VkMjVY3 zM-v7|am^N2k)zgp4pCg>s5PHa$_F`W&F3)TkfUw$A&$m~qjBPB0yuh_t?#y5H@`7n z$lzL#=060|cNn>rfH}+>TpI=3@Jxgq1{wN1=UQkLxrWyr*A01yxBPF3qcX2_=pg6fpk4C1U(=C+JSlU7))_cY*E#-3_`M zbT{a3&^@4gK=**|0lgdaZqU0y?*{#XOAI$WL3lf8G~hiWdchkEH+(Kb_qtut3w}?y z(dRYvt{zYHLckkt4EhXRJm8IXIbQ$Hv*B|)UUnXMbn5RmyN7KEjx4qXhll-$x4W<{ zWZ`|s=(C9)CfYuRcZs!K2Jb)j&Ur^ zVtP6rpG(drX6BTc*|ZA(4ksqlsmaOIq>`LVPCdXDaz&^!J{ZP?78XuY8sX@8zEsSp z6}hbFS*-BLT1nQkPywox3!0o#in3NMmrE5LJW|U_CRfZrzbdEIWHlq_ic2LKsxPtv z1)P#5L!L{yysC|Bl~haShH?k#HW#pc!>nQ4teyR}?#gY1@c z*g<1~K5SURu+#h4*Y~ks-p77pA3Kx>;J9_%C-<>$?qk2Sk9~U|dlU;>f{WGZI|ZNj z@4&xZo$UtL+tt~;5T{+8jY-9lt}+RViBO(oQl?myvWk{v^ow@A!lX(mt>_AqboH!m zcw^}i%xb<;f*UBEhTDfpNljyt`58b;X(obHEKWiZHdm+?zz3cPSo&pH6V;U@$Se$F zwoJ+7XLC9raz!RTmxWtTPO5N^%9E3EImS(DG0hFhq+Be?#*IEPIm1fnw7e`Q^QF`> zlUw@c7};e&Gg&>WrV5I-jG8<#7n3Ji{S)zy%7VpeLB;}X$IkdUq*LdlTEPPO9UzQ_ zNJAOBQpu^EjkBk;ZPwyDsfHSb=#^BeqnQ}aMiE|1Rj|O`5enFdrEVi9SOhF~cZ{p3 zsjBe}pkoAlW6A+ahYcd1XIK<3SB;Mc%?oU?8Q0Nb>-0)>l+|qt&hp8xed*!TkDuTV zhP#yb_`?#t@i+o${T|4_|M6k?dji)Hi+dHv^Y;-hze$ex?H0%4^=GUx<8c7a|27&W zj_2CckjQ@_Z$jicsWNOUu`vU*?y<#@F6Nn7<#< dd)%eHMriYKx%pLSw8OuOAJO2Luyk7S{|DVa?R)?L literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co new file mode 100755 index 0000000000000000000000000000000000000000..3ce116e50612a9202ea69f635b3f9ad379333aa9 GIT binary patch literal 8640 zcmeHMU5p#m6+Rw&$M#|;_WmT^&2D0hNtck!djGO(pv5Rj0kM+B9* zaL8-M8lQX5J>R)!?tF9aoik@n%zjhwcrre44li5<4=a<%u(CDm6lS|fHv>WZkHT&c zQBBy4)BHL4DRahUfZ36n_sLY-1vlg};xrQuLjd6Vn4bq zrwUd#E4XkUGu+1~oOUjm^|di$H~||v>5uozFXyp;m^m7kPUGbF>QTf|f7g3ZUv$n; zz-{EGUPsR*zh{0_{~g=nDDQjt`2BO=26*ZXrIITeiZrj)t6JH3?yu1EnxqtJby?LL zRWtIUq|^(>3oCC(N-kH|3_~_*imLs-P}Z|b`6Wru=MBw-7bTDRk`BDc@@$pVMroOG?{c`rSY>$4+YO*7% znJ=<>TPoD`Mol)fLPe{Za=vs*%gHp-e~!bSY}Oz@TFpwiRHz32_&GfR?s-KjE7d|n zDQGW0Qq!u(X5+^$wN&zgRyRs|^`ZcmrHXP&&X*NaKB?E|Spd|f4|!HqD%uCP$=@5B zs_2?r&?*%@hWQv^Dc>tQFMi5fpUd^Lyk-A%7}8InEzvKAIjj zM>sRW_J5j(db~I*uzC22pwoUkil|qFO`0(9$k_h?vnZp`4SXKu^B75f4q_R*^a()_ zNcZX+gIzDL{O;W8=xT}2TQog!EEcyNcDNl3Idouyw*_RmSgOftqm+{;b80$2ou3+? zn9gcyDz&ZO>#0bEl6lumPRWiRX=QS1eEjIl*wk2hY$`pTPNmhnl2g*TY<6U~HtnnQ z37eBtM>&J?90+j_SR0=o|_C4=_04oqndI!W}2qIUO;?#cjz0hqKTa5Iv zVq_`&On520>J5i43M;*ycpB)-X zhKC1>P59M1F-?SDtoz}Y>s?XtqPVnYm98a1WlLNQiQ>h87_fp?aJdP;w(SG({CW^x zSdT`?W^gsMEUt>e(4qzB)-5={-h>P5qF?Yr5#_`lbW zFGj>JNWwp_#E&K5c9e?aPVo$|c!pU#11z55MBj2SxHQ*X-r)RQ3Wbov zu<=_s`xQAHL!M@A4kwUjp?wLs4JC0oy7mCJ&DtCuL0$5p1l)m=xZJnq;E2P-Z^9A3 z3D?hX#9`t$;fUXaBkm>`hesHP4>1lOwmIzhB@Vme>1BS2!|r(Yuy}~W?s)bx9C5g9 zJdDF5jKhZ*hYu5nt+@|(!LHB`ao)JeO`LzefSV7&^2XNjimVMGe^NHL2J!yyI*@>U zC}R`Etr={)$L3ZXbwhV2;8v9T?dIO5hI<5rw; zE5W#xw7KQ@C2qOn>1BS2Tkd%Fuy}}D?s)bx9C52{Jd9g$#;pY7RuZ{YLr#UfgNwe* zrT2XULQF&{q70x6pzKE3jk1Yr9$2(y;~JjC)zmK*&x>MmS%@WyuLK++pT`$+XKUflmR#Zb}SuwWhEadEwjU>j~8u#Rp5 zxPBJxeWzpJedgV#H+-45-uH!t81)gra-d&`*}5pk8W6khe)aN(_daov_AdVtMZtT* zDPzCgU4F6K3zJv8m`m`nJ;zcYko4i+!4ni<$>&S@x_reh;rX9D89%o!zy;(~`9s0m z4Wj2WT>CBDA1vXuSvSv{V{nrLH#x8)2YAjd#R~L}MhAP1c5PUO^y33Worrn6Ibz-+0CdBVHm6wvJKaTZ2=WVEbT3tC;SDQ1!K zGqRz}W)br@^;*S{Ri!E$jap5wn`oiVD}_?EfbE)`)3S|%T&m`E8FMGHg6S9CkTK4D zsjL~JMqS+&xm~%Aa9ab|zTo5_uH`?tm2*G+@S6dyapZnm_~D76?fOxsf8AyOb*BHf z18jjFWcr%R&v%$U9&ef21(oUNZr!dw$@CAn^k;4TCr$g^e+%R-rl)0i>+z9Y`K0Lz z2cq@>ebU@&OXd4_(EoS`{VO}@KiWaR7vHA=KHu+yJLu2sp#S*}`j>XlCvf8eLI5oG zoq?jD;~lhbmnZuO_3iRxm?vqQCj+Ufn;J-%3dH0hNQG)cDk?@1*gxsbI!JXrrJ})YX)0Ni zMeN2YITGY2OC};pRgg~>@zImB8a|`)#KgFq;wr70W*=BO>Nk@ z`;HO#b*O|W6PJc^87P@uYuHKu)(v*)?EC1_b$X{dit0;J#q#m*e(S!6A3Wwv2sijo z(;lVP?P&<5`#T>0ZpXXf`vYDlEUi%-&-Wo*{(u?MH&^Nl-?!|8?@xAP?Z3^Y&hdO7 z!sQ6F=j|NN)IiG|`R46>-@>IE&*SIv0AlT_mO146Brc0g&xO5A@CwSW;Ym8)zeNLr zM@u@@)_~=KXT@ Tb69AHf0RCc;c=&{(T)Ej7~So%Vx3N1+c$U~7p<)!AFnR~s7 z*P&jP#~o|t+)BZ9**?*jXB!gX-4GLd;!es9z+EOZbr0Umtz zLJx?@ChWZ3d<*@UDdW`58RhaK$g(h6|fX$CSvX?UHn`6ds0yebMAMcmH&J#DxG>uEUcJjIU3gW20+r6kS zx)Wq@3;MCs=5xyDtP|OP!?qC871cROR8Q{3&zWTfxb5+C0o|z zlvXL3{>zf0XN^niZ%Ilzt*eG18)YS>{ybaIl1kxKNy}sm)r8BEk^heBenVPT3Kdme z%9qlcwuqdpWHPG$DGJhO6hMFy^B~#AiSJbpjBmIXE{KoPc^p8ffQpjgZ?qA=+C%{9$kP1pE zTT!y=Ymb-J(usx8i7O43ysYX*UMpP`;Hp$qR^&`UG3949eTf-BUAoZEN=i}v;9mKg zlhZ{_m9uKGC?^#&m782#ep062V&=)HoX_WfE0y*9vSO+Y>uwqkZ%9VN$HQ0bic!uN z3bxzBtCI0tF{u?kINU~0(o*`WR4L^%nqIU_mT%hZ_?1*KR6|Zbhlb}NBBy{8Vd+b;?voaTt@z0r$ zrh~-@J0k3TQXJ}YVpL#pc->>S--o2vDZ&l~47}#8C%^*IUg!cIPk9_8%I{$;3s?!G zF5pwkv+4K7_k6RR;sIApyncK#YLmM(UEg;Led|6Ia@@aW;W;Pm~N+i>< zM0_@$Otu8Y$h=|JvlcyJr|pc&ZRO+TA52HlkwSB z8bdboA=Wm})}L0Qv^2UMZ^Jqc8eea!4spkl6~rClRgqdMoZ=2mWfkc-(s_rl6D2{H zuoI^`(gf18AcP!H-F9?DR<}Voe0t}?c2#h%R)sFSy?A%y?Z>+h?*Y6ItU7D`8=@0- z`Vs%4;5xi6*gBlhcWoUf){C?n25`ceKRJg4ScgEwIV9%1;J>yO0`MWc-m@1n5YztO+ezv1imRl9s$)qAS<)P%FXjc#9ewWr!sdl5zs_n`hY z)ZdMIy2nU=FmxsujC7BU&yz5G0yyzC) zRZrDZtHVz$c{jYW<$+6Ey?&Az-tgCKxpg5J5re^8Ef9$G3GgvD=Pclt$!MhtJxsl`vx;Vc*@NQ?FKqRm_rBzUq0MR2IeFJW!M;&2{ zN*&>WKBl*?g+2XDZ+{DW4lvw-7WN!uxP$xI<6-vrm^~h5k1ss%zsya5*%D~+4nxeA zp%(8j!f+!k-eHvCM)!M%UiJ=y>>Ya9I}C;ghMC^s7WN!sdJna*C&+NY7WRxW+}M8h z3^98~nLR_yp3(3?&Er{Htk<@=|E_tx=)hh+ zegrlb@!S6DD)}gVzw}Mh_j}~)VHnzc3frDuB_E9;Pm;qhg*2K9!wAEXkH#5pn&Czn zj(l{6;bs|*zW1ZdM`O%K4pK%K*Nc_YjU3((W;emvL-MSn)E5yy4j}j3FI) zkm99{qj)*aaQ8FZAj46-q&Q1BinD~HaSO0`8D#NtoW;u{R=l+JQoL-or=RJic-d^v z0cH>RTC+U|8IIy*OM6(n46=AR&f?_}jF)BfE!X^&_g%+@fQVFlSO~b0x*rn)T}Zo5 z34uDUe^8~h9oPB@uC$$E?wlw_YC<4FYb34n09zk8aUU>G6gMTb_5yr=n=m!RI68&g zIRP;)cf{#Id4K88PTZN)MQ_g9*kB$13**}9$2Qz#U>()CaV^bR`%_!K>-4*1?=RnX z`Gf%V;l{FiPzYGOUbNLMc0Ks|)otf-ah3Ks{}4sN`LJEae!Dt4#V#kruQ_pJ)XAavj1e-+Lx>c1wWs=vRdg1!otCj!$qOP{qB&8opcg?tOO* z?)t!8AGmEF;5ol2E8K(OjxP2$+|HF^{W@#s=}io>*7eU%>KYYZV(l1qt?MdR%<2t- z$-YpEaxWp476`>+QdElK|JcxY&NR!$0}~Ti$yJh4N-Iu0p`=x5a~NY_%wW+l`Rip@kPD1Iz zYD!7YC8zI)g?tI~j1Pp!poN8#%#6_ZL_sTMRb4JCW{&bZvZ2Xl4)Z^HxoF5Kr6e1b za#_<<?Wp*?cLB?W&wsla;KTFJ&|tb1$-j=^M?E(auc1pc)f~p4wx%Rk;mw8w1$7 zVCOJy=1;hkJ)fO$b%=9p8nj3!tc>jCPcZ&_P4Zu0{J_{=`7bg4g(f}UV*G{BUimEJ zuY6`N|GSJ|ZsPyI;(t`9T*2<^s#W=@Zsi(wHQZqQCUfARH9#NLcZpiXQ~U61`|#i1 zhks)q{wV(E2Dr7~$M@l%*@u5=AO7Y({BhjSfZ&E5c1~fvEIxjR@~!f0H;~^d&*nuv zt@3Ojl{8ZY2~&xfoCGObsz^D-$N_s2-_$|UwX|X?AerilX-QM!5!q@%*YE>Mr}6Uv zDQOrW*~b7<%CHfvN~uXq!RCvVBFbPypwur=M$}Z2NORbYU9u_2&*V)+qDQ6B|uB3Z+p-ACibaqR9;r}t~;`@{yto^HA*EpWrj4^1Pkn85{qH+vmc(o$p^b zZN_u^Ii+RM3b{-n-!E~R;~g<-Ef>3ZpT#oqc>h%z5WE`Ht~Q?YjgnoUb(`>wZ5JOH z_M(mRY>sO(fR+iu(mT+_*A0G-T(;Sb{U9GdzW#n5+lj~V(HR@S&%OW1&U2U=%T`_{ zdJ6A7_($wkj&Cye7W@+^-Yi2JINsKF^Vuw`U{f=Xq}#8BGOja>QmJZsa5+Dsfu{%V;@qrX`izF-*k64|`7a?R%XZ<%v20VIrA3=CW!d4|JPuu8O!6=IoQZP&KHrul zTNKCpI=@}4A6Z%!eUlA0iwcR0_V_&d)Y4LTFN~Rp3t&Y(?a6lKxa|FlYw&TYx1Vxb zeG75Cz11$fE&hq~4lh&wU2s{-ZN`WHHr7Q@KKR70ukZN|W6xjL@rlik2?gYO+wWT=ru{v(wJY)7KTvFl^ItRHvvV&8z8LLDzDxD1}tYF}a(Qoiy98 zDdSqMWU7a=mhpz~qUxno%CtX4p#4(?o|kQP_^vIB+nUv%RlSyLUK{K3>ii_H_ms3< zC>2%5Oy^C@Ra4ms(@^>EWH-F=!pPKCYPoFMYX9jSoV#y%Rmo{qx}>Ge*>4t2>*0~e z!*AD+I&RudwqVT!*i9v`O{l4y=Bm#W?8BmYDEzJkBW!7T^ZonO1F^w;!Bo>`K94S0 zT28f1qm(pNJ=Hh3XZ+i0HkU>B=d(%Gakb>(Ut-JEIxzOOZ!GS> zo60lVxS6sG7<4{=_tpUvF23+4&d!P#%;KKRAS z^N(xSezf~<^0?)llCg5?;$gL=u3=|&d12!)Rn26JYO<6y)WK9DWg0_6eeu4Sp6DN{ zHMyFKlFqt2xd9BW-y4zF8R4zek*_2tm|^)rFc$#SSQG_yVwE{Cyx-P#wQ4_DWog*XdW zLo>V#hJxj`wQW-#k1(I3Y!9`U*Q{MLRb}71#aMs^S>;w6d+FA?@C)T?>jp2j%D(Td zYiDO}tzj>F>snvjbb6Wvx~5r6S5tHFDC;@47#4(PV6wTJBYoJ*qy}Q zLhJ~!yNKPrU{?cy3_tON*M;WV+o$Jhecry{gxGr) zY}x0@yH3qj+Go3}?XPWRtXi#vUhk@guKV@yTV2)gJAQrryIs}wH~f0X&8}+4&%FA+ z;dlPH@4Y~0Wdqm-pX~?R&)G(>jXoO%8|7>h*e0JH06V~0Cs-%Z+YGju=xqVpLiFwd zyNBpSz#>HN%V1w7dR<^$L~kqDR-)Gp)=l)bfo&st+rhRIJq1i5dQq?_(Tjn_h+aQf zKhYZm8zg#husG4%3AU5y-4Av@(R%>w0irhyHca#$0(*$)?E>3H^d13wgy=mA_9)SN z4D2zYHv%?7^mc>oCVG3o_7J_jV0($)KCpd6Z$H@nrFvEN!Yx+D-p`KT3bqBBTW5oT z*4JhO2iaf$(ERFV*7`c^>$8EUr2Q7`cV+`mOZ#2eH)aE>uvy2E+ku0v>SZM4|bmDT>!g4^e%#3Bzl*?E)l)UV3&#B6|gHr?<&~U zrFx&)@!z-}Xj~68t_M0RpY!X1&+Pbboc|l=|Hk>hv(h;K`}Z&!_X8UD1Ni-q#{GcK z%ID~QK;!)1`2SzKU)lKo-&tw=|NrmY|N9^K|L;8)h%~Xra{+RnXe;g$ZC!ewsL~DA zcrNgN`&^)L9kAN#fKfaLII!?s@JU?T>R`Ihyyt?P8DNIbyz?w)DX^5!yyt?PWxz5- z?-1A_qL%~95j_jcB6>xzBGI$KY@+9axkPUqY@Fy#fK3p+Nw7(xcLeMR(K`xul<1Yg z%0zDpY>MbjgH02?<6y^$-U+Z1MDHZnNuqZO>=e;E4R)I7Rlq7lZw73J=$!#OL-fvq zoh5qbz|Il9^I+$R-UYA=MDHTlMWS~J>=Mzt40f65T>-m7^sa(kU8?6F|D%EJ{JGwf z@;TsGpoiNUZ0|YXSU{1s_Z)C6&?{{3IpFq%=YYq;;qpiSoSG}*nLA_0@F_PwjWKBK z1C4#a+XsS3ZQ@JROA2Oet#2}(v&9#I*=CU)r(b?JC3nNp~m1H5`yH_*Jg5_mu7V^cCYxd@H8S- z$7-H_<%eGJVU=L^Yr%;O`*;QaL2n@L)XA^FFRZ}-v;tr6WwG9=|IiA2as_^R1%7!2 zelJf>4YafQk)mnj+F*Tb~krr>Nv1nY4GF8`AtDt(vC=PgkwPnb)&t!1i zsk(^+RE_l~)IO&A$mBjgSfUWGGmFu)ekKReznw!kj8He4E=c;=>!*K9y^;JaLX0qg^ z@~ z@oA60o8nDFDCv7G@;w%*FVFYV=CS;oBje?Dg)GN}BfmW0ZSkCOzR3CgF4RA&Rtqv- zURTJn+w)$~m+=C#FQUS=WxTu&ktL0n{g>sJ5bI5~LQr1!$WmVSh}GV2kab@2_Z9f> zlIpi-O0?>oWO1gnVDC_Fk5ZTXqyBakC)vvLF=-H+8#)mKE8Sf}&7c&rLEXalI_fvLH94NJ+FM zQXnbC34%m%-OLNpKylM-&C?#XwL{Pr*%w-Dh>bmUzy|E&se35$vd29X*oF-ork%@s zuWX;%jWVpju!(?t?)lww{=B;6B`Nw`{NnhS($<#lAbe{l_eh&)Gm{qWn#vpFos1>P z2Kf6B*+N2qS#Fwlf6V?>h)LTm;ux0u6lk^S60Wp8+-G?(1VSYLEaybnrH8pMEm@Jr z#=5jy^pCB!CGL{+2GKxyvBx-%y|~(n=;b*Rc?xtivp>0CIWLoc@C`OE&5lzZs}~@T z?e9T1wlDTcvH`DQ|0KAy@|dydzXyF$u+JPCeSYFAguL{&Ry6XSrcRr#Z5F)iACj%N zRV`O?X~rqrKKZVyxjFB(xwlo#FkI8~s8`Z5=J#?1N7o8(s7^NPnJmn+&Y9h})rwXq zn{?W;jkkCR)yvte>E1`6`@RA%$$jB0>>d<;)d4eXYen;eN9l?9aM3Yo&MX#TNVZm>u4$Ar zCe^cp!xNP+Q_He`s+L@6cqOY)5QqE?)jM0%ox%tGP3=pa zHr`dswv~0E)-UEhx;OyPACzqW`CF1gAmNVKWRsx*xYiw*CZf%EEA+AGL zKhJ~MW#ggt5Lpt7GtDuXV4Q@Byq?H)%&h$EV6oo;w@T8C{c`ccpIy22CGE)4^xx&W zWrJ#GacgXWW>eEnnu>yhLnzH#C7LN)1|1n3iKRvogM+c5cs!X(tucc8BQxq$1klQKx(&GBNR_JE{{(`(i=3v6) z^|sm;S_)#7&=OBiTvoYUsseoyQxGrg%{^T zbBm2J^)1H)W9mo#lgN)DKY{!t@}EL}dfAs_n%Q!7vDQ7`SMR>En~-|F7QWh74`1W$ zNUg6PS>Wx>H~Q+EZ}N7}xBBWmw}SR)`Uk%oeO>9TZ3VWKvoT;}jBNw9jkEp0_A|B} z*mlki0z1f9FR)&uw*%M?q_-2;PNcUB*e;|O1r|kmj{tiF>Gc8YLwdV`?M8b2!1|Hi z6TqH8dQSp-66vYHRHPRJ7DIY*U~#0E0G2>{!@!1-UJ_Um>Fooy59vJx>?x%8G_a?U zUK&^$={*bVS)?}#Y!vB@0UKMbS0^vuB~>`P$mP4CE~UM5KBRPBnODZiAAi&SjUA-( zD)`stl|$06fxj@X9G3nK@Ndp5M}$v$&finUJFE4Mj$mCi*3*B%8V}afeppZY#d*=6aPvfwj#>IL%4D0EzSWgo$XA@|?CxK0(`92ElD4OqMz>ck+@7i%-$C2I%U?-5? zNnj_D-YHkQ3dY6G+Uaj}3 zt)JHYsC7S*_w)bZ`_ZSiep>e*zJ6NwpWa&Q{`26^Z>{@J>;BWa|B%ks@1NdU>-SG@ zt@Zn-xAuSb`zLrWcaM75659_d{Gb^+;CfmM;-EU;OmHwSDE>0JhPd9@y2Kl_zF_Fnpce2+b$Jj(nR zzz?3|CY0UM58h)>D31w0c#qw;{2u#KBvP&XZFaE)|C1r)5`3zyPiqcZ=RoTmX!;xo z5x&KOqA)@>3ZMPYo^1k?d!iBMm=uKl^FQkjYt}@6hv4tZMBz6K6gx#TtO6%03PYmt zxEfPqP{$DM&-;GK+cz)(PQI+G8K*cfsTrnY2bDTbu~hcWfkL5D+*5L$DKq0oT_>|= zLqyVSWFvCC5DIN@syrSjHI-Yq+voiL}IUz*}FHhSJPAa(Bov>vZ2mu zUz81Ke0)?4BicVuaO|Av(vs%qS(Oj<9O~zx3dk)LJ(|&M>Xl0+$Mqp1Gp*$;I|uzH zHB7ynqn4d@C{*824N7<%kHR>!R>AZJJU6pyVGf+HF7 zk9Fb`!9;vi$2!yN@LTKf->$KZ5wR z>ZX1Ve66~vcF& zvV6#}Y(lLpg|bMK#$!nuBUIO^?a*M24iE8O+mM}S^04fvZo>Mc@kELa63UUuVmMeS zkZQ7q%PtowtC)L$a8|+De7J5x2_a=wfz9UxlVDI?b6v}9Hg82$A}^$Du81wDA*^IF zO^z-I52`4=jH~&Xe3N-_^07+AU}P{ItXA;>GF>xM4hlIRAP351Ey(lX8eb?7R@Yc6 z2Q{FLLylN*ToG&jxHUPl`_v55(XSqU=FoxXR$mqVeUDxCSZ#7}83K#znq?ic+=IN| zOP}SkdydSP&lS?17J=;XfbEOb46;pL-?u>fFZH@3^W}4ewEaQ!vcAk0n0yWzOk3v5 z=MZUez8t@_4?}KnsTG3qxkp<0+#|Y!?;zXEvU*kaz2yF(urb&gmb;C==|+n;2Eed< z`CQhxpRfx|)?`7iQbnUCYKKMCQOF$I~=)t3L_SPi-`W6{m#S44h{C9T+GehP_8 zSK5n^Bai)xbW##0-f_{%=6gGQ Date: Mon, 25 May 2026 08:08:37 +0000 Subject: [PATCH 2/2] align to pyhip tag `PR_aiter_3309` --- .../hunyuan3_fp8_per_tensor_tuned_fmoe.csv | 36 ++++++++-------- .../qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv | 30 +++++++------ aiter/fused_moe_asmjit_aot.py | 40 ++++++++++++++---- ...nt_type_w=QuantType.per_Token-dyn=False.co | Bin 0 -> 19032 bytes ...ant_type_w=QuantType.per_Token-dyn=True.co | Bin 0 -> 11816 bytes ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 19384 -> 0 bytes ...nt_type_w=QuantType.per_Token-dyn=False.co | Bin 0 -> 11448 bytes ...ant_type_w=QuantType.per_Token-dyn=True.co | Bin 0 -> 11816 bytes ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 11800 -> 0 bytes ...t_type_w=QuantType.per_Tensor-dyn=False.co | Bin 0 -> 20048 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 20408 -> 0 bytes ...t_type_w=QuantType.per_Tensor-dyn=False.co | Bin 0 -> 11824 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 12176 -> 0 bytes ...LOCK_N=1024-atomic_write=False-STAGES=3.co | Bin 13624 -> 13624 bytes ...BLOCK_N=1024-atomic_write=True-STAGES=3.co | Bin 13944 -> 0 bytes ...nt_type_w=QuantType.per_Token-dyn=False.co | Bin 0 -> 17824 bytes ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 18136 -> 0 bytes ...ant_type_w=QuantType.per_Token-dyn=True.co | Bin 0 -> 26184 bytes ...nt_type_w=QuantType.per_Token-dyn=False.co | Bin 0 -> 10920 bytes ..._N=128-quant_type_w=QuantType.per_Token.co | Bin 11272 -> 0 bytes ...ant_type_w=QuantType.per_Token-dyn=True.co | Bin 0 -> 14224 bytes ...t_type_w=QuantType.per_Tensor-dyn=False.co | Bin 0 -> 17688 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 18056 -> 0 bytes ...t_type_w=QuantType.per_Tensor-dyn=False.co | Bin 0 -> 10848 bytes ...N=128-quant_type_w=QuantType.per_Tensor.co | Bin 11264 -> 0 bytes ...TILE_SIZE_N=64-quant_type_str=per_Token.co | Bin 8456 -> 8456 bytes ...ILE_SIZE_N=64-quant_type_str=per_Tensor.co | Bin 9992 -> 9992 bytes ...ith_silu=True-quant_type_str=per_Tensor.co | Bin 9160 -> 9160 bytes ...with_silu=True-quant_type_str=per_Token.co | Bin 9160 -> 9160 bytes ...th_silu=False-quant_type_str=per_Tensor.co | Bin 8640 -> 8640 bytes ...ith_silu=False-quant_type_str=per_Token.co | Bin 8712 -> 8712 bytes ...ith_silu=True-quant_type_str=per_Tensor.co | Bin 8640 -> 8640 bytes ...with_silu=True-quant_type_str=per_Token.co | Bin 8712 -> 8712 bytes ..._gemm_final_reduce_bf16-TOPK=10-OC=4096.co | Bin 14544 -> 14544 bytes ...e_gemm_final_reduce_bf16-TOPK=8-OC=4096.co | Bin 13008 -> 13008 bytes 35 files changed, 66 insertions(+), 40 deletions(-) create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=True.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=True.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=True-STAGES=3.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=256-quant_type_w=QuantType.per_Token-dyn=True.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=256-quant_type_w=QuantType.per_Token-dyn=True.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co create mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co delete mode 100755 hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co diff --git a/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv b/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv index 9b46beaee0..917e03da47 100644 --- a/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv +++ b/aiter/configs/model_configs/hunyuan3_fp8_per_tensor_tuned_fmoe.csv @@ -1,19 +1,19 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag -80,1,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,15.358300000000042,fused_moe_asmjit_aot__16_True_False,5.93%,0.0,,0%,15.358300000000042,1,0,2.46,29495.26, -80,2,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,30.061942528735862,fused_moe_asmjit_aot__16_True_False,4.43%,0.0,,0%,30.061942528735862,1,0,2.51,15069.2, -80,4,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,42.2820199999999,fused_moe_asmjit_aot__16_True_False,3.32%,0.0,,0%,42.2820199999999,1,0,3.57,10714.58, -80,8,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,62.29127999999991,fused_moe_asmjit_aot__16_True_False,3.73%,0.0,,0%,62.29127999999991,1,0,4.85,7273.62, -80,16,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,94.30098999999973,fused_moe_asmjit_aot__16_True_False,3.65%,0.0,,0%,94.30098999999973,1,0,6.4,4805.69, -80,32,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,131.00703999999988,fused_moe_asmjit_aot__16_True_False,3.47%,0.0,,0%,131.00703999999988,1,0,9.22,3460.72, -80,64,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,226.26340000000016,fused_moe_asmjit_aot__64_True_True,2.21%,0.0,,0%,226.26340000000016,1,0,10.68,2005.5, -80,128,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,242.56287755102025,fused_moe_asmjit_aot__64_True_True,2.52%,0.0,,0%,242.56287755102025,1,0,19.92,1873.98, -80,256,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,249.7409595959585,fused_moe_asmjit_aot__64_True_True,2.73%,0.0,,0%,249.7409595959585,1,0,38.69,1826.41, -80,512,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,268.0743711340202,fused_moe_asmjit_aot__64_True_True,2.53%,0.0,,0%,268.0743711340202,1,0,72.1,1713.24, -80,1024,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,310.66775000000007,fused_moe_asmjit_aot__64_True_True,2.35%,0.0,,0%,310.66775000000007,1,0,124.42,1498.6, -80,2048,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,564.0819578947369,fused_moe_asmjit_aot__64_True_True,2.04%,0.0,,0%,564.0819578947369,1,0,137.05,847.66, -80,4096,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,906.8664583333366,fused_moe_asmjit_aot__64_True_True,2.14%,0.0,,0%,906.8664583333366,1,0,170.5,555.01, -80,8192,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,1658.349010204088,fused_moe_asmjit_aot__64_True_True,2.20%,0.0,,0%,1658.349010204088,1,0,186.47,333.86, -80,16384,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,3156.37568686868,fused_moe_asmjit_aot__64_True_True,2.19%,0.0,,0%,3156.37568686868,1,0,195.94,207.3, -80,32768,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,6262.433448979595,fused_moe_asmjit_aot__64_True_True,2.24%,0.0,,0%,6262.433448979595,1,0,197.52,136.63, -80,65536,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,12425.184968085092,fused_moe_asmjit_aot__64_True_True,2.28%,0.0,,0%,12425.184968085092,1,0,199.1,101.27, -80,131072,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,24933.576175257756,fused_moe_asmjit_aot__64_True_True,2.28%,0.0,,0%,24933.576175257756,1,0,198.44,82.76, +80,1,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,15.986867346938894,fused_moe_asmjit_aot__16_True_False_False,6.03%,0.0,,0%,15.986867346938894,1,0,2.36,28335.58, +80,2,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,31.46978571428563,fused_moe_asmjit_aot__16_True_False_False,4.33%,0.0,,0%,31.46978571428563,1,0,2.4,14395.06, +80,4,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,42.93495959595968,fused_moe_asmjit_aot__16_True_False_False,3.25%,0.0,,0%,42.93495959595968,1,0,3.52,10551.63, +80,8,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,60.95569000000001,fused_moe_asmjit_aot__16_True_False_False,3.65%,0.0,,0%,60.95569000000001,1,0,4.95,7432.99, +80,16,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,90.33211111111105,fused_moe_asmjit_aot__16_True_False_False,3.68%,0.0,,0%,90.33211111111105,1,0,6.69,5016.84, +80,32,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,138.59771717171733,fused_moe_asmjit_aot__16_True_False_False,3.49%,0.0,,0%,138.59771717171733,1,0,8.72,3271.18, +80,64,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,223.5444946236561,fused_moe_asmjit_aot__64_True_True_False,2.21%,0.0,,0%,223.5444946236561,1,0,10.81,2029.89, +80,128,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,240.1024555555547,fused_moe_asmjit_aot__64_True_True_False,2.52%,0.0,,0%,240.1024555555547,1,0,20.12,1893.18, +80,256,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,247.60494117647124,fused_moe_asmjit_aot__64_True_True_False,2.73%,0.0,,0%,247.60494117647124,1,0,39.03,1842.17, +80,512,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,284.450091836734,fused_moe_asmjit_aot__128_True_True_False,2.53%,0.0,,0%,284.450091836734,1,0,67.95,1614.61, +80,1024,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,307.1181894736858,fused_moe_asmjit_aot__64_True_True_False,2.35%,0.0,,0%,307.1181894736858,1,0,125.86,1515.92, +80,2048,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,576.2535684210527,fused_moe_asmjit_aot__128_True_True_False,2.04%,0.0,,0%,576.2535684210527,1,0,134.16,829.76, +80,4096,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,1083.558147368421,fused_moe_asmjit_aot__128_True_True_False,2.14%,0.0,,0%,1083.558147368421,1,0,142.7,464.5, +80,8192,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,1696.2595900000003,fused_moe_asmjit_aot__128_True_True_False,2.20%,0.0,,0%,1696.2595900000003,1,0,182.31,326.39, +80,16384,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,3294.693249999998,fused_moe_asmjit_aot__128_True_True_False,2.19%,0.0,,0%,3294.693249999998,1,0,187.72,198.6, +80,32768,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,6371.982838709681,fused_moe_asmjit_aot__128_True_True_False,2.24%,0.0,,0%,6371.982838709681,1,0,194.12,134.28, +80,65536,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,12793.395297872323,fused_moe_asmjit_aot__128_True_True_False,2.28%,0.0,,0%,12793.395297872323,1,0,193.37,98.35, +80,131072,4096,192,192,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,16,0,25726.181630952433,fused_moe_asmjit_aot__128_True_True_False,2.28%,0.0,,0%,25726.181630952433,1,0,192.33,80.21, diff --git a/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv b/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv index 46671db841..4635279cbe 100644 --- a/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv +++ b/aiter/configs/model_configs/qwen3_5_397b_fp8_ptpc_tuned_fmoe.csv @@ -1,14 +1,18 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag -80,1,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,13.771117021276607,fused_moe_asmjit_aot__16_True_False,0.42%,0.0,,0%,13.771117021276607,1,0,2.28,58478.82, -80,2,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,34.621979999999716,fused_moe_asmjit_aot__16_True_False,0.54%,0.0,,0%,34.621979999999716,1,0,1.82,23260.68, -80,4,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,47.58002000000011,fused_moe_asmjit_aot__16_True_False,0.58%,0.0,,0%,47.58002000000011,1,0,2.64,16926.34, -80,8,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,67.1539799999998,fused_moe_asmjit_aot__16_True_False,0.76%,0.0,,0%,67.1539799999998,1,0,3.75,11993.4, -80,16,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,101.03224000000053,fused_moe_asmjit_aot__16_True_False,0.24%,0.0,,0%,101.03224000000053,1,0,4.98,7972.73, -80,32,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,142.52629999999968,fused_moe_asmjit_aot__16_True_False,0.26%,0.0,,0%,142.52629999999968,1,0,7.06,5652.99, -80,2048,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,604.5281000000002,fused_moe_asmjit_aot__64_True_True,1.68%,0.0,,0%,604.5281000000002,1,0,106.57,1373.75, -80,4096,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,996.1099100000009,fused_moe_asmjit_aot__128_True_True,1.81%,0.0,,0%,996.1099100000009,1,0,129.35,858.98, -80,8192,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,1717.7659090909092,fused_moe_asmjit_aot__64_True_True,1.82%,0.0,,0%,1717.7659090909092,1,0,150.02,527.41, -80,16384,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,3009.697379999994,fused_moe_asmjit_aot__128_True_True,1.82%,0.0,,0%,3009.697379999994,1,0,171.25,334.46, -80,32768,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,5728.849888888895,fused_moe_asmjit_aot__128_True_True,1.83%,0.0,,0%,5728.849888888895,1,0,179.93,210.86, -80,65536,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,11492.974020618547,fused_moe_asmjit_aot__128_True_True,1.85%,0.0,,0%,11492.974020618547,1,0,179.38,140.14, -80,131072,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,22660.053193877586,fused_moe_asmjit_aot__128_True_True,1.85%,0.0,,0%,22660.053193877586,1,0,181.96,106.62, +80,1,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,13.677915789473445,fused_moe_asmjit_aot__16_True_False_False,0.42%,0.0,,0%,13.677915789473445,1,0,2.3,58877.29, +80,2,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,35.211979999999954,fused_moe_asmjit_aot__16_True_False_False,0.56%,0.0,,0%,35.211979999999954,1,0,1.79,22870.94, +80,4,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,47.743629999999555,fused_moe_asmjit_aot__16_True_False_False,0.58%,0.0,,0%,47.743629999999555,1,0,2.64,16868.33, +80,8,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,63.788901098900865,fused_moe_asmjit_aot__16_True_False_False,0.70%,0.0,,0%,63.788901098900865,1,0,3.95,12626.09, +80,16,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,92.46276842105215,fused_moe_asmjit_aot__16_True_False_False,0.24%,0.0,,0%,92.46276842105215,1,0,5.44,8711.65, +80,32,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,135.92059999999984,fused_moe_asmjit_aot__16_True_False_False,0.26%,0.0,,0%,135.92059999999984,1,0,7.41,5927.72, +80,128,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,355.59850999999986,fused_moe_asmjit_aot__64_True_True_False,2.37%,0.0,,0%,355.59850999999986,1,0,11.32,2269.07, +80,256,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,396.20228999999995,fused_moe_asmjit_aot__64_True_True_False,2.33%,0.0,,0%,396.20228999999995,1,0,20.33,2040.5, +80,512,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,411.33022448979534,fused_moe_asmjit_aot__64_True_True_False,2.01%,0.0,,0%,411.33022448979534,1,0,39.16,1973.11, +80,1024,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,446.35456701031023,fused_moe_asmjit_aot__64_True_True_False,1.87%,0.0,,0%,446.35456701031023,1,0,72.17,1832.38, +80,2048,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,576.2807849462362,fused_moe_asmjit_aot__64_True_True_False,1.68%,0.0,,0%,576.2807849462362,1,0,111.79,1441.09, +80,4096,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,993.6361145833362,fused_moe_asmjit_aot__128_True_True_False,1.81%,0.0,,0%,993.6361145833362,1,0,129.67,861.12, +80,8192,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,1695.620755102034,fused_moe_asmjit_aot__128_True_True_True,1.82%,0.0,,0%,1695.620755102034,1,0,151.98,534.3, +80,16384,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,3004.3378800000028,fused_moe_asmjit_aot__128_True_True_True,1.82%,0.0,,0%,3004.3378800000028,1,0,171.55,335.06, +80,32768,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,5748.212340425532,fused_moe_asmjit_aot__128_True_True_True,1.83%,0.0,,0%,5748.212340425532,1,0,179.32,210.15, +80,65536,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,11247.049708333347,fused_moe_asmjit_aot__128_True_True_True,1.85%,0.0,,0%,11247.049708333347,1,0,183.3,143.2, +80,131072,4096,128,512,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,16,0,22489.665710000027,fused_moe_asmjit_aot__128_True_True_True,1.85%,0.0,,0%,22489.665710000027,1,0,183.34,107.42, diff --git a/aiter/fused_moe_asmjit_aot.py b/aiter/fused_moe_asmjit_aot.py index dc9884b733..6ec3f73bf3 100644 --- a/aiter/fused_moe_asmjit_aot.py +++ b/aiter/fused_moe_asmjit_aot.py @@ -18,6 +18,7 @@ class Config: BLOCK_M: int use_down_loopn: bool use_prefill: bool + use_dyn_sched: bool def to_string(self): return ( @@ -26,6 +27,8 @@ def to_string(self): + str(self.use_down_loopn) + "_" + str(self.use_prefill) + + "_" + + str(self.use_dyn_sched) ) @classmethod @@ -36,9 +39,10 @@ def from_string(cls, data: str): def get_tune_space(): return [ - Config(16, True, False).to_string(), - Config(64, True, True).to_string(), - Config(128, True, True).to_string(), + Config(16, True, False, False).to_string(), + Config(64, True, True, False).to_string(), + Config(128, True, True, False).to_string(), + Config(128, True, True, True).to_string(), ] @@ -100,7 +104,6 @@ def fused_moe_asmjit_aot( ) if kcfgs.use_prefill: - BLOCK_TILE_SIZE_N = 128 sorted_ids, sorted_weights, sorted_expert_ids, num_valid_ids, cur_out = ( moe_sorting( topk_ids, @@ -121,9 +124,25 @@ def fused_moe_asmjit_aot( quant_dtype=w1.dtype, num_rows=None, ) + if kcfgs.use_dyn_sched: + dyn_buf1 = torch.zeros(64, dtype=torch.int32, device=hidden_states_q.device) + dyn_buf2 = torch.zeros(64, dtype=torch.int32, device=hidden_states_q.device) + grid_gate_up = torch.cuda.get_device_properties().multi_processor_count + grid_down = torch.cuda.get_device_properties().multi_processor_count * 2 # occupancy is 2 + GATEUP_BLOCK_TILE_SIZE_N = 256 + DOWN_BLOCK_TILE_SIZE_N = 128 + else: + GATEUP_BLOCK_TILE_SIZE_N = 128 + DOWN_BLOCK_TILE_SIZE_N = 128 + dyn_buf1 = None + dyn_buf2 = None + grid_gate_up = N1 // GATEUP_BLOCK_TILE_SIZE_N * sorted_expert_ids.shape[0] + grid_down = sorted_expert_ids.shape[0] + hsaco.fmoe_asmjit.moe_2stage_gateup( - [N1 // BLOCK_TILE_SIZE_N * sorted_expert_ids.shape[0]], + [grid_gate_up], [256], + dyn_buf1, hidden_states_q, w1, gemm1_out, @@ -133,14 +152,15 @@ def fused_moe_asmjit_aot( hidden_states_scale, w1_scale, B, - N1 // BLOCK_TILE_SIZE_N * sorted_expert_ids.shape[0], + N1 // GATEUP_BLOCK_TILE_SIZE_N * sorted_expert_ids.shape[0], weight_dtype=str(w1.dtype), TOPK=TOPK, K=K1, N=N1, BLOCK_TILE_SIZE_M=kcfgs.BLOCK_M, - BLOCK_TILE_SIZE_N=BLOCK_TILE_SIZE_N, + BLOCK_TILE_SIZE_N=GATEUP_BLOCK_TILE_SIZE_N, quant_type_w=f"QuantType.{qtype_str}", + dyn=kcfgs.use_dyn_sched, ) gemm1_out_q, gemm1_out_scale = quant_func( gemm1_out.view(B * TOPK, -1), @@ -152,8 +172,9 @@ def fused_moe_asmjit_aot( B, TOPK, N2, dtype=torch.bfloat16, device=gemm1_out_q.device ) hsaco.fmoe_asmjit.moe_2stage_down( - [1, sorted_expert_ids.shape[0]], + [grid_down], [256], + dyn_buf2, gemm1_out_q, w2, gemm2_out, # cur_out, @@ -171,8 +192,9 @@ def fused_moe_asmjit_aot( N=N2, with_silu=False, BLOCK_TILE_SIZE_M=kcfgs.BLOCK_M, - BLOCK_TILE_SIZE_N=BLOCK_TILE_SIZE_N, + BLOCK_TILE_SIZE_N=DOWN_BLOCK_TILE_SIZE_N, quant_type_w=f"QuantType.{qtype_str}", + dyn=kcfgs.use_dyn_sched, ) num_WG = num_CU * 4 num_tokens_wg = B // num_WG diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co new file mode 100755 index 0000000000000000000000000000000000000000..f625780018cbf670b3e12bd21a1ad374c63384b2 GIT binary patch literal 19032 zcmeHPeN>z0b$<{NA1`18wip75NJi$v0a*gXmob(x#+b;25aJMrSm+CcEF>Zc>~sl! zL$KXsiQ~9wnr3O5lYV7Qn>Ov1wQC0~?df{9t}Dq-=d3+hPTF-jTj|pEY^VP$+V8&4 z`+^WS4tCa_ofG+(pMLkb_dbuL_qp%Ax;&gYZz%2Yuu=VXT^Ff4#S^y^V<;9NnL zVn=Hg_sZjUuE!}(nW!u(&F&r9?wyldQ9;>k?LCw2R2Xc35_q$Hz zv$_R#G~ZSqG#CAEr3G(-Z_Dq8f-+^<&|L;G0S_GQK0I(6V-LLOne+$a9_z_KED{LE zAA5rpzhL!@PsQB6=yW78`Z=p7HXeU^{za?D?~euIad&*m;|qL#JRJ3U!e6jP$HwA; z1pAydKJk%2&I{HVPk1`uJ~vd3by$)>G z>!9^|9XPMoLGM)^JfK{216);|dd)rWy~izGfvZ|W@1^xk>v$|WJ>`xE#wP=jgnMk_ zbinVXEBWsR*vrzMz}3Gx>B*+52426>ecbMtj0W5d@dW(2{n1mA zftg#~{NKQsgf{oBHuuEDx2#jKi5X8Ku)4~%7r|bz##eW8)k~bjrzXN-zPZ^Ktnqs$ zz0vUN)-B9vJ?a03bviOJ7L83xFTuU`8Zy3aosI|MZvQ=aArn3~*6%y{3O=FLW&Ggn zmpDsZ$5*VUJTrl@SQMWDy_wE0%TXzTw?B~hCfK+yQV*d22v_y><721Wni`borM$fv za!5pk+xVckTbzj~DBpjktX!uaP?VX)mgIo$vm7<@QZa_}%uVhKANa zz}wvH_cb+-`PQ#t3(Pt`k!bfcG};|@pWW8j+U)T*w6!#~_#2v=yk1|c$LDBnZX9c{ z+gsWiH}gBdg~RfgQ@;OhrfP=Jknt5Zk2g-a--;Po{G(E3Es?1REG6U2tR)k1U=49~ z){>1lu$8zbYspC**b8)KsFr9c=a|Zr!NkQ3jcs<3sjAK_J-qm!TD62B_K;@#r6l|0 z0uvcavZofZvQ?QXvHa2UIm3k^OB<2?x z!~)ScSN>rB1u;Kg+=l;!V!@pDW7{s|Y}+R0iiLC93pp1`a&p8!5Oe1qRjI4y)M^XX z99&Z81erMS(@bW_BvdnlI;j?dSPDz3^Mgx3l53bjLzu-3SxNP_AYE)pwKnJkl02Ik zvI%pTAt$LW4vNyaI=Cc{=Q2YsjjNbJHC6cVVlrz-RWg}m$sHwHO5C2zDm9%iEwkj6 zmjvfkWv2Nu{ydpu;ChA%u9ssnoj2{Y6joLQ3(G5lNns}+-)Z6FLY3)!RgI;%))Fi( zw*-^KmCrX6@w z5VFO5k{cj587%q6Qc0elpDk`9`A*2IOqOkCgCyU!EnC!*yaw_*yg`*FNv_pqi$x^2 zLf%kiDXQKn$%~4z#qA_-g1oiHvc0xSl5gLhE$$$BJLH{pmL2sql6=RGY_WvoPRP5h zmXbYnlDwoOTTHS~NUtl4J-(35K91L=xuiL-xu6kCE->xLn|bW4=poL4k)UeL;ULC0CF?aBK@T(%+8xrI#j z(ZxAdVxem@kLBlOWxTr}U7s6t$}EhPT`tH&{tlgs7$^G!WUnf*B(ut-&$$5a$3tZ* zT+dmig6ymyQ)L?(pIBq`lo=}WJG+WHp}6w#{Ze zsd-fMP}bs7=KP{^J#54EP>Aax$)3RVk;6*=NONIcrM76)YQ&tg3-hWh(|lHzMa_zr z&mpkVd!+1<9PE)??2#mUW`Q~J0phw*InQd68jZ-*GVfmFlAMAz>GiJ3CGJ||sd|~} zY>^d>yqQ^wYd8y^A6}cJKd`1R@u4-IYOfkzlS_IdYdlpi(;&}dTAL?4wwYouJw{A$dQFlzJv4b~)JHT6x33d8mpb0`S6!&}y6#+Ke%w z-Ix&e8fS!7J9s0w1H5HM=rEra_L)B-beiuK_M2ygj$H6O@B;9{S;1)*g#+d}q02lk z95kO3oObX=a0htHIpI;Jtx9T(wb~5E=2?m9fs!&yZh3JqSyIMj+6qg-4Q0VcRTWiY zg(Wy=u~;rv><-TD-i;U)zFf>$H=&NPo@eFpuU1q9=PD|caik{U%Y}^f&6Obbsl%5M z)(V{*i$#vQC60M?O?GEKG~1JTVYWB((rjPm6SE?FctK>3EC{M6XY*7~&CY9|wuq`{ zW?d}j)&2~-Cdn=?FcV|_`EM^5nr*9m0K7Pb4}$AbxC=awxa#>?hSx8uz9`K_F**OD zu=pj_{37iWi}R(|EDx$mtAJHN3u{`oAQ9XJ+y$%xRspMl)j$hdS+?91!uZN^)y*MR z1G^g7?FQ}!);BMe<;Eli=fiumNa;o(+0-pdHu%YydU_8-X_1 z*?b|4uPir=%j?-B_BFv@(_^aAW?(b03HD8}cK{v0W?(b01=s>?g1@GdA&jpqH%CLP z6?Uz#YX`OiTcOtqy*3~v-tEA4;9lTfU@Poe;~|W%EVrMM*Od}`JK(3|PS`nN*9GhXI-%!;-T~kNU>C3pco29H z=!Bi~fe^-5mb=c&>)9;!=R9>e7rR3wL}EXT7&w5~OEFx!C+H!sW2o1D^V%@vB5~ZG zMoAAbdBZ(pH^!S7^OHWtMdBzBf~1d_&Gv$KU>DLjxJcZ|*c6Q;h9^i4q;(ACdX37x zmF2wqdI4jnX&ocEF(b`Cq@Ob~y(GVv<{QypGSWOlwa*#ZZb>fEJR`MVGm`(IwpWa7 zuOy$Rc}Ch^HPSqT8qD*ZhZnb3P;P_$E6%>rfBFp5;@(d49oO*r26b~rn&-HV&oj8| zIV1T$zKi=Gbi86DzsDWi@38`}u^(|^D5Fht<`ww2toa|fcg_Ea0`C8qk^4Vk8KfEA}>X!tM{uRNi|E}QEe@_@~1K$h2 z4}Aak1i$|KLO}mF!kGRC!npp2g8u;cLGVN1hkqypjXx45jIRnI<39)|jXxF!rS%=r z(C4|mWNm$}dA`128}#==e;@Ss|5ykcFAI~#e-a|bp9)dq8$$R1_(AYP;D_H3rp#{& zcbnf5V&kG`1d7)~ zh}RTThq0%I5ett4j{}FXhla6-?f~8aJPte#yc2jQa2R`P`0F8zuPh(`^AH<>-3aVX z08ao%pf>`&yMT8APXJE<-9R^R1a>3e3SoR@`NX2UH&gPgQTQ1}Ec61sz){$Z!p;Nq z0KGsj&X2J}PE54`{|02~930mp&kKtJsKe;dO1 z%JSIX%j?-B&kMp|5V0@>3;~0%55j%|H~|a+L%@^3lfWSS1z!tcd}TTGk0BO@T^M!| zU<4S3UKo0lz)4^P7y(9sQD7K$;h%&szOo$oXL(&IdFB-SOd%G=fHB|{?51FMH}Gy? z3>X8(fpOpz{7n5Ugz=T-*uREY0(J@5%>ZYB3FsxDHw~Ny&H!hCr+}w`3D_n6J%sU< z<(dDG*RxrkOR;oVj;Z*fi}Fg!Yo(Y<`7g!LQM$MCJXeaTe!4gEd{>I8Dfuy@7#gH| zAJ2oOm>Q;g7SD&Jm^wxGAf6XXF*QN=kj?Ys^q88SFDo&X=1a|+m6%HNq~^~`OijQyweU}nzy7{! zVajKu+7|h>RHLTkXA<8mKcYNHsfl41lWSE2t})uvCPL&n(jG@XL)RYFq|#nKz-w3M zEvhoo@6L@GNWWV@XCVFVT3#!5x5>5Y5nd~H-|>op^t&`%ze~sUyLNH?E{9y39^(34 zw{iWR0IYQJYqyHmydjnhTx>UI1P=CfABu?L~Z)^Q_qPKyk4p zXS+7|0G=m{OCi&D3*_DO5QqNX-H+XRhcY@~I!>y0 z@0ROb;8yEh>7JXi@1^>8TCRVATdjYkdiNf=-UVJ$y?dWr?*g}6?;^LzG2nwO0-6}> zMVz4T%2*jV>Npp1#3NP-j(To@IN}nk14kV%#nIlxiJQT@}4I5k3uI7R+kxjWa@ zUHFbEB%4My_ks_inevUgvH+)w^oZG%!Bm=uF*3rAN3Xa?bC6;eY?7hLT_Zzu>e|?v+3;Uxd zBbV1bQhmk!cl)`2^4GtQ`|Cf-{q^4kf4Ymp5#1%BPxrWRlkN$j*ACtY?f`FjLU8Gy z6prej68iN|3peYZ5nOigMsNps%QM1&@mb*(<8#8G@w39M#?K4g(mINGeN#;J4PS>d z$k#DohkhgU9nf$2yf9>bUbxNtMd6tFOTz8u7lk1^cq6z2yyZnHQUUGU?=9`6VC16{Cl!R{#VD6k*c z54;(8GtdP;u0IZ8d}X=+)3U!!?zIE(H-J4p2pj|sz(w*qek4#3~Q zXF?cXSswfX_nUgJ9)h1C?D1p3W56NU4Z-d<;BCNTz+=GMfwu#P;AiNC5XM)Qk9{S? zq`DA$T&fF!*Hjl`k4tqSaLaWe?e$)Hue&G)^PEH4>$m|axremZhvYg#+3To3lw3sG z>!?HO$R2y0^7_=AB()x(y`G+)>|9P(7REs`R zE&52c=p)r`(pRcSAE_RFqd{xq1U>rn|LXd2C+T6lYe^nYVqD7g(QA@@bjvzDN$89sJX0NzRH=ef^jxJ<^bonjmr*kI z%@sq2S^&>;sU`pxQcVCX7CB~63|49ZXXe~&SLV6djlCzOMv!|?W$C@AkMiCV)c{4* zd*Xb6YK9a(NHs$WcY$XRNAF3*Yv8@7bJFhvP|fhT+Nv&Ce#F59aPUJ^ay%R_6!VZI@TAw10_@5v|{HOU` zX}u)mIj5x>MbCRLuG%@JrjdF{t~TC-aUFS1scEEMlIsrMi*Zf7Vq~3Ctt0i4T=$?S zgL(RtUXoM;VE@Uzq#U`I)TiU~_3h%lo<0Ykr|%H&_4M7w{r3fUucvRC`|VHfB}ugu z_fNef#ect%`|q#k{`>cF|NTd~|NgtU|NcqtzyCh?f3IGWR8#SJsFy_ZY^X)_s6F(k zJ@lwO^c!oDQ6qXsM)Z!1=p7l+OR|GEf;+%l&`UC-cVtHI$c)~R8NDPscq6z2yam0a zO7w;*(Hp8nZ>Tb@m!zUvi?5G*NwmJ89r}&XcR;@dy`*aNj;hf+sz&dq8oeYtcq6z2 zyam0aTJ(-;(L1U|@2D2NBs+K`xC6Wey`*~dj_T1nsz>jr9=#+xcq6z2yk(PK(sk4f z>EBCAy`}4^A?SC2HmS9w9us1t)MEl}wZ|lVuTr`eBc+}bVx!b^0&cbEB=wdM8>QY7 z@S1u{h>=on3AokXlGICjOztHCf9Jg9Ne_E{3F|w6-N-iz+hBk?j@3%%I z|C{C5>v}_*#7Of0e!d?k|L>K9uj>tM5+i@#->0wZ4Q&!5H~encRlaY(-W-0lH-s8B zgx-+IMAVq~qemou`2WNA;KOU+!)rjg29z_KCOS}w2l903Q7R*jOh}I($>V~|Ps{jU z?ZE9)rw6yvI_BMM{quP`f4!Ywv68F%|U5qErgYAPB_K*M*^Gd>X+$2bnQ1-#Sa?up1))Q!_* zZVwKoMdNO)b8I3Ui0_HVeCt*&ku5BD?H%0o8NS=V18=&1{V@MLv)O0U<(kAu?lZ1&4E`K_|tU`$cQQ=~|EWHW7=oL@KD%*GcWY0-~Iq-WWzE=~Va zvV4FJwxvA2EIrR=18IK#QkFlyGesFsmu0yt&Ckzd`O|6ozmetNZ@$o~G}T<=TDg{di<~@bG$hLY9}N>Ca2@Unf7hUY&hY zmXD^%euIt!Qk9J!zJmOI z3Sg=nCf>RJ6$_AWcHXiX^3Bd$21w2}p0{MyNHh^(RvaqDaaCp=k4#&Go_LVSKS`d5 zF>5U9_ar>bnh2auNZND+72-fR7DeFk`w@1S)fCl3w{4wLqYZq$Qmf7Gwx=mIY*cD=LzpsMqpgX#C&t`3&>wKy zTN|43E1m1do0a8MrX;d!BVv@rPKMV7fPB;x(?E0Tm6K$w91q3eS+KFeMkR+qel*|kGt2( z=ypaasC3%_rvH7Bw<*r2JkqFSOSzL`Z<6y_1xICD%8zNT9GREOh~G_P|D8-m6?;*( zSFlFbQ^pm0SuCY-*Xu6; literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=True.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=True.co new file mode 100755 index 0000000000000000000000000000000000000000..47c06f5efd8faf3699ca871a6cec66d2aedc86f2 GIT binary patch literal 11816 zcmeHNeNU57ZtA-?G23dsmbxe~V1 zwfLHlIPo&_D`-4`rY62Jiq6C_1=4*_qE=8>XV~)7|O~R6+4rb8D9I-nko61+v@ zIu_|zUkRuA-5W8ZQKH4`$l7DN45tQUMmD0#$oiORdIKCEr9Hh}eOwOxV=k(U3#Vmv zZ$v%5xPL6-GtepRY2hCgSX=Bbz&gAMzNcLz9W_cS5Ss_#fS1y0rIeV=e&JnGdx;NCWh5bzo=xROp5;T*RQ9Qb7x_>$Dv8;wkj;c5;#a1U>2N6d z9G{NGvSN-r%V!gx6)l(e0((XXl@LU=Y76Q%1Uu-s^Znj1|}f1?Sw8%@xD zqY2zMnjmna32?jI*nP17x(OcE@416&sImB-yBK~*Q0~CBsImVk>g)WJl%CB9S#fGc zOyz`F;-nZA*f;p|QSzGdOyKTcbBB_Nsg&gpK4fQt@#pzuC^a=3ni4O5IwPiTALVX; zWz8fU7o}_>ow{Hkuk$mZlR_*R$_XdZ(zMnA)}#?nQ7SYezO!36;_019i-I?sLp~vz zK9M?j{BA-0cQB@)Te#aTBog1?Gg9JsC?~G?l58fCOsWqzd5+IM zG!srI-{H5=B0n8{ou5r5Vrgkc`3l1NcPRTBKbsY^Li8bgkwip5|0C0vafQ~p@!fB| zqO#(Cyu_af9T#I#8rOgg=EzsIs+7a)!a{sqH^~>{mHfciA87Q#t-NI~3^_2A^yp78i;LLOhWXBD0C8;O&k1Bc5m=7>)&k zfnIU5?tCO0Kb6Q0guFgauP5Sh`vQHTus7K6?~i)>{NZpU5Q_Bn_4#66kEcKA+brX# zY8=;mBij7CSttkNlVvZF1q`4*-^yj?6__!wX62H}KqW=Fd8Hm^8OSle(Y(^mI8d3A zP3DyVLhF`HNR14haWD+hs(5(6%!yj*7T&n*)}+o_c&mZ{0G0v9=JvR^2W-&`Ux zAqDc(lDWcA{wDd-5kArgIvMbLR}^wsP{wTe8^zYr~at0P``; zMqly@zMg^k45ooO=69ZMI@ff1$D(t`m*l4T9gBZjK~{EXxtq>#i`;g)Tt*&2uG7m+ zk8_L8$C?+pW=^i=IQb^IdA{lKCfJ;dO-`p=<7|;_^3M4kzwBJ3PM5q*Zbe4tV!g9T zcE~&C&l!wu^G2f!;}l;pTH-`L_@i>dl`}LF&R8(gxQw<7Mq_*h$m}MYE$m1tX{Wxfa!Lba7`~Z7$ocmbmTa+IWFn z!Zk6FI&yL87V`YkS(EdEbG~!YjO%c*%0T98YFuVpRh-luylOU6*~0AGVXxorve;`B zyTw9f%IpoWH`TkSW4mIfl*(1i&cW_#a8=z}uh^@qsI0eZLA#ZDyJwt>PPucD)J$Bh zAZO(fhQ_lpKc_-Z&!I`y~-;a&&n(0$^k1fxWY8M;#3er-+ zm3^?p#(tx}*2V6%N|ZU~Kd)(fzFW5bmStj`OU z62Ha5RaRQ!Bg}7vpIQ{Xk?AeaQ_2~)S>m$pHm5)7`ckJ4*%f47i0f{GxMl!C-VbasSD9<`xWQZozDJ8A)ya zQ9o*Hr5V@#IYrw%2Iwyk2XkalM@!t~VP$ zj{NJ}T$!Q41#AblGj209bTJNeGu~`y@G}k!Fzzul3^ER6dROIqBhzC-vpf2lv+G@R%wpmB%F6#m6>Hyyj-VNS!fVSHX(hl1p+G!i3yKRSQdo%b>@K*3$ zhiR8>oOaueP~J8{du&H(R|oiR@NV#)qttDmq#k>idhJo_v&X2r8GI*rEBLM$_1ojL z*M5xl*^{*2o}&H^@ZI3u;5{iCaAauEA<+Rxj_z?Br-5eho#3tDyN=UA$4NTm_$(cE ze2(sQ%+bLP@ZI3u;5~En9KmyDp*1yFKrE1I?&0cMm+592FI3m6HcOq0J=f<9b!~E8 zLwvrW!F9f_F+SgjOHPtkteEx~RufYFwB~=gt}Z@bSEu`LWBzS~?3=Gf%`qmgprmXY zQPC=^c*ly073`iIDW9LKE^nH#dAaBC4i0Deai-w$5FxZ&wJg6}UIo>AIjwXo?t zU4Fr^u*{wlZse7>xzmRF7GMjIBmO+sPe?Pc8Q21B0qz9u1af3G&)qJ;znX9Pltf(6 zxu9zUwgFv;b0Mx3*a~a|wgGnmcL80{xqe=Pe>LCsY3W0?ryY6Pord~OU?;F0x_0O~ zfE~b2U?*@la5u0WdD`!m;9t#m-YbzV=(?cefjqDaab1Y(26h8^AP?*T_5iz}>$+cp ze>KkwT7NdFeQxA)KVqo&0=+;t^ls=qKo8Ih^a6cAAJC0_?uZ2cYTi2~5kGW(==y+t zKtJOAi0cLR0{eh{z>u-7s(%IEc7G#0>$5fWyFH;9lTf;2?B^zaYWCnjb!; z^=Gr%Z~5XC6Yf`;A)b;k)Ic|CFRS6ooZx^R3(j%a1F*}CcM<$Q0&%FxADS~d;m5*s zlEuR>GmiNz%HmP8$sTYlSmVqOE;Bxer;_&IYa z*vZ=WJ0vG@DE5=Ay+OykleOdTxZot6iv4q}oxzT0oGiaT_^OlaQS35nXE1o#$=dOn z(9SoXShm-(H3^-mSRYW_Z_nt#}-=I`rP^AC5c`TK^{{KG@Y zZ@ozGvHl)CY<+>=YkiT9D(!eor`|+6Ppr3d;B#s_V;;o&5Z{aV{uk-E^<{dW^;LSr z`up^L>uYp82)+k=2z>8rbi(#JeZcld^r-C(D%h6jga_OQ-V5HpL?>-4G-SI>!?thH zi0#{SG6=p0dXnsi*DGmgKaDaVg!+VL(;dcb|)z2N=t z(v0IOebDh^DmngvW*tAFnIQNc@FDQMKcTE%A4I)oHFXTn)EH{vVc=ol7@naqJVW;Y z?*SeL9tPeEycalzXKL({1pjLO@JkXIhi)9YBfulTam0-y?mpmsz$3sT!25ys1IM8o z|C$868x+AqstPR zgl-bLFfa_9MBF6eLckC(3=9J!zzA>>y2)=!@UP~>f2{RqlXYGc`J$+WF<=ZBg+2;4xqvadE^YfC=C+;4$Dda2gngF8;Oz z|7!l&Uuyj-T4yGaCy82^0;YgT=#tRQ0B3+HU<#N9rh!T1N&dA2|7t$dFAa4 zbP`YkW)PP_+=IXeftUsI5-)}PJRxvZ9sX*G46&B5xrR;j6M z{mW`7eu&Z6xk^o)WOJjs-c@R9(fXLM8X9GDpSliKYHFO#S?YRNsi{de2dV2~rKV=s z9J2ZPxU{C0u9x+iTDor5YbtA}c>S!`)Y5hIwQ4GB??%?w*Q=>4|NqAN`+7C?|9gFY zy_))ey?)ng>glE?`Or`2my`cFzbq3O|A#$|-*n`U|2`grkMDtx?}1J40e#PyV;j0S z@PBmWU_WEA13$ki$M-cq)%Zyb|Luc9nw`p;->BIyYFwl1fX!kq!gjYfZJ&kndU-eR z#y$Yo9?#`6*@2!OkoatvkECaM4uzs(I;HG_rDrm;Ik6|1JU-K%ka&KUremtJ!PVCRNdKNI9<{RVDVYX16pJ1>^LDX8*5SH`>o=_V$wa zr!;$GiTwr5{zi%YP0b!CvFA1W4|XdjYaf57*{_tuzpvRFo7X~>6O%H~6{IxJEt=g` z62D8cUnyxfsMy~xENo0qQks3F#QtT?j-LQF9Dk$PHyA-~Wxpen_Y0-=@oU(B=^FO$ zT*JN%n@pSK|Kv674`0Ln>({Vfx`w@v6&QnsEy6b(%ZT4>KlYzt-)uj&0eLptk0pF6 zof8R^i(orEk;2Xoc65`g{EqcL2c{^fDomO7QI39^?U8KCCl8s2ATqM3FHJ&~80*KOA3>JF@WXpO< zjLa%;%xqbSH_sAK=P>OjlZ3rFl9^RrxmmkWN^Ht~P-5Ha72Z-Wuk%`lu#udg?CoYVr@dXu$<3a=dcI`k&6 ze=ii$x?W}pap;Jj%9TU+>w3+sEQ?rf-LJ1tbu88E{nwGLm)Mk~>-F`pj{167Z%>() zbpqUlKW5YQcv(O>)=;HupCfB$>IqwyF}=QCUt8zt@4?(y-Q zpQW)OxJqp-`lITPYkGayn3f&7UR}?rHukBvrQ<2s^lQH!Maav|Dx1^>mW?+v>Eo~8 o|7T#+$6i&apEa#2{zkD8=X$%k`3pGNO#i&5->cu~nxItw-vr{67XSbN literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co deleted file mode 100755 index 126068d82e6c95ffa1890e0d3d3b550bc0c77faf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 19384 zcmeHP4Ro7VdA^opN4ox#)yPP)q)M^kpVU?)+wxx$S8-g&F^cQDZkh%+vSi7Y6IpU4 z*-4?eAC5_ASK>63(v=Rn^4IZGC~O@Jj_nl=<7|x0Wru~qVLgm7wqXY2u){eU)t-Cr z_t~=IG>SkNKHu}+_kHixr|-Vs`|8c7hwljVstOAH8X{h5@+v8ij~V#o?{M&?uIRnE)MKW8HB zyu8tMDxcL|uw(OW_r>O7f9-6+Ti~1Zkuji(>~9x;5v9h#U<1;>&ziOlFn+@aJ&`uz(pxI>XhDw<9Q)6=1`=tn2w$#5wC zF?Vu&JRQxDkGRv5Z;fg{;hqh}XQIK=lZnWugi0_xGd>3&emsRBR?y1n(;CMWg37$!&PRk2mQ);jY5}~Q+x9fu^yuPVqG}x5Rz+W(u zJd+rly*tSM8yx583Eu4qPELN&J)N4I4P~P1YutDX}8=lX7I`VnAgln;1(p0@xoSO=IK4_}Pw@CMEDF zr?x$M(USGJ~=jjNw_?ntp+r2(-hu7E9)ZytE8xKW79g%Rj zrET+y1smK1WJSLJZKdjj&|2_mvVfDI-0!y1`%TKEf}#~ZpQ>nutw{p%x1cUsF)Y!%n3lByQ7RaHW#>itsHeAS|*$|9L8m=*J?ujTJ`e4MMQI{yTGsu!$_)~uD! zW0kCnS*umdV|_-l&RZ9&tW^@9ORBnPsd^ge)cmpPD#nN%*7&+Fe)aZy{Lm2{GBUVl-4 zu~J_ty-9l0{Gv)zGq2IO@cG16G};)E2EVH&C2EEmQliOfXiUOfSxrf71<2%uq@<8x z5h*FkYRY14gJw1Qm>tGKE%wxF^&7A)d8tF;O}t6k`69gYi*U9R$Kd#t?59?Q~Q zVtkiNjMEy&g_=5-q23iURJmf=k~%S7w?~YZG&n9axLwA*dt%0_J+W+=Ta3FsV!W)$ zaiOWjW%7AqrYcV?t8WqGEv;f)-|o23-s!5^*BYy;YK>(LonpLmzZf@kIWBbhUG{_f zWA>{3v8=`~#{J!5T+`#Y(9`R39`24gt1PiBdCwAycst3ZB_nx!>CwWH%O&%MivGRWi9B@*cOVa&H5dS5|5y<$V?5eeK}ytLSpkg`$f^lHnrJkGxt;79>AI z&4R>uxmNJ<62T36!3{eFH<|=DS_L;b1UI<^uc{NgszGqOM{s+K;HB+?m+oJ{+Mas5 zLQGl`eK|7L&5#qJr0!`1N^}j>ScYg87?+gslDzQ} zF}|XVm*tI@iE&aW&#%uL*UR%O~A)%66b`)CPE%vezdUg za+aK5A~?xea$%`xhf2Mp&@`tjw9J(}Rkm36XwmYDdSO|)Mzpv_bht*cLNe?ChnV-I$$5Lf$<8J#ltwTg>k3K(#|+= zKjU7N#m_j9+4tx!TbX@8_nej4_n1FxW%dE{%T{LJQ~!#U*$3*sZ)Ns99sg-%_JIzS zjoEh>3j6*-Vc%^O_Wed--@RMd_wN?=-9BO8?-TaDrNVxoRM_|0g#Ca`*!MOH`+-Jb z-+MsV4;+BKv5)RH_R|_;fYurZs7nW43|21biw9X6O4DJJO zJ4PGK$7!Sa1a+H7=w9%=N#Ry11|I}H-k{gxe!4&^ z9p@@7E?t!&maVi1ncnUyz10$1RM~4JyDK*Da=9+q_r&J+>_Oa$UojBU!_Y`b@3Zpw zXYBUayxp#hBPEMpDJP_Vz7p|J6TgB`SMC>RkOUf61QxSvvPb>sT(A1#T%Y>#T)+Cs zIf*>6B$4+nQPop(#j2<07WB`!B-IDz0z~`DK!LX~OBR=igOGupKUyun)w9kA!3{Zl z2;7*%1K`DstDc`@d55I>IOba9ak~5xUH+tMVVUg{m;c3Yx*k!PYJfFB7in2_l@hWW zxEojltO3>nYk@AZw(5Gb9ph`OHFw%c9qj60w+FZfSO>j2=+y)3fqQ^^fVTl}1J=Q= z?ruBA*H-u3YriV?G{8^8yHzGP&<$*WT?6bIfsH^n&<)%R+zV`gpN6;CF}}9y4%&$a zb{^O@0h@px=y{;$1$u!^z$Rcbuo>uqooCFB@wL^a33)wR#J(2zYk6E{Y6Z3eTVUS; zdmqpTYz4Lg+kkDr7Wiv9ZO8c9YHQL?+F{oYyG~#yupN5s(CYwp06T%5zWt#&kH~F{}&Ql>05wzV>art+Q+$Bf6B0%|C3Ow-Ga!Kg8x6 zF<-W^d4}tsvynYqF0pw=>OX5^{)anWwvl~YzQE=g>3GG)<{2u)JYRlnxzNt?ohNWl zpZngx*@wwa+J_&2z#i<{8@koQ?TEv0L~b^1W-(#g=+W_Li za6w1m!!N_XYs3HGLmU22mJ0vJY{LIZoA7_EQTRXEDEuEgApD;^0RP6%(EE&^r6-Jk zOy6RBi4O944i-NA=a^^D#ymrknCEy0^!Gvk0Q7HviH;aQPwzMW89izI5)B%c=*U6v zL*RFSA6}xP<`o(;zf8mCuh22`SLtX6_&)Fh;J1I3M$BKMQS)EYar4*dg!vmZauECw z_#NPfzd>WRzonD5SLi9*-_g^yZ_y#Xz9WU~^DL~~SYKd4tS{C9{e93s0R7v)MdP+B zbjtP*G-3NLP1?Rk;|IYHf!_gs_PKcv$g;QPQ2fZzT@nsNS! z&N%;#&N_cg&p3ZVGY7#Bf!_gs_$QRb>tV!e7E_O7PaQ`rJOMlbJdQnd92eVt!25tF zfG2=&0lo!z9DC~c=j<3?TRriob}|CH5!jsso&=6SZv=Yx1Mdf(1fB#2fkEI1>_)z5 z$N1Xn$z^$O=HywU@H2{77zT!cqp%x=T?iNghJj(=7;p?Y3O}P?wqty4HT)NL5`kR= zcH_WtU<7&*=tY51;5cv`I02jhMqn5DD?7&5R>%KFUe6YJUJU+Xh=r$sr+_io$6!AR zoCKZ%o&uf*o(9I?FZNA4#@AL){k@&UVHby80+;~Cp%;hV6mSZd049J*U=kRIUHm(C zjIXUG{!w06PM$dpKhubXDPRgX4ZCUBJpg}G+pzzp;<(3=6y0B3=-z%#%zzzpm%|8B?l+Uo3o$m`iE&t z{FlYhQFdq`FZrUVk(>OX7cOn#Z>11*O7l;FQ)#s^YiP) z)ZcXeuEf-Ll$T3GznbUY+m$|5vRHyz_&dm7d2WotQcE5~E&OiOvlo{{Ev)CYu%6e# zmu{dIKBv#>^*Och2Iq3Q7H0VjuWi})b-Xs-(BnL?!8h3{H8Jc`a;<7*dkguHgCX)9 zzQ>WzuxpRiqp5m77l-1oAT>30_j z{cfYs@7^u+yM1zPdWX>OzDMZymJ0n|o6zrV6#BgfwDaX-D) z*hQ^6@M7>%@bWI|F#D;~+)a0xduX+}mpXLd#o(pj<-OEpLv3n9ZE8bpYTKe#b)q(P zqBeD+Hg%#_)qxj-mx7n0R;@;DT8-MY8ntQl7PV?EYSUWOrnRU|Yf-D}z>C34!OO?x zT2ZgRgpYE87#!ye2AB30ee4{LvxZ78zd$TC3l_Vp_}0qTs~M54@|v-Aqh{PxQzCoi zPnW;vbwTs9_wIrCH^@=Ijbju22#jT$Kq~_`qW&jIx*L-PR(_y zXXo~--#XW&odiJU{10p3+@VfI9MVURQP-P*+xyb>0JR z$l-o)9pkFU;FHyn7lDuSIkFx=7Ks(S5 zycKvW(29GWb)OyMYpeF#<$IrhW4}1y$1$Zc?E>xsI$-C3ofGH;?gH)tRs*Yn4)}2# zvSWO0b=P6pU(bd=W&iPd_W`-y1#Y+A<@em2eb4LP8M*!iZnysB_3ndmy$igddiU*e zy$jrSy^GvJYsCj!0dx@3hd9B$DCN`%*UMufGufSpCE6tINE;>K=WV7*Jm^zx_txw|{rHjrr^M34i@}gl)`E|2@Lb zKOtD*O%H4}Zo>^oa2??KeI_ z-)wx6_IbgZ!F}LuPtt(-DSFiWG#xNML+>(2mQ8>(qZTG^d9HO=`rUg>AlVu=&%>O8Qcfn z_5wX#je4yb^;$LRwd$AXaW8l?xDUJy^;<3KwOZ6`wW!x>QNMY?o56kHZA+BJjb6kJ zw%7ZxC;PC+`+@zyKJ39h?7<_zBfx%OKk&`KHv{{yC;Q%G$N1W6|NG><*|)Jb1Mm~T z9v=V>00XcK!0ssUC~yEc0K5};Colj%f%n@nzP38>LD}CH_u4`D8^j(T0uBKOVLu4_ zyMT8Ahk!%CyMcED2jOqtqy%m_+9`08u#j9_!-6?KL$Jo9ERO6?Ct^H13U&i z2D}$|FK`%shCgA)_}c2RPumHv3$e#}T?o9Px)6Ju*M-1s*M)4a_sM%bz+$k-Irv`3 z4M@p7_+B5D>kMVDqyA8G5x&<^hcqyI>~)sc=jJ52^#I%J`8kTR*O|ZETt(UI%wKNK za;?42{N7A1bG^OJ*7uvpb*{J9uc{0Fk8>equfLMix? zBh{miRF6JV{T6+tM)Z*y(MM`TAE^<2B^`J%cqw@K><#4lsv1cp`aYLbqE95}=R8kG zpQ!6udHge*$8nYOK2rCJJf6ik&-Kx3l6~~ZI=xxwjN{9L#C{M>bWPnP`N(C*3Wq_ar#$ zJ(cHh)_XGKaMpV&U>vrNZ>(n6h(o+)c&A)50B@*fctoxlfHzb#y!*y`WWSbP*n8z( z7;xLYu)Lau*Bk|?LFAf+*B&`F%P8VsZq34aF1a;JjB(^Kqzm~GtKJz$?LrQNqgtP4 z9PvNH5b>YQ7s%@+AYZSBSy#(s#m70e4k^&u~2NO8*oKn;9UQ*yb(TfR8zHB31 zyw>5pq`-sd$zYy-rI*BOfJfJRNmTA7^&7=}{kui4r{5>$>Ayqtdiw7X{`;e%*V8{E z{0`*zl6Wm8{Igz?;(x#<{0}q={{shv|AC{z|G@pi|G<>+Kk#<=f4yE3uc^d5te3>* zxvCa1qxLYP_AsONFkfAZjM~sUvY~fmL+{9jUXmBQ8QcfnhF+2ry(1@jM^5yPoaiNa z!JEN-;BDw7Riigljowf-dPCKDy(AT@wZ!^ZFNv)$=7oMU^nK88LocZoy`x(6j%v|6 zszoo!3*HRw18+kwsUE$fdi0L!(L1U~FUbqu4DJJOLocZjy`x6-jvCQBYD6!|3*HRw z18>`+mvj>~Lw>HrdrLP{L$I7}i(HHMm=GIzj|sTl9uxmwB_~(oJtxFQ-g5$Ox97xr zONfoUw*T2NAIK_!P)Vq`mFV{R_ZdqZ2q z$TwOeUMt7m)En9&Ml%0zrHNdX{<(bth@&WR6{xZpk$0+#gUF?BVyr=;D z+YIsg#cSClJ8GB5HL`qE<}%#?Jgn*%V~^)Kj3bpCYix7FgQKa{<85)Yh1)%z zkhi0ywY90KBi!E7))8uJk3{h}RZDYZthsrtITY>)`|3$xGJ)r0I~{C6fk2PE3`gDG zcrq~&O$DbznHYNnFqlpTGci0On3|qS2ggE*V0vbHI+@BqW9)QjVlpv-aXc9q4bMyj zCllkzARfsJhVX1(G9AP^$0y^_^xkx8Y}3j)*~W4=&fu0u_-+HQjpynVCGmOgAQ6iq z5m?1x2YJlAS$>-=pU%_ol;uz5$&bqNC-UU?%km%C_`&-BQnLI?o_t=GYtdH7eZ5bX z|4*L&$7H$n)?95uz9h>{d49ew%LmcVvi!T=&HBe>`B(D%oR;N3$kTtDEZ28!_H#*=$5@{u=k+03 zZpzdD9GCwr`>q^acKAbCKAI;tv*#Gd&$5m@`CYO+PfXr@4f)ruA^-1d$lKJ+^Ut#R z{%5Ws|I{_)|9TDiudX4l-~sRId7ol(m6k}TEplJGA>Zox*E=EK>iO4)n0$-pUx_=B z%tVPBPx#`=Vd9=h%(!Erbd1RVXYx#nxKqhUC=(*?O!RDqYqRHpA&$mVNd&M+1mTOg z!|60}i=Pp2kEMl$OxY8|cuIP5YGw*Lh{){eY{DK_&V<6i7^V?N0!i@HWCj+K2@;$f z58`JCqCu~>tt03m!EiX3NCtUOZEE(4@kB%!9>ZhDlL`Fbz<4qqPo7~5+icN7g3+_l zv8hn{bTG>7+FHCpxY^{k^=hS#U{jt_`&CLlZ?Ji@QVSNm{s!9GJV8vKJ5vjso{1s0 z2g6a^1cFT-PgBsR$k_T6$tEj!+2qxm@3u{Gk>l2TwO=OMkpm?0H z9|M@4;lH4;ep%$n_~)J{+umoojTN4L{fF&?J@*Y9JbL6%ZZN)yV|M#vDK5Vq0rP*q z)u~+H?1(o_hnUALn3IW#QkI5AK^*1OlRKi&9%b!bd zycd7WUa4P{-;+>&muUU>hZMWK1^8rJep|<}a-UXy_lNZs*-NQ^l=G(-Vmv=2J`=_M zr}BMX`JE!YG9c^cO^<(@lV^|b5ZOzy&)3iAL(t8a<=V63+*iJC2!r`DrmxJOYs-%F zb!TOnCr?qa@5nPysDcl}ZvC^*@!^s4Ppy2RvGdKYMdkb}*Z&2`@=h`L(%#L1{Jwzo Z|10wNmHn!0|A)$jE*s?A|1Ttp@1g(z diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co new file mode 100755 index 0000000000000000000000000000000000000000..a7c1acb6f3c05c87152c24289c7c2c351d7be323 GIT binary patch literal 11448 zcmeHNZBSg-c|P3zU^!wDi?od0gJq1cjAg9LesXtNc6W9c zNlqPfjrA4R!HSx;w#P|5u0PZ|v17+^ohAd+&ZM0-O}%y}wcCv2=}c$*hZ!fGcKRpY zKIeYGvOuqYLjZmgh9V=DNbm#&>2h_u{5#?%? z>qrW8tgrItH2;B(m{KW`H7wSBtm!_!Qw`8j4__NItvwEoPt%^>u0Aef|DFx8arw0U z)Z4uaI@aD+57riY`B;ZH!H<{|hmIO0rLg5dc;M&nIQ+%Yy9s&tCI4(l%J_wua5@o= zXTJUcsd`cHPp8siFgcgV4nHUO)6?&=$FHm7lqUQ_*__=i6%ndQA5PQTqF`s-z0(MRufd-YJ!?uO~Bu3f|grN;JVcW zzFSRz>2hoH;OI>gJgm>TQ8m<9IOnbf9uSooxE3}3+oFCTOsA7`DKQhCo((6mVkCMt z91__(_=gelx^him_OH49@#u8I^8M@VOfd21LfoI2p7T$KuicpnCk~D9hhAATiKoNq zOf;Fe%8@sOS^rrv68C4tGs*Of)&bU}5m!;dKO263pLo*UJ(~=Ro=g_`#8C1~V)XQJ zQT-c@DCiQ8yToYphe9eHJ?+ni*Lu1BCdi9IX6;IDenlmjR5Tt}FK+UJka=J>kc_`C zY@tPACiI3dmxxA^=~?A1i0kj6vZ59AVbx3~nVnFg-#7HS%}{^U;+$6t>=kAky8kQHKYH@q z&TY@u{Dzs?i}i{540vsNY+>-p7Gm*15mA()DKR(~4TCFpxf2u>+uIXeZAh^kf+BR2n2opV0TYXSH$CX_x5#dmT*KhPH4Vi z?f$!2D7%HYd*K~=<-*M+zMRe5z~&VdGf>(kqOC@Czs5noPm>f zJ+^D1ZqZ^SD|Lur@w+bC7Hw5@xrAI;Cgh>zU6*Z(vdt!!+v;SiT)(jEa`~cMUM^SQ zwOzI@RQ_SbqNSojE|a$}R4!T;t1TA!Ho0u!G0xb$U^F_>XKBS~k%&C{3j?V$Ff@`n zW8O$5bcN)NMrj4e>?TrYV#tv?F17v9<-EDPIiJsye0fDBTX;QguH-LORyj*|R7mH! zD*k+xdY-rJRO5D5t8tbZ{!&e?bK9lW#1Ql0QPbrmLn)rbUxny+|r2KP)8|Yvd!?A~d5c-`8<^h=zs15E%FNQR?x#h5 zs`_Q!Z!Gc~jl98xdBglDN~}+8?qZfN>tj}dG24zY%acFG7+OfpAD9-;b4I5L^Gzlf zmd^MO<90KY8+)GTs*Q=u#Q^8nlZ`WEgW3W6SXuOnH+@CPs}r*STV6 z7cPryS+^^^eVrG^87d7ju}ywx*lCdSW{efyhZ65vk5{;Nofqmk-@09i8(8OsafTYL zox1gQviN-2c=6;v-sMZUS|_jt*uuDtt8Hf-=wiHqtMxJt^fB({Y6lnxGQG2Gp^oX> z%dXZjz0>hb9n-fvUaez#XX~4FOyA!6P94)b`~Izt>D&8$UB~o|CRN{NQuU2?Ro`Y; z^^HxczO6~sH+HM~wr*A5Y*qDwRn<4wt9qec)i-yjdZ9zrHxH_MVGw%jFm13Nq>a|Y zbdU82E2=&?}+HH@~9($bj+7r~<3ce4#1H3aqeU23EbEIj%BTM%?PE%h4_#W^k@V%$$ zfa5G3bbOf(IUb}39P@Ob6?`9f2YBZ^eT-O|^A=iJSwhHpQo%o5QROu4pi;h~O0`+4 zon^OGNsn<{v&>aX3)R)mD_pI#P>U%;;_q!Eq=KOpx1gsr|93e~THrX{e~|f+{_sKt zY)0k0bXdi0vWnIf6-(HhtT61JA2uw^A2eK?KWunpUM3fpWO8YVa*xh$;}+-7Tdp`| z?&|yzVtMCiiNlm9mzM}n$Wil0xl&ur8owQUTLB*dw-)dt;3bT6PtCJMjm&*ZX^Yi9 z^A)=MZSMRsyH5DwSKj68xtc~`BakQF9FN671F!+u2y6uI0qz0vWHrYhiow5{Yy4u2 zIH7Yw*9>e1IuYkYTobSf*bHn2?gj1zI-zrZDF**)uKCW`b+xAjd0HOgYTAHpz!vCQ zplb!T0^5LXz*bZI$ zeKGh~bAqV#XOr6JLO$08uEqoO0A0|#pmzh^Ko8IZ>;iTHUC8GO#^7Jgd8T8;3!N9b z9$*j9i#RXhx`Exm9$*i!7uXB*Lg$@{!M~d8NoxHnsJ%Yq@qLx6=?C@$ebD)!>jSdI zTtBcMxF5J5=tCY~CI%mfkVI{-~r$P-~e<3zaN8t zH8*rl>(6Gj-}2;pCd^lvA+C~9)Ib+%FRS6oJ;4Dx7JcKe`(T$DZzp7e#i1r&ch6{p zANQ|G77xG7xR;PA7LS@u_Jd>57-D{KnehQaB<4pAk25=v^}}0suwK8963Zv=<7ciE z>&HOZSUqds>j=~nhhopN_68j3de)A&b)lZLDfY9hoq^V?^(?=)@0ohCU$H;P+8OA3 zwVt)(F`=Ee9$ns1!PX$Se&zWOj;78L3-05ry?&G0p2r@jXYKUc)pk5h3-v62f0LTu z)BQ|6%iG_r=IyZ_tUrpnaIB=kbnZ>$uU^l8dv-nlkX6m!Q?KS9s#o*(bg21Fr88t)_E-jdp6++Zj2lwlnHRd>7)o5#Ref9k*Vi zcU%90p0IwK-eY}bJj11NI-&p#3LwMCqT~#6Hg*+ z%8mFg#CIdU_a`)D|63Zi{~e9ke@dtAKck^O@crO};0Jz2CCA$|>Uf979REaT96zU$ z8@vm=8@%`DH12qh&N}{uCLF(@Nyi5?-Uq%Pd=UJ=2b9(8W2o1xrjFv88bvJ}1C9Yl zaSe^)8oCR37jO(X20RWt4jjcbHTq{U_*ZjdFT}_=bmP#S0GtFA#LWU{feBy&m;@$)ap>az8iRi|m-waDpUu{}td`;nuu@aU z**#cY*D5uYt$$e!onZG?b)Bo!RQxESuXmN2TChGQtcFgpd!M=vR%&X9-Lus7uu@Yc zb`Mh5#Y#<$vwO(q>*L~@%KA~bUe;?WYp-zKtk+c5PT~4luc^iB=o{5k*50kGuWwdU zS^ocx_4mzc>i_rp{AM-v|9btd*VKp1W_j#43(GQ*@%{B7d=HjCdwn(rpUr{K=D=^A z1NxpVkKgs!f$#Im!G6kO2Yzx@j<+>G)%aNr|MQc4lAX$$->BKIXGn(etNbel}^UO z!7QIn20MBMWZ*qLF1MHO4ftFxzq`-d)8p~<1$^G#K7X$-6n6Kyyymw7;v+U*+YEW_;gdhF|4} zi|m7%oiDPV(dd4{zl1_ELtDJu2B)9oW-_-r^h z>(9)H;RJRku~#U%HU{;8Zw&Ie+@iZE$a8&Amk48Fb3I-no{>;g#6TF;NA$RQ`|y=g zv$6Jd%LY~NmbP0HbF(7b=-#r3J#MpAQ6QXVk1G5QO63y>i2;8)9Sv`--GWCAlzsXw zby|5a;B+v!b&>M0L?)Q_XM@s~)VM0xqXc&&SH4`*po$ zR+b4Yx9->1YdRL|_5SPlIcQPR*Yx`OP)B`zsJExw26Y15ix;!$?PJ4SIo436YyXGW z&eW4m0MqO1*R^$z_FU%L^Ow4=r~}=amc0jd==V4MxyO`hUR(d_<5#KKhk(qc>p#}+ zkNR_(dYl%n^i@A+cn4k^^!jt1Y+!K|b@w_xwn+@r>Fu-g!lO8!AI@SMi~qFx)0*DJ zLf^7^|4XpxW3MW-|4tR3*kHW=y?!13 R&Gb)b`uB^RdcXCw{{z8THaq|T literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=True.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=True.co new file mode 100755 index 0000000000000000000000000000000000000000..96c13a694e9dbdb4b156fa9961025c02af5d33c7 GIT binary patch literal 11816 zcmeHNeNZdtlC&Yb3}Tb8alK-`}0qg_`eJ)N_4nb&Py&e`FQY-c+=XIr%2 zdp|(PxDNHs_D`-4`rY62Jiq6C_1=4*_qE=8>XV~)ab;yA2BIG2Cgp&4ycE{1aH&0 zjzv1wSHf$4*G9}}lxXofvi6uR!>a+Ak&UP_vOZ>-o`d6~w5PYLkISKd%te%O;kC@J zji|>L_m4$<20EoZE&QVbYm5E)S%){l_qB_pqee*uV)Gz8@Dum!`_#ev2zl(K;7lZ* z4GPmyDHTm-zwjQZy(9#uGLjfh&!%!C&j~?kD*N=pOF}Rbk)qkGn9T&k(XUJ;)1hGU zc_AH(WurOroRCd?Hfp{k91kXEqvCWT75SGfP;;{h>Tfmyf3peN zZ#IGJW)t{tHUVyzo4XJ8-!Q>r`aO404K)Y!3Ov zNcu$T;PHD!_20pmf-do1mzYR=L&!*ptIviR5yVAp*}e>BC#5 zl|)pyYY#Rr)RvdC_}lDY8nT7K-%2bl6cNREA|r-p6A>{OaL2q+zq{9k zf4-1=v+jH(n=qBg4Fo-2cdt9_c6t4M!H_4=@9U3v`h1~K*dGk{_VsyV9=E$c;N2|a zsA?S7d?VWYyICj?<5OiXlLZW*KHth^rWKemuqNe_$v`DVxoM>yW*NvczrnQ9&Nxt+ zl8vSnKjXj=ATgO%_5()BxRryzCkV%v5qY`H=$l(6T-&LYCzq*_TY-z5HQFu|$ZswY znUDf`ddXD5mA^&4v;@y`gKS!+WMTQvik{0XmgyXvVeTAfYAYv?zb%_MSR1aC1DKC# zHu{oR@bx(6dUUzz z48Q0&)4a$x^Kv!M%eTnQ^G#=(U~?=sIUI70qeZsLJLh-&ier&Foboog6&W3i^^PXl zF7KE>#~Iq@4F)I1DZXMb$BBIKN9BYsXJ{b2pv*2Z|jz^i_~MfDq;{Ap*K)4HoAZoRcOULcon zO*m3VE-u|hURXM7bX;)EcP^T69WGXJWWJ`xX|h(uNzK8lCKHv-%)TA=`t44$twyn% z%~Yn$-T-@3y_4FvD|Sk$T*d4>?9K*f)$R3)y{d}Jdb?({TdB8u+Og=6I~PgK#MKIN zRvuw!I4d*0&8+fmRVuHpQF--tl~>lQyt47Eyh5J({4xHSrE+p+d7e}|*|=AbmI}V? z!zDKM8~wFTcCS^U%rXCYP20@0T(z@cs#V6X65r!&Er;t=$5-A~8z+W_D-~Fu7tAGo zvzf1~G{;An-vB=~D|!Rdo1v$aH*7Pkx>3glUEGimrG$KnEKa5`}R$#~`#I1|5MGC2*THvfbV zwYAcO>;Al=Z6d$Bw1Yec?N^q_2<}ykHOHTE6!07n^k3o5b*|W1(O_}y>vn~At@Glz z@|sGSSSPNQZ?BXKCQ}@@o2qxM$18f@Ixp69{&l+&H@MD=Dml@#D$A zzQdPs4NhP?u$^%$*U-f{(8YK&*WhCu=x5x`H4HKiWO`@id?V9$RbFUhdZ+!_MyBtw zzt+g~&W_8COyAY2M&}PdWw8gTI?zG%R`DXB);H}`h z?xIf1e%fleo3>d-=`PCw>g)jD4c-mjbAYy657G|nA=+sjqr0t#X?rvHPViRnU59Cx zb)0rvk5IunL3^x6X;%mMZt!mKo}<)do1|`Ahb1qFs~LPJcq{m>81>oWwAXfw z_SurO-bGZTz%J1Ndyek0AE*9i@SWhT;Jc2~LHkKMWdAH3wttT9 zwa?MP4)ERJ-QYcQ^c=x+W~MbYSU@b0YW}h6TBq?=8ZT7Wsy1_-lRekxxVkpEt|30( z(BM2@*BGB~#3d)mD;7+9466yLen#`ZQdbwBudCDjcQF4pLiWv9qvjZrS5Q*6ji_jm zRlI9O#R_&$j+D>ORhKu--BJF;+`jU&b29m@C7Jy8QWbZ8&ca=oTQL8sQ|5kcZa*=< zeRo;2v4C1hctY-OdOyGIGyEDK1-BIO1K@WR@%`WxjC0S-5qy7{dsb)ILg$38 z4cG>BBF>4pR$wcz4cG?U1>6O6Lg)N>3I5f5+oz>#YEL`zv^%)^PGBdn9lCbtI)EL( zPGBc+H*hzw9eLXCmEd2^ciu0NF6g?T6MzD+3vpeD>jriM1)u=z0rmj9pzC^2f`2tH zh+2O(seLZwb3M-0dw?FG3wjsyZlD|J0eXO5pcm*uK37Lx!l86sFA9Q`dKA;bA zKE(9`dx3qxK43qvALxV5H!Z=xn(s?%{VJ-xe&q50B3C~E902;E^FtQ^27m*=0pK3s z9-tq2{8{fgTElbznULDrS)gC z+Hd~i6(jCfnIWE%G1Nd8YA>td%A8<_9ShEJ*!{4}jCT?IKLT;6$=A#oo$zB}I?3YU zml?-=7Gd$I*<=qm7OZjR2bUQi#8b)qsNorA2eN+nDi1mId6k$yd>2!Bj`d@(GUQ|pJ9jA-YZ zPcGZ)*qQ{-uRQnO-MLew9rHSCZ@{Rw=dno+*3N)UZO7Ak!NKwmw5s_%y{|b~-ho~< zZ=dCm<8IW2gJm7YQBceE&;y-0}*2!15YBV)=dg zpyhQs9su71J_Nq^bvj{vgFa;aBYM>OCKatqbixhp1@8s#U!s%N6&kc&rXlOMXxRE~ zIvD`p13m=4_uDjL`x6?q{TYqfzDuWU-=hbV{soNedfKYj`?p_K`xkK|-i!EN#P@%X z#%l4}k9h9|GU|Lz=W-p)>ZsqAB~2XxjcBO}fFo;Jx7e@6n9? zDt*}gV=CGIfoAPLp_u^q9`GUXy+5I>ULQogW;JyT&(s)d;bGum;255vF+4-}0q+AI z1|A0954;~ZhG%N*k_7*1{_x8Z8Ha8hx+B0Nz;VQlBklp<1HdD|BftlN4+6)b8~>UF z|7!loA8OBL(K>4ac_vT`j{=VZC!m{v?jhhqz@xyUKoKYcCy;028xs7h`J>AcnS^c< zx)3k~oJ8Ct;)1{+Fa!(%!@w|b61vH6O7O4dLw~IGXOnea1oW=9J)Ak$AHIxam2+DmjEV!$AHIx)4*w99J=^B68x+A zV}Ggjt7x5>M4lvSVG5W6CZS70Hv^mjrhqA68khzqktg}r68x+A)Za?@4dm5#GSEpt z37A1#25}Dq9|mF;$VsqCzvh^>k zq4*(2U*{?{b&}1E>UvkHsYUB!!fI%Q&3)=RSgEOTHfO2pVWp-f*&L**LazTDo4=YijAbS+A+Ao#OSgUQ+c)Y)c^1G`HgDo z|MmJ^uc?nVHOYtmdwx0jpYzKyk@0`nNAa7E{PEw%WAO1k@bNvc={=zD8S`vI7YF{2 zt{m)VEOy}MSLOJ==BFAzso}qUSV*%|S@Rn-`$dgwbRDo+%!k?T7O(BI@E(ug5?t5^ z;M?Q5TqZlv(*qKp4GH1&OwXZUB$`esyI|>=%xo^&lT03;>CQ;$W6^Mqm(tkkC`{gJ3U;P!bVVXrsr4Tb`t-VQRFNMWCPfM*>V z9o?sOgKzIirc+Zwy1;=!w{LW-*Rz70 zDY5U+;#W%SJ(~Tg68m1wUc)x2ijG6dc?GE|v4=Ffxv?l1rzbS~f8D;(ep<7)m&8A< z*&9pjFKYHTOYCoHc7KUIui1aFTRB<#_&d#hr6m3X&ED9&7OI>Wm4U7xrFm}C?9P(- zU7Gz$NxK2X{y|}3V|tR(>?0-iFKc%E1hC=w8_mAK2y#379hrPkD7BAY$NtOLv47_} z_HEc?+ARMkuVa7gI`&_`j{VYg?0u}jaAvj$-*7A=ezX19e};Xt{n!TN*=#?S2&r@~ zN(5{=V>_D&Q>j@Y9?Zsx_Q~uyi3n0U63hjOkc*zoDbZ|;8s=zHN~17DA}9+)2xYTG zP+yM`!dX?JQMQSV&HKd6>DWsHgzcf{f+1iW&8TN867g6f2Sp-9#1nCp6fqP< z?GfF6uU~W#F%c8-%1sp9=M~*V428s0T2u;{yWd5^*bYym@HR~>olK@ru-coewSPHmIU=(8{`$C+>ps%++(*_?A_@1;~N!!OOLBR zAR@79WBt=D#aaF6?|OtjG9Uf@2O%{(BeK^EwzMJ}t%^cXiM=55GbokI8WBT5 zNlHYw)^5=Y9?IJhTk5p(I>zyEcPgi+y^DLtzO|R_3{R-DvDp8g)}Z6$(|4)khT_8pW%pFeq(;%6)6 z(AWF=_|<6kVIZ^V`i!R6--|uZE>O`?(mc5bz76^jHCWe|#+P!o&MdXD47y&;w*FZf z8-%OW#-cx}{rsTf(yX#cZD84WGm}34`u%?z mHht_>h5A|3s^V`H8*#3;tDC=olg;!mX!^bSjjjnw_5TCCqLRP> literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co deleted file mode 100755 index bb2ed6cf474fa4ad7be8d6a0880961ef729c860f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11800 zcmeHNZBScRdOlnUA&wD|u!SWdv;qqZ7=$DQNEom-alMX9yaW@6IK&ryA{ilRBw;(- z6yM0iPFxofZ@NvhJ3F4Vo21QdzP3%X-5sFWoo%O^ZcEbHI&CIRJJV_ZFq7R*JN?l{ zea`&=S;ny$cXt2edc<>|_kEx99-VvddyjR_xnCPO#8p%T)I>T|RxdMr!!eToSsYN1K)B6v(ve+FTCmuMk3j0F30CG!Ep4&>0~+-Oupny z$6~o?o_x!hOME4&eZ_e?n4F99Gl^8>J5mTAnv2Dv*;_QAz0(HtciMpIP8+b_X#;I{ z+JNg$8}Qv}1GrsoaUYylZkfYUsJir)c{%hbFWx{KQ6rxf^*!fwHa(Z&bJ6M9Xe!Ug z5@(|ko^{}_N61^^EsdLZ!yQZ}rc>HKxxvl^qkrg322;~>!RhGBU(Q5R_l(%@d1J%G zpN?j8iFE1;N8WbM2G8=bWH8U4NoQx|4zQGJJOruWZ1jV6{)D@CHXY?XxjfqABk41# z(bLCx_HQ&MqKiM~;uDGQJ2TnD>0my((Z|iNKwfd?HXg%0Z%8DUNhFiflSy83<{q65 zrIR0Y-gZ9cOyq6nTq+SuXJ^Ik^PByz_^xv<7tQgJNAV>RVIF-C&%A@Hv(brnSKp9W zX)j)Lo(Z0g#_gG$rYH61(a_j;7VqYQ0@T+L9N z$_`d9>r|*-@_S=qrxdUcU{GmDKrX}q$xJ4golv6GeWl(qR6Z7*a{~Uf=tWzSU$5%; zTGLwlz2-qyFWHB6NQKWkcyr<9S>hT8MeLjCU)E{w+ zU2#q)@&mp|Fy!tH`FdUcptsk9KzFa(@9y<`{4Reu7K{Y_kx!KRz_`2OHL`$#P~LZ2#r-mELWO2k%*Sa~S)T-;_!U%|RTJYtI}1>2R@)c{x)`t2 ztoj%S27rMIZuKybXt-52Bo_#0uOPxoMWttcg>bFsRxhuZ7EDW~qUmeio}M0)U|K4g zOjp>rsZ4aGk1VXHZsuRGELn;cQD+e>OGS%C(pjDt&RZ-Oo&{GeG%Pg~8^o9fp<$`m z&>+P$d|hZLG%Q&fEP}~`tjMdnS$m(+#`%VZ3(ulW)q-`&TC|FJtb%o^Xthdttlto< z1?y6SwLuVb2@Q*uhUWzpDXgfF(*&*pHx(?4S~Xc^&&<7QZWTbpvAIyUSZAOs737H( zLLOf+J!x31G1Le;L04eswT4>ZF5#|%{;P&XtwAeP3)KbvqJFVnUoY$yb{Cd7b!$Pb zcHsKNSJm1$5k@~$ks1|4HK|b-)if?3S5d8wuL7C9lGIc()FSyU zR>@Co_KP;VqqfN!uia^l7u9yjZ*P|TYKQ%zqt#*9+Z;FStcw@PSC?7E*hQXRHj-zS zFICoDsVUSgYRD7syb~&_0y=d2J)dFR93+y(N zgPM1Vc1qPk4YS)}cUT-X4JOfEQ==A?yc*=K67y=VXfA3NHA3AY(NBI{MHYkrL-m5d z_ztbachpF{Rxk0|T@tT0Nxa&+Agq$-e)E+5!m^4yxl$mt4mR#pq_#>^@#!)f`|JI6 z4ni!~s!^ty|7ls<#I#(kqo}D9$FCaS<5C@m>!h()S83uzZMjy3+EUbNM8AspHSlY- zc2%_|9#H(W%ugl1p!n5gezn@Js>FT6{3ztCPwbv0%XygB=MZBeTgh+5{3O)lD5!D4Z3n|6_RZt~JNm0m3n!{o=RUAXod zj32(681LGQ7rA$nm+Cp+rd^C1+~lQkDwCYYx|t`BYtF{dPX6P4dj)540Na3Vj2k#h zC*wdD<4v5!%Q(=-xSO*KG7e;VM|Hu<^qtjLtW57PzhGthPV<{qrgyZzV`ciz_V=tz z@9_VNmFYYEzpyfWbETy3sFd{0MoHgcl=RIllD?xw(l_@?`i@>n->Q@JPMxH0ZItxR zMoHh=CFz}AlD>6F(mRKs*9B;k?rz$wJ4pBF4pDm(_#W^U@V$qqLwA_A=#J1<-3Z;Q zJ4zkx;QPS4z`KvqHp3`wH;mB^!#Le%I8NJ|!1sW+fbTs{I}H=G%W#4^4U@FnaFTYm zgYN_H0`ERaUB)TuHioFj7@_yDq-OajgPZA1~C`JD%n&@qz`n3`t(w zK}an_9U+0Mvj26Cix)Uf@!!q-X#QZK7B;nbUKNmVhajPDRl*8(Pu8kz^8r<1{%+NU z`GcxU^8$HxSs=f&Ou5VRJGjO91?|%gfx9w)h-lwCTw$&(lBbr5osh%3{(Y@#r)h&9 z0pC%=N5FL@{1A8rpq*~C%czKL8y}-Rd z2Xu~Kx52-@*7{}J4Yj8Y?X*44nL2 zHqr@QCv;Ar6WEEkPQ-NqyMRuh6W9&x26jT%`LGTC^))9i_h*aR=R$j~CpeP_=mEN* zcR}w4x`7^`2iODb0lLthD{O;*ea$m%BVOpd(DecPfL_FT5!Vas1@-~^fc?OJpcgvt zj1B(vwZ636uaes9Lp#2wIMV=d0O*6x2b~}22Mz!Sfct^_fj+e3%h}*xUmG}MBZJTl zLN^Q?1`Z-_5OG7mA>c4@7td;yb|%J0`p1u=`*a81E$bzYyY3lW%xubij{E@g$3fUtruz$P|l5%_jT7FPh{!RoO_mfvd*H4?LE&$E1k=4>O& z<83cAk`B>+mgO02ztYIs_xfLGB>P4CV=T|0|IJ30$5V+sKfJtBS+If1pKb-_!d-BWrh{S8BIUcX#7q)P_ig&1?o~SO2j34q1b*OE zI$?O7K4f^4o-q6|ec138op6KqfcJv;zeOhvZ_`H%e@0Ilen5G{GM)5;?*|_OKd?-v zjH@(ge20dNKcZpdkLifmKX)a&o|W8Y{|-GS^>50J_#VXfBEJ8}G-CWq8a4hEjTzsi z)5f3Dh#!1E_z?JkpVGMbZ)w8(9zA9LJ33?j8I8Nad%%0a`+r80=4*7;{0}r`{*b24 zAJL>Ad_VXQ_<@fotJgbmOns;9p-m{+j%3mdvvz z(9Q&E;R)ag-~@CN&^-iv2zUZ`0{AfSVc-PXnfR^^{`IvJe`+I>&`m;j5_l3giMUC` zJpz0LcoKLL$OC!cBy^MCx52-@c5+4T&ldB%DYQ3*S{MR`fK$*+_>Lze(1fTw_`fHS}uU>xnl-?zcPzIN*GZ6pa@61o&H1xz9?iMUzd zEHDL30n@-VFbQ4qA8qijuciJ;?$1{9Tvki*pLwySj#hO|&KPIe(PO*2NG!GVQYJ|PBr1`K|Q{(I%B+ZM(nwn(qkgey(lYAVZjC-duD)l}C0|Hu6MRyFlkJwLxyP5ouh z-<6vBcy+Zf_P>P{fe6d&_ZvJ3pMO3dgU|1Q&+mcHyazNyINim@UuQ<;5bq$K!N4U04UPx5e}MOm3jN z8zep#a)#5h-DAN>G@TNc@zS%IxqP%cnLIt)mC2@0MZ@Na!F-T7^U;`fB>bsF9*RVY@QE0YZ8=fi?e6#UF2aXGd@9Y0MFtkhpNc5% zFqYvHDQv)rrIX3@8P>7u6h05(qi3Vx*wwmhqt|iXQKAxjZ~tmuxJ*VkvFfkN({54hm;#ZkA_T6t*d0JDj#B2}QGP zbIG=K;(UD{kaC9S(0-B5rHRg|oqYIKB-n9vZf7MC>Jo zh;1C%raBu9&xt#1wyDE5vjntxyknC|!uFbE=EMy;8@EN7O}y@9w(VWhZS?RKdpY=n z4;}sDefQq8S=sLFjJ*cf6iO`1!1DcpSCxsDoTiM5TK4W&^ve8B!6{kC=65WYR_4mQ zk8Q*JuVOK+=#}}Mg7|q_JQTm8m(Aidgte{sr{wvdg5`Rp{|bHqT9o)! z`NY?zLV$bmVKycI#ut=3D!^BI1WF0cyYXRqWq!Oduaft@ZtQPWbY&grm9^|UutNxl zu(7W+BT;EDEzXtk)64c@*{0|}2}llQAFN*4Co9L7r6+7&#@gJ_CbmNME-O&?3B2mQ zd8^pzH;}H272|IOD`!Ti$=55IufX4_!8A0fcBIz&R8@m@4Yc$prK|yf6UbWbZEGq{$F-yQIy{+7B?U-aps0q4PAQzJGFRgo1z_a+1faKpB~y9RbMcH$NPXiypVyAOvF zu~2mU1G3XZZ<&I%?9w^Y=AA-3~)j_=LUqSKKq<|G4QaDpMmpMyFSS3b@y;$Y$EO(4-Joo zVo6_kJ!KN@=bcHjQCHKSu8U;B6x z@qEFthhhVh_xZ%XfiOpp?>>)jWaQiKcw%JIpA5~9a^Xv`m)zs?mvY0aLKu&aM5E&B zW?yuVKRg;3i@xn%LXYml!Ed@JVk62l@OAgZcxc=gd>CJ3WXOl{4;_93 zztH?Jeq3OFRfyRG`HK4?|70kf7{iZ1N7MVVnwOF|{-NaSkcNk1tpNHUVYRjnhmUl3 zv}^fGJH7>KQbdl+-6-!?79tC(^G~#0ShNBv3XN<=P3T^sW(P>mB&N+9THYqqKGoQF zumYMvE|9M0;G-+P^USg3|Mc&7Y^1zP9~x01KKVBCVM{pnOR>f8#C$$wB<>rU7zz5q zUG3fB)$N|nwvIL7H6ee#;ar!_Jv@?J=Wk!#wx(^U&9l0z(;sN>UfZ!Y*xuO@2n==k zht_mDI76-}U==wR z8fP3N1Dil~^tX?;R zCT2Ks%XEf4eTk(Q%dlrI8H;pU^*HEG+lNzZ0%up#@J$10{PDy7Q>?9^q(j$f`h zTQQxgs7RGm*ixk_d%EUq$#kluBvpn_d8#y>F_&Frtc01$%2LIt^0YZ^nqFZtrEX0X zr=QU28`65c3*%O1^d^O+27Xz{Y=wk+X47Z%l7iu|jJ`yf0g~RpYz9Ijvl%n`GKEgi zjNYtxfut{Dwjx3kvzapbN+reH^~#LeUd(L8)UIPTUA+AK<&1H8Lnf19ndK|Yv?zMY zSmnr6Rhy~5Wi>05le%ih$?9qs?Pp9jVQZ@uwkC%o7_6}}S zwZ@}V)p(SQxkK1=bP5}Dmm|})j@#7S(W%tbbSjz3b;4%ddSO%98>W%EzJ#Fcgro%t!r|ndsmzM~AxmoDTmkWJmmC#q(guc=t^cI)UTbhL4(jxR#9-*)55c;Yv zp|4pl^fg|gujv!|+ATs~+b{IB143WAOXy4Y3VrE;Q>hGl>d_RwOT=;dQXxBYIn7qM z=({gtCCiKj?_c8Q`xQ3TF2<_gDqRLn0GkVFC;x8p*R616jMe6DWlO*T4Yq1 zzWS|Y$Ws}UQM4J2uBS{UM`5v1@oH^SR+}W+Qd(O{R$GZ^o6*|Jvf9c-8#Ac=nX}r= zYJXZ=Wma3I+Mm{@&uY`_9fbzu18T#BMcDFe=_6w~8}AUT*LD+@BW z!_nt<(FHx1<2iD2L0;r{WI@mUj^@az1$mL*$pt<4JCh@yUywDwN#sj9R(kf^HwO!J z)m6YK&_=QqiIQZXi{v(4brZ?J7LxDQReMMVl7Fzc(nkLA;((3(gU*DF{Nv8Fjr@b% z7j5Jp?|#lk{=pqzvyp#%#~U{CS1N>mvO@SP&B8y~Ec}%X!aun|_$&K_e{!Gj4;X}h z%pm*&cHtkh3;)0x;U8Ng`~%yBe{7rZ50?u6M5*u(uN3}?mBK%Kr|?hQDg48`g@0nV z@Q+x8|D;9uM_PsdWUKIxY!d#Hn}mPlpzxnO2!D%7GFT+ZXep73EM<~jhAcytA?lvX># zl1GLtLzW@SVd)8m>)B{Fn+tILu}a5@$`vltt!B)1D}>G{=IbYP)eWiYS|wdu>$*_w zQqnFLuD0k~<&1R@Rx;N4tlIw7>S`s89H+Kp85Mo2l(BW`%1Rg0N8iFNol>vB@|3{J z8G*%gO>`7SraB8_Q(c9Lsda^uQz>@lQi`3uBaj>Q01ZGR&>RLj0-eA*U>&d?SPyi-&#^Ir_PN=* zyVUVy}33koEX5cE|D&XzF+ks8+Yq~Fj_PNO(W_tf^ENK)P z&za(OJB{P0v)9h7Tpy+W&Nz44sh^|W0Xtj8^>OOwOn1Ug@sI9E+u0plpQL`y?6_#B zeo__a=X>Wb-%?F$C0xg;sdsvmW2^*g3F_~Bh3GHU+-s+P&NqvGQX2wxihq8Ch@aY* zwo}~m`$XJF40ZM%{738}{%O02|Hv8<|MVIW|B-DX z{^@NZ{#0q5-OJ+_7>@lV;=2~&_r@3EpDPvdr&fyi=T?gNQ+JB^=k65or*@0@=XQ(u zX-l2Gm&Y$Q(C=_dnqYO8L|vnhAh7#?X`bZ z+GqcowBP>c(gXI_q`fj^8L|vneoZ=H|E6@%{w?W{{X3G+en~nYLzW@SkmXC#pmRp@ zJKvB3&cBj|oZpuQWymsQ8M6F+Dd_xx6mtHJ6n6ed8g~9f3d)dW$TDR4ClclMEx5L~ zASZ7HZUt_^wY&w_^4-9@fm?xFf%gFK0dB#yz2%Q1XrG(i`l$%(hhIPZwga~V`(f7) zyKTU2!0o{8z6AmR;vFM{^D*}z{%SP*_e_=SOCU=Vge*oA;0U>Fz%4g-gQLHGrK z7(x5oZ1`{0ab>UjDW~HG5zp=Yv<49C`YZSV<#OCW($@Ptx9_4kTCDrIeICBRD3|Z0 zxmfT5p4)M=(M5h*ZXcw%QSbzw+k-Tx<>w2zxjmOZXt_O?M`*d7`pM=KT5iwf6<5pc z)ZfkUjcet0ihnbFd8C+w96g}riC*lQd=U>Ej*HNrlyP1pxQ!ane*u-7}4344vhM_Pq_WRtLu92EAE31J_39QN1Z>{f7eD>%9p9Nn6Sv)jSZ?cnHkaCCbf z&h7+9cY>ok!O@+0ID0)fdObLLJve%O9?sqfj@}54-UyD~cqL~SoZD!=fZJ6knb~o| zY<3k^7?l&)&zLK@{xPHS8+|H@&Vi%t9j17-3#1%5jV%`p0~HpsN6gQZjR}P zW$|;)-@(s2(-;Twa})SE@p51}GjKG~e9=<4f685WXv$L>E%xeKz1WZhBlaLlFP_!OJ? zxba!(aufJCT_c>2?^F3WaH)Kp&#_s2n)CC6DnAF_5I@i2!cFN`7&QoMMvZV_+Ay{5esH}0(Dd^6r% zXJ^&#zYYBXiWlD@_)~nO*UnnFevsnDkM9J(qcsZR5r5?T6z4Jc>AJngm3N?7}{_rp~?&*GEsmzv?-E*c;>8{n@e-vXUSg$)m$*~(pV%$z6QjL$?fw|&k7nYJB7WDPh)>@$aZ1FSw1{|< zts>s!CJ}G)polj)A^M+u9QKx!w84^=dMziVjh53AUFSWx&b>19GW0U^@@c8hdPdr0 zJtu9po|o>jPD_0^YG7l@XdPg z&3f?7`aJxz5qz@|e6taJv++v)Nx8&}^}iQ6q!-u=^kTj5#d?1Oa09Rx*bCeU+z9ky z{qH>yLHpcn?_(+t$hybVhd6!6A)A4lfqn4ngWo3LCg5h^X5d}GyMTR&)AvUaw9n0M zenO3xRj=Uu6ghXC4E_Z zi*gBg7_Faa?Zk5l_?Wha;<*G9pthFcxdi;nMfQ2tRM|Xzp^r zF&8Rarqt4izr|#H_I%6y9jIF3MwqC97 zxD*bgYe?xtUGaiW)FIWnBIhrtLv}u^wtsbTdl%WEecg=Oox zVp<`7l+o#wIPL3nnw^*I3a~6lo1p_o!oG>P60nrG60kfauz=b5J7UVw2 z1r?BWPoWRuQqSg@LDJEL*Y`mZ_I0*zz>y6;l3}gfdX%cOFgV| zDc}upsYg{V1zakZ;8bOl5>DXP6Jt#hsjQJ-@i1~;5ix>Cia=kd`ZfY$uPPItE^^(+I ze21uo#YfT_*E>S}#E)OJQ~dbT&)HcI=YU74pZKd^SWFe(X^ZKlVuxKlZYSpD67G=cV{gQKQV_m|Kr5#80R-$%I;yOzal%6Qd%2 z;*^M=_>733_=bo-Y3a3ZvKRF@dPd+Zrqpz&r@S; zL`|^~HN{5M6dSLsF)GbJl}4I>CPkWmE*){O^(|?%^&g~I>o29T z)_0_+3|WROLzds+HOiaiT0BQ*anui`MwrD>Z?;BwHIB+_hse#mb_iT*?U2_9k(+sq z5O_m1LgZ#%BLoJ)*_ToyRB#VKL2ix!BR~cI3ON4=a0D0uMu3Nbhk*+20Vr=q&^|XC z`TGcq!Y_*490SIHQP@RcHwqjD#(**47;p?2g=)d5%^@t&!>vJV!^>T4NrLki((lSK-jft8nNe z8i%GeG;s(mM^mjan@e0RM_(_urug|d#sj&iLW zP4Ta%?n3cDh{I?(`VWn}-z=9Q?!OA$KgCk`|IEixb4-2o`REvYbPjxU4$Pke+B4e@ zdY~E`{y!z(PzK{0f7Ts)KULc$Rewaq|9C&cpGZ%s?Rr)JqAIK60^nh7#}GYS>QJ9@ zbhNj-J#G)4!*et#$z*(doh(C8CIarEu~B)aKNuQ|@rO{yM&lF7kQ|Lpj<&`VW0BBM z(vcV&YF+C_1V?+Pr>(=WHqhnq_}jWWIy>9jy8~SvYrFkxyMm#%?zWEA!J*ZwhgSOo z-GMdDtbZhi=fu}JXh8k_ed;hAO>%TBHXKU$;{K#U53u^i$9zcz54a}cqvO6If6O;N z5s!~0lCT&$>>nP94Wk{;wT1!{!@iMNc+7`~Nqv4impV4?!#Kku(a^X&o)}s*axN@k zxC=)x?;h^&V8cPTE^ZdTXA%2Cj=om4e>O)?&yP}^emmcp|I^F&ikLS?zeCl>bM#?V zAI#BDXnHXKF43QJsy>xt|BR~tZ#UnW|MO*4e=;Y|w^Y5Se%_St{6y9NE+@{rs=h17 z{ykM6y@T(}|0&=Hx`>(Nc@^KOQ1vtFbGTY-gQ{0@;&iL}ALQ8It?Em>S_FZIRDJNC z#rg+T{qh|9<6QrnOlq+?drj3}$kG3ss(0k*SL#Udn@o{axCcNP6Dh56&( zcNP8PSJD6QD*9htMgQBY=%*-w=}au;Tl|fSpwIWb^=jzzJ#QT#J-hOGYvzuPB}2@O zXUg%kI&%-lCftgDTw&^8(kBzlofr%HlYZt-hK?k;H9dh2btsw`L*fYrk#?9nFh0)Q z;`JBqp>g4%iu8;+p3NT_ofw4;GAX?ifYBrJNq+#Spc}C#koh7bNqCIJ*bp9EABo|` z7vZsJbnGGO$rlLtVq-p@54$^Fyv-xzYwK$7#Q(+39!r}C_75q@u)aVDso2-Z7+1EZK=( zhO$(rKqx`)k+=a*e!c{K0e>Pf5?WfnS?_`1?@(CMp!r)HCWnTW?!@0@F+P;=Cx?_J zdE*+U_eAiq;$YF6B9`o*2n|i}7i%op6ECP4fp-$ilW3ID`z7KN{LLKmCrXZvp9eX* zrT)T88s*^+etmu3#g{k8FH zcpH3aYSa9+b-RYzx?Ss!Py3nxY{Z9jntw_i;5=s8(ylj0rezWNYrJB9J&OONLfia% zv4kj$wSC|ke8`ti&p>S+*4{@IhmJOlmuSb&EYz$ANIK2`J#`+}-k+t}spfpF+CJen zd=~k0cEgzFpL@J=<(=@))lpo{pZ2r2Tw6aHb9H2Yz5bJ`zlWT%HqBqGI~N+XzBD|B zChfOhu);Hh8-z5Y9#}e#EAi2ezjppV2Ay{7g@^e2UZ23Hvz6G_`qk7Q$4)-~vzvs) MW7>(XI^_EQFT~F|i2wiq literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=192-N=4096-with_silu=False-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co deleted file mode 100755 index 57f2ee070cff88f86694ee7e4a1e183982cb869b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20408 zcmeHPYj7Lab-n~Zh+Odj(gKaQ0TSXvCb1wzN+d0_5JkzfOv#ii$&yS71PFjj3It&g zq-9s8RuC=As!Ym?<0#IfjuYEST|05oq;ZpotkX#|ZIYHrCbip)>;7n`nJH(S>9mth zTch8-cb5Pm&=Mt2r0j zQQpdB6|?>5SY@BmKDZL&TqJV2vXj*x^?TpP_3tZ&Dk|2QW2X3N&~PLDsr{4Lq?{xb6_Koi?JNW*aQmK~d-@L&a`yGWSV)=( zcus^9v2b+anRiL$tDeAcJmDW2pNu7cbm=8eATd1gnVHu;flw$Bo|y1Y!~;X&mxrU{ z!9esC&v+y<5l)hqJQE|2g^OPGOa-EoVgHGdSm-rw;tx(nBH_e!0uLi*8i)-~28P35zCRw0?dWsv zcw<@Rp9&`?M#f{8HRRi#vA{`xBpOKi9~n=aPzFG|)Zr$G1;)bfH29A;w~dX5{Vfwo z#Pf&7ABpu(J>aMR^+yd( zlf2@Ycyuf{9(||rhWqoJ2z}c#85@a=C&t+D{h9F>e8V$25uWge9>tFs8S-Q7Lnq$C z*;yXMj|&WMa4~xrU-di^m2t>{MYTl)&{#me( zZVkk3kf|=}Us|{6IoIN8S2#iQCjFzsqHS9I<=t-h@RJ+1#EBw)zcdo}4^57Q{B5nl zp{*Sq&0AYqL&4@Pk-WpWQKe^iB-z;>3Iv*5LuZ&R~m8Y4uGrRt7HZTPn|FG*{ww<7w8w5y++TL8ZwX# zL91Udp+N>Zsew+vP)}u`m&%3ug?1_feL!DTMZ$)>rq$#j|iZmmg}-$G{QwX3zySmrEg3-hzcmbtXW!u>4I z%V#Z?bI(H7&RFNHX)Cj_%GSBG)yi$GpOLL8>zu`Ekxdr#ihi}L#ru#LXRX$A&myLF z#x`e5+gLv~**2H9*?2#;&&sxxZO&@5%B(NhI%}~$FKbC^UW-0WkhPFaDa&k;jx5l7 zD{DqO`73l3BY>T2R`HNl7tn*N_rT zy!_&PTEC$-olcYVhAIPH0AA6XoN1HAK>J&7t&(Om7Uzt`;->rQA{)20Ik;_+)0uWw zGu!eShg5ELNNJ&(+f=)`jZo`M*VZ$e$_BSoX?08Kl6r1azlqzFG&<9b9%f?{H%Uh8 zCMjL!;Wi#Gw<&9JrdwK>jj7EmnXFzZZD{2-ty{Q_q1~Bo?_@UCyS7MH>lP_p*~x7> zw{e@wE@!&S$7~$iw@D7`HYu(1aT{MZx6$=D(>=SGjq9Fn$z`=jY4VAyG#hRpm#!Mg zGgk}GmduuB_+^R+uW&Ir@PE$ zW4f|3=<1#BR3i>ljg#pN2IxH1?o>;yyP~d|=_)FqYpr#s+Uwo7HPkZQZMQ+!S?^AD zHM%!6)id3O4bb@--KidryGm?ix~eLjENeW|^nBk=s%}?*gC4(O9s3RX*|c7NnVxIC zTscb&1MjUTGqR7OZbp_V7R|_1UtYrXy2)%H`a5#v61Ud zUamK_a=odY>#f_k-sCceWW>ET5+bXl#DwY1!HeFVmPUqAXA|KE;Oz!kI(|ktAh55p>g|lgL?kd4SOp_ep$s>k3WN#f6sXk}G9%W#{$t4&BVWdL0$vIx5C>lqQ#PZji{o}5bjrxZ=F59SoyyHb1^$+d)x{dnB_q}DK{!#_^PgZb$ zse$_^8@Ru;o%<)ZbARaw_fH<-{=q`-A1mbkK|A-4*|~qPjr+&ixPNdD_mA!2{*hAd zpD5-2k&WCxv61^n?&ki9ySablAoouken_YnA&Zbj$l@WP(REm8avc>st^q-G z9Tyr!$RcDBvUptZx(0=2S5Rnig@nznh~O0=i;zXgVnmoDxX1Mdg8}pW45@UUt*mkv z-EP4ASH*RDK8Md~EVZ)5A*CD+_a%#4O1a&*+oEsdF;XkVN# zn97>-(*(a?)?7eeb1dJNUlQiu*UZc(ai8jZZ~RO*qA}@#dZ3Q9F6!`u3W0?{Jx~u^ z2V4i#k)=i5dI#E<7WL&0VuGIueionwXo8&yc4nX%XaQP)R-hGVf}d%#1MN$TmaPt+ z)3S1*4RLH^8j}O)0NUVZgP$E}2ReWb;2pp_fHuUj-Q__0(xT&c9K;DfC;X~`)j%ig zoUn5NUBGH!HLwO)19ZaAxx<0>rN!!dl<{QcVmIQrAJ&*^fwe$4{M_){2;2y)1=a%V zfOSAO;<)c~pnYkv_I?MchhIJXHUT#Q>tR<9y9Qtba1(G7@J`^Jze-2v(cJDJM$1e_jzm^;W_kQ)gZpzV-lDtieT zr0tkXLR1f=<2qGbZCB@aQuM()SmsD{Jm-sB>~tKbT-|nJX8I`Y@4Rcjo%VC8BWNd^ zm_APXIp2}6)A*5Yz8=NvFvMtS@MP z7b|#waznSB_H(g;_ako)+G+fY+j)HXNXky*UOd9%K2}(5??Qg)DfkcF>9-Kyk%{k% zX5ybKh9{;gDJpQ>oJpOcPwcW?!7ZjfUQ^a>?;&;b0@h_C} z_|qGC{0kd-{OP-S{0n#U_|pe@{0j$p{FJfU-p%3{7t-@kSec2xEt!cwYvl1$O+5Z= z6OW(T$>Y!NLL8h;O_k+-tln>^43t+-H1Fpn0?hd9+uAUW8tRUVKjIGhPw) z7@rsR8b2%CZ~VN_Cqfn>i;%_73;pI7gni~0h5hC)2@jZG7WzfVB4iP=__A=o{AJ;w z`45Cc=06l3G`}t!5Fv|@Mabgo!eRT@gd_H^3rFpLBs^sQmT*{vEJ79`i{BCk?B5oS z+5bd1ZvU>}w_g)) zyY~|gv@b30{*;6C!LJW~dx3j_eX#3;-5%f`;9lTf;Qhe+fqn4n`-}taON)DdU&#?! z>&||}=|@i958MyzhhIPZ_5t?+_XGC>9{@f8>_?pbFFMe^w7CBj2RQ)01MoWpJOn%d zy92O02s{Wp1Uv+M5cnYQ0Q?TT>OlL_;-Rl9BX;CCE&95?{G0oWY_9s?c+9tZk?e&7K72EOA! z`_kg^d1X9#*4=}MH;9}Z1O|bF@E?SK02lxUfkEI9a0obvc!S?}pnYjE_~#B1f?o)J z5nu!uf?Wu9VPF^-0Y-qsz+qqrexbi~pnYjE@>j~Zve*4Ir}rwky^pQ|_`3cYK0tGM zKb_wadRhc6JC%Ma7JnDYXb+Xv`e%J~7y?Sph~hkOf2Y_p*3v5_b0v5NVb;=T5a0?#gXiiO@eJZ=JcF2pOrJp*mY+cw z*fWSr$}@fjtWw2yP(&@;5xc9|~<_n2pe9ucw# zS%fUk3cYslO*{Ce9emUNoNx{Qbb)WWz&Bmso31?kvj%*#27I#ye6uDG|EvSwtOMVy z1K+H>mVeS*;=}sihaA!k><0R<-uEFHZU=4$b_2VCJAgZYKCJ(JXB}u?TI_yO;Q?85 zeGlUFAcyP%?gI9}uLpiRfjfb_fV+VA0Pg|zAWqNkInchexNA;{m$g=4{1iEa@l)Vh z`6(xxY=tgHJ8wM*=sE|mr$ORz2;gim(aMkvKG5u zE}`SQm9^USatZyNAH`Zu%_Z0HmH*>wx-%6O@-Gy=u{y^nTILuss3@Cb ztdym*%QCaeaExWW!ZB9L$!q;?mSZ&H{0=_D>yCq8Sv%*T{-ICvx?AXtx7N%-sK4|D z?k~N`{nx@tsDJR2+&}ml_g@P~q5hH2asS9Sx&K-?3-up;iTjU!5B|U99Hy}Wb;btN z85>Y%Y{*l0Y(kx}33bLM)ES#rt2?fR18Hhy4X-O+((pQ@QdeaB1$D?RFDUI_TiM=D z?aOFKR7)261owdcTZE?uP@M?ej7J?f7$RTsgXW~Y0w zrrK3dQJ5xAG48a>0B&@fVOj2lT$&~ChFqQ{_dqV7vgQigfg7+|C-5e@)N&qUTnf*PS*;UzlU!;!o|-=ym%?*jRs#jz6qkBb;Znex z;!=+*Tne~WF2!n_XO!9|@TO{;Pbjrb;F@cjIb4o$u`DjfIGMWsa`GBvHkYHdyKF8u zNM)?QNEg;`c&ZViGB_Q%2eKE%M=E3fiBiP;L;H&tcjt1wIOA?gEiz7Pk(}!#X@Bv3 zycQN8NvT}#B<&}D=CYl}kAM0_JLzH^@D%MQ{>Il?tutAHerS!7ai|N71MWhNPXGs` z{Uw!}WKyX~CXeubk`MEmTkGwH$B#YE&MpUlKhC^gB1Qj<&^ zS-Q->Lnh3>U)T91h+GS+Zn;_j4ul`ANC<1e&jVq7ix+w)D&H)DZ2917;8{dtU*n& z1~tW+JT=BT)D-JbQ>;TxvF_R$qtx)%!brnUh0%te3nv0KczLKY#5ki~adjq+Bx7R%AmOpg9YYJ~Ajyj!givhn3vtFqc5ax<$P z0@qqQWHmzMW>zBv-c*ecxtY}nfgy1AwbTeDJOhxBn@548Knea5IR6N61UL#D1)cz& z07`fUApOjN_NB$qzj2T#{G!OsF<=ZBgX8B1IK|;_(lKDf%c`v*x%z> z0?pBMp+ldyf#d3sd+5J4WY61BOI*Q`QA=FGkq2`)awx-*S&o)?t&!FrSdNY=wZ=Rg zA%{c9SK-jfRXFrXl|$1tH02O#j;6K7Y%Z}{j=oWDP2=a|7}v|uH2#g$MQHqd9OQaA zn#RA8Itq=SkE2{KN7MK>Qg@;8Ka9huIr_JayWc98Ia6FLZvyw1iHus}8Ppu*kN-YC z1|MGoA72B@*MR!Y`g_U_{yinzPzGZgd)FU(m*@-U*aQwFso=kTkRGSIKBZl!=p{u~ z#0=nNb>|R$JJ+eaD(Y-$@pwI6yvOLQmy*f&M5ic1kS2qkq46rWPK`Ci6XT=dp`?c-ivHOg{V_$~j^$qVmQeJKIr@~MAIQ;vO40Y{=wDIwcd(u1|9w}{M|13dr0D-k zTsCDpzf|-yIdT3)(R=ZPZNC;z1AFXiaX z8mjnp+Mc7|r|5IkMq95n>hPnl z6@7PHW7O8{-@MYO8IhNNz}Br^e``)V>>rVk+x@{XCINqo*W2R95V?+yPt|01-3*Hr zQJZtawyPns!cwoCxKx!LxCO`en&GtIhv)cF)e1lKf1_Wx@Wdv^{PYtNYjmNXpjn$s zFr1(t|5&phySPIBU?7ng39qHktWS5a4_~Zdxa>n4Q$s^*bzz?!nHWk0l0(uOd~ui4 z&w8*?;t%Ulut!cHVPzYdfcF z7~zQz|Hy{F=g8jed+yz_G9BNvM5jNx#AVYFF!%dSU!wUxYs(RtFI}6d{_6T$#X-fF zuD@wt_&1MiQ`hCF89i~1cI^}A%t-yc)` zat6?*__Apo`|3O``?y1#1OL_;_SNHOQ1o4jPW69(kTP2 zL)AamK3Cok|6CpIPW7kz*;}q{0FAjiYJa2tQ;NTrI%RFDe@9MmRaEgbeAIKF!?%nRBK?7YI7QH+uovRIXv3 zj`fw)ss6s5xS&#eCB7Bt7F6BBDel%#ld*m&o2nB)zDj#qyV|&n{%dwd8JDlvPp#ck z&?)Vy-T11&+G3w>*5O_7bM+u;s8Ujb*bNXDumrgA%NSVmod?cBbW@DEk zGZH(K|K3mDSFQ=1{afx}A~u&a{`pOICYbnRPa>F{TMW)c-gtN+l05Er9e;1jBwdW8 zGqF_ihCn{>%m*(?vx#6zq$TByT2*e9}znv1H)l zX^H;@W)*CePPa<2*v~u*>Da|!HnP>r?Jq&z^klZKt*FBlX=R>K) zr=C5u=(#ZSfoCxpn@y$Xm9HRee}|%c-!F^d4vV z{dmXoSny(GHl4yTV1wyB9au1ElALdfVGN+rn+F?cH5S*mI=g$c(qEBNPgE2g99RUG1~pwzeZZ?YkxQ zbK`{S^QrgW-9kke?M3g9s~9-#ek&Il_@NRE>r5&Gm8nCqVco?zun|afhV@p)f!&Pj z4eLI}fkQxFk+2>BUMGU9h{$V2`qrg2A~alHe`f9Xbiz7ZM1Q4nB}cx!N@PNEkh=Qc=5Vr#Zp1nipaG!LY`O?pR%mT7K>bFamXgwxhy_awj!66 z$>sR$mrct#bNLNI%80SNTrQROFPoQ*E7eA$e3x9h{H&mBSk~#>=y!BoXN(d#@QY&N zC}yZ54qZ-1qv#IF>2%R`AhYX@BWH7sO<_a^5 zFR3h#UKJ`_S1T&qY(HnL;;~iLJl1G)^`wNy05Z2nxi#5rpCczXss((>r!IM z>m1SYN=G!ObMY8gJ&)13T{(Ay5@R`7AGK6gM04c(tL%!XBCo7k$#+-vSIjHs<=T~N zWo7PVv)OI1ltqa-@R`9-B44XAyO(WMZll$#*o;QlYO37JwKg}kS1C41VRPBs%kCO? znP^jNWo0F@EYvKq@h}miDM!Agv}qtOtd@{JM4S4RoL+x}jfGyWULoeG&q~NO*~d_K zO_muNugT25uZ-LGnYn#m6}Ok$xV_xL?d2|RFLiT!X(P9nHeHj~$xGjiyRNSmlc&~} zNx7T#tAv!57>d4JWyfl#zrsyO%_pTOQ_TOIs&z80Q0~qdDwHw4f@AlR!Qd(_F+_>3 z=93a!A339e`wRy6OGcxsxYQ8!X+E0gqueKJK3$$qr*jqSab7VW3K#1GJHH8edQBV0 za*X4CjAM?xh;c0=Ih+0lfLs-HZavOPnOs|CV<)s;(_@@vqQgKl-ci82x49CBZ?420 z-L@;-SHK6h`3~LCwq4Qr3wU6g@6e5H+ZEkJ0iW9Dn(iWw8ygoh`Sr)GMM6y#@F>v6 zxLv4mFb;Gv-X_$z83#5p{-99P#5j=YJ4*WYt-XSlxC z%JqX*uJ3K;`oU(d?>)iwgD1GY_Z-&`o`c@xq83v=U%vK1n_HDcWM6rcENa2rhz))AU(_ zYu8{jn~QM$5$d`^%iX&DM%3$aZZq&2|EwT1$U;SQ8Fk}1p*p%;U5%?P@d+Nu%Ngz? zr1e$R|E?fJmjyxddzl~UJC-SII%Qw)<5((l+_%oLh+Pxq#kEVV#i1o{@$6DZaePT8 z*H>losOZ$Z6(pAHPTNctwUBvj0-9?}3a^$I1;==Q+;qwi><&!Pm3tpPXkAm;Z z<6YoIj0?ZJM9@K5SW?lMW6_@A&;#+4*%vx&5<~9K<9w27FY{( zAkKj}C(sG31=a%Z2Hp*HK`tsYnpbV28Wt`1lStOwQu z?*ZNebRm!HcpU!Cjrs@T#0{Mrx&~ka(2Y1Z;tl{002_b}z=ObpKsR*mhvM*WZZtfs z_Gg!SK_l`t#su48;9+1R^o`IT0v-Y$1|A093%nQDhm`*}4vuGnW-dlPkDCu_&w)9)mQ z6?>GmGto2ZWcmH4LQZm@Vo$JkCQhZDtet=b?fmsKYn7DEac~{WOTX+sf0-CDZ(!|B zS$KPaL-kJ9&eS2^PN1*f$?{M2@%(|aAt%c_b(ZHH)(g&V)Q4k5|DwD6bL6ks&fhz; zoqt@<^A9_D{&6SIKitXlk9YF?!zX$E@sm9Nh)HnvD*69Scloc7zj{0W(dc&mNfXaM za)9TbJizmh+|Tn*-p})ooZ65x$^V6p9fwL`JAYSVJO8wm=O1n6`KOzC z{?QXW|MUr-fAk#BKYb4QO)~8>Ez^F}ReIcXow7Q540Y5e!Y;xt!Y*E?1E#0w3DeVb z(DV#_z_daKL~s#Y1Q%E6kmWi0pydV~w!Amvq|xA(iZ_bV>vl z!9{R!m9pCI!?o>0P3{Nw1AVxbeYlqUfPKJzU_bCU@Ho(iYuk4v4*%vx|C4bt0Nns| zgTO)H0OAG^cLI0=*HfR!@s#P{*Kz8UFO{r$TxwSJPDixPC!2a z{aN5y;3RMo_$csE-~{qb{3H(l=Eme-#K{zNQ_xKVr-4(5n?l?<;5pzla2hB9CEyfv zQ$LHtzqv8JruHj;-p^{fPp$0(Y!1NZ^*22auv(5E57>O2&G(hs?q~OCKJQoJcX?hQ ztd`@)1~#u}&jm_tA7l4Y{(PX+_6c@xCy33a!&2HSc% zy0AFY6YB0b(i1$=Jril`Y3pd83AeY0+k>H=Q0F1yk0r5A-|J!>^7{wWZnzp-5~<`| zBrPojvr)E!C}mPoHi{L-^ul~b3I~%?W^rL5mChm}d?7d&OU}WMg~do{aZZXQXHybZ z^Q0ga22&Xc{hW;@BAJ#lq=bB4V4qdve^FpxRP712*ps(Ct?ZYO-xS1uTeZ9E z@`ADZmTLcZJhSI-C~DYBNK--lKPdJuatjJ>|0`Ao?u*>{ZIiOst=e4$_Oxm*Fq3z0 zVc)!ky&mfUgnW@J%ya%0_E&FV|L0rSZCJzDEq~K3?7myr|CZSWBYOzkv5mpL+xo8z z`)=#M?=U;LdHt7olBsNjc(BNg<#XbhOD=k%!Az8>|Cv3TCZ2R^CYTKpPd0KXt3u?$&n6yECSpNRLHPP^3@_m+q9io~8et-O!xFFbDAWB~`SnNHJJ?qnv@S^hC%$}8a{VWD`7WckHg0R;|78aHF zZnjR80-JIk6xjB9g!k0T+q|104V)e6JNe-89R=*SB&?jXsf|*0fral4ey&Y`)cgfh z(XxB8rgy3HBn{82IyUcOZK3iin>HU~uV?;OE|=2uvZ~h*KWQtQ=GXM9S(zTN+?rpT zr)gNI*ZQyF-O!?xZ|Svpp@!PLP-{=Q{b>Yv5FcjK+LzT1Zed<5+w~Rj7I3E5=B-=v z9qoNsZJ;z=K?gcjE&C42)~8|}M{Rzf&3m=+!%~H^9m9WS)AS3fUVA_GIjjKYZw1Zc zx5Waz_CBrJeqnqeABDcq#@f@`XZ!hEVQdi2LK}})f;-6UxkXJgW(L@N3n9wLQjCnsZ>dn7PCXaduNCa^we0{4R^Xn4>BEf1QY z>p>Iv@0sALcFOegP+j3E^LF?-NjZTwq5{7t>IdGbOnP=k%EqRqW2u}pnRqr9mDq{+ z=K=Dea!TXm-EfDJiK&#~_wKMWLGZV|$xv!)HZ&D``-z!Y>O{bO;)4y7bUBvECeo>! z0{O^09eP%pOonpOGwIAFwF9h49WH`YXgc=EL20bDV>%s^eAyiGNzwE(so>?a68j5I zD%c{OZIKd*A9`mpiOZo}Y@?5xUxB>i&2C(V`##_#JCjHz`ISlD@@Ai#4yTi!G~Rc6 z-b>MsytApqWI8jgbYI%+f64c}v)Ncyiav)gk%&m>d*srOaCA00@n?VW0cVB1c+dMx z=yGf_lg9Y7f%JbzEhsq*eJu9_u&GF@3BW$6G)+xYlh1ay`?SKN4Q+{9%}|_*HdZgI zOHsYl`^LZ3@BnsuGneX#UoFIbZYHD5PAJjpy;g4-Y9BlH1#*!Pk4dp5rS%M3Ul(87VTGh)Qjt!)>9quC`b>7VYQ?`;=~Y zrxLj%UC~guwIkfs(b64i@9_CLyIMP1yIVWDecdhHk;zar)Ey0n+YfKo@c=glRo|+5 z|JyE9gwa;=9$CO3X!pDA;{Gpfa*2LbX;09vvOdW`D~Mb_9c7XuITKteJO9ZbDjM)`KsN1?G^CS1#wBti%N_ri%WS? zz-j3tzvSZ${ zBsxS{X-gIt?c!^4DVbj>MVnUeQgG|MebJyJtL&ONr4v>GL>ZfF)r-|;x>7=(Um@fx zE7ljyiS_<$&l_rZY)vhXHCWwwt6hoRCDz7w?Wv9D zsh!8z9Xy7*+o_uYYRgoHU zW7$GpUM{;@xmY=0z4&}(rDxt`^61T#abgO7s@Ln}=W9%!d25ZwU@<8+g8?>sjc49v z^-x=lVxtr`x79Q6v3n{-t75CH)XB17pJU@;B(+AZO+DHySK8Ds=JooUY%KJ0^&&Bi ze_Bo!WIsdQf-EyMEXd5htCHJynYevd4YyZWxxLE4?Nx4WukdhtMFY22G%m=iL!J-!4Zj${Y0Dr`LOK84T{y3VqzK`DlTUa-XdEbOk=0&RtrD^NRUU>{%b! zc}>VmD`i*97W3rVGLg|1d0|;!E|liUl5#u^vAxJ|G-7zer;Zx!qw#(@otpAzhij02gz zy`sm#^!|z=2h+Fj4Lg|Lzc=Gx`u6U52h;nzZ#tO1{meHVOz%JQU0k3)`?$-@^}S}U z?>fl!y$89z>p0i<9_RY5^IYG1p6h*OTz{;L>wQkHKj!3mUkBG8>)?9dX|6wZn(I4_ zT;F5l`p*4a-?N|VJ0Ioxo=3U9^BmXroa6c<7OwBJaQ%@cuJ3E&`XeW~zV9U0AGyHw zeHWlNx~bV%N2`qm8Y8Amna1mVWqz-czb()V*m-#4t*nEsSL~s#Y1Q(A{x5ZEQT6$=mrI+rr z^ij76E`p2TVjuNb`ssel0IjzK=mE2rUKXn3^Qa{^gxdIgZ7r_0WPw@XOPE$C-FvXGhU zB!<82D)~g0CtqDAZamMM{$Z_b&+QF<1iYev9|hl4z&pW97#Dtdj!jf$VGeD5RjIQT zH|Wa03JWVaT&E`g2Opc9g0&i04K$JVHItFek=xhIz-nMMa1U?~&_vePO#58$udh`f zaFH75YM`qH)&gq~SA#eU&;qOl)&d^_J_M|RuI3RJ{OfDA%`RTk3TmMhd8{G9Y6seZ zR_LtI*?=~n9cTxNKoMv~9&4Km{`EEcVHa^g=YY-ybO9ZRb0E$MbOK#K7w}==!$1dg zjxV|3Ute>5S?y0jEp{W1J0@7`fOSAObZ+SO0`~&zfOWuqzd(1^V z(0QP%2i5~Ui1Q$BKX5;=9#{`N06YNnK<9bf1^@b5{S#_`wz(HHAYVg5upR;)0yaS3 z0R2JWLEs_aA>bpxM}Q5;*KpPa|N7dYCtaiwx<=@{KrgTnagB&;0yY7?KrgTv*bHoh zuJI`s{OfC8N$pqRy~yy|Z5<}sGDAFr`B8iOQS-VM7$UNVfban9fb6=c-#|| z`0egthY*MBAi#8plNrZV7-Tx!JII&78weR@esG!bMuO)(_;D{8XLca#SD-?0YWH$t z`1})0F)p(Hj8;@TSw8}M>zu@<*rTky(Y-z=YbVg%<0OX^dz`g1+CAiC`2%OdPVyzi zo@DKep2;{_J3%wr`Qz&=Wt7c(Uc!7~?k8OruMmp41Z!{H%-ahdtaGw<#t-s#g2#KD zEdTg%o|}Yz&-1(kWrDK{_2F2_Pjy#*g#6W;`Hw_5^N*DA`~yy&f5gf24|MSS zBON^dz-gX;GmwyYffKU%Q$AXnZsOn33ln+|Tom?dSOiALaSS9_9H5 z&++_Y=Xm}hi{Lz}viuf3Yk89fMQ{;Z1Q*|=!8R~}dfxV3 zI%fMmeaiMB9TmYva1mVmkdE6vq8Ds`KqqW}NG01c9T&kxa1mTwrmVL6ac%oilY4+Y zKtHZ!Kd$BDz~jIkU=Q#F@C4A0YukU-1^@b5&kHWn3tcaCeZW3oFXDO;cM^CK*az$b zJ_dXY*b80nue#u0U+Y^^Yed1kvmbf-QIiLN1HgXh`k^}oJOvy84genqJ`U_hp8l`7 z;9p-G_;nWvKo@{+5I6`7ATEHo)4A z$wR;)U=X?>bZ3BPfJ49`;91~VU=VqN-*UmfzBcrhiwr|I4BZHD1UQViVZ@ySo&$~m zM}SWPp9Bs=H~fwZ{`Iwy_tgGuGw&WnzERZVG2j?*6#7x<&jZf`$ADwNr+`lZN0D#z zdoK9b*T#O|MaH2Uhi(Em0USr%IN~km|Dg;1^|gr=wO@tvepb`{ zYHjaja{xZCzvFp;)$)FJkLUA!rM3syJ(|z^mH2I*7YM86_~C@j>)CUGQrm~wy_7#6 zD7Af*-5dFHf>PVZ**$Ih=Y`_h&iYgM{Gio#)?VRrgjU;GJB7~^T5T_WuDDxmXYD=6 z^TxeuJIntd&m;G$?JWO;Jg?lVwzK^I8_zRZZU6uFT=SKR3c2dP=2v7QFSFm)aAkh+ z`C<&dI0wEs2UhSCx3b+PtVNe zV$I3q<>{uGO#10qBj z*IDgt(MVfcq%9Qg4tE?RfkX=H6ppxAhXR3KwHxk+=43iG70XC7pi57 zrfb;B$ty+nr&Rm*@vL6B&8YUbi|kia`(G8=Us3HpF0y}1wU0WLosEAVsrJ7sivLs9 z?y1`dRd)VHwf}R|R{OeYZ!C)cPsRROo_~)`O|x)H})7 z(&CfaH5==*W+@PLqMiF)yW&mFPD^ZY$POiJZQ0Hw;aG+(>)EkRIZ9C}9Li)8v7NM8 zuqa1a)3Sr^DobcCM*sLsS9v&!109qO>a zF9B^1ch_W+uvI=Yv&zz_jnksYrX2So+s=;Z4tjWxRWwrX`N89-Po3CO*8azYm47yM zQpz!~cwOcj+5}5&p@=G4cF))J+PqG~i>i*z>sVW;$jYY8_jZH*JfBZ%dTm~(A$~Gf zHqEc;RkJevV7WE_MRgviVX_<(5#RxQm`!Vc;|r=C zmEkMg{DlPj&G<0AHZR_oPZh5>)^tT3=uoxnJ1AR^igg^dd4o2u*2d4Ih8+Vko2H*p z_1gO58~FXLa4Tvazg-sTwe`ws`^E9aoP`(LSbJLgY`<_Tjt#+CY-92FtG}%3TbQU| z)AZd%$u+7Wn@0)-I&Js;eSFWV4lEmQ{BpT*dT8hW3$STp&lUXtkLx-9Mxhb=TDzM0 TW$bLH53v&y+dmaKi}n8vs$-Lg diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=False-STAGES=3.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=False-STAGES=3.co index 312e8cb4ba0e7685cf17477aa117f96870721829..b9240e57059ad304c39e53e1c685c67f345389aa 100755 GIT binary patch delta 72 zcmdmywIgf8QFe_)BV$8T!(>APV+*syBqK|66Y~@!Gn1sGWQ)XPQ!_K;G$TVpb4%li cjUt;FIbN$WnOID|pr<=IK#yg!yq+XG0607qVgLXD delta 72 zcmdmywIgf8QFaZBl*A-M(z)c^nh diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=True-STAGES=3.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_down_loopn-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=128-N=4096-BLOCK_TILE_SIZE_M=16-BLOCK_TILE_SIZE_N=16-fp8_ptpc=True-BLOCK_N=1024-atomic_write=True-STAGES=3.co deleted file mode 100755 index 25a74d4649864049483ce5c28db5e71216064bd5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13944 zcmeHOU2Gf25ndk2BPBEXgOkXP9iJV=p)Kh|()yzjq>Q4pNh3FInxa1mdOaSGKM{FH z+#O}PMU{)J$U)*LwI2!x1=6BGniQ#$82v$lq7vIa7ij9221pC{p^tsYLyHzD!p_|7 zQWSNJ&8R@qmMa`*cXsC6pV{48?an=U|JT#O?+3mFKCP~TxheShps23Y{1!9kFr*4XD^%>RUAmc{LW971bdE_rv2K(hk>q)zp zd>-_$ymkYYh0ior*n{3pjl?KTk`Ty7At3<0_tflXPJfY*bFa&#Ox~2mdDYO=qIvO8 zWZ*SX&Xo;G(JPuoUKV8|XTG@dx+rHdhH9FUS(X*`hq8jPxXwp=XawD8Y%@KIs*M`BUd zauqqJzWQ)k)$W@U?z>WFNeilB7If_rN8S=k@}iV2%9eCSH|A*tu*d+6PE9VU?+!~} zO~gyODkV${@T81>Mw5!VUe-=8d_ls$(^-ecq{1)6vQb!&Ewx_Dtwxa7M6*6aj$CmQ zX1P!-(n<4*Xr3*l_2Ro?8zqYKnYYA>R>itSwjaCU|*)z>14N{_f_j{NaYl56bL@B3XG2CvWv;_1X~nX z@A~LU0}F>o!1Zjv5337Z*WWtm3XKBk_rnUu-V0RsapaS3!q#QB9wKGWeu&S)S`}V) zd8?~uU;e?sOVQuF6#ONwXZT>HP~H!ZXK17INgJ|YC?ZMuLRnHOg^V;lbt0~$<4Sxg zJ2er{rai0D$P{x0>*RDsPRA$G(-X0zJU)>~Oisrq;>q|#GLeiWm8_hRlbLjSe5z5A z#p(NMmZkeyWG6+@u|VQf*_G`HlzT612K?PiH9l~(R;!WAJTKG&kv^1vhUbU)5T9S+ zBEr*=o*{oPpZELuC7Y~nkiN&S`NKTlhcv?THJ-O^tSqMcD1Yx3zpobh4vv7n8h;6C zm=D`sp{`}_BENRabD=dT$98yj*K&=#;M6}%E^Ks>=Qn;u);3l+Zm4gKU*j&gvU~Zx z_RaRq%gclxy5$*u&4%)M@~;hIzxtk^>?JimWaA_a0k6jQUzSKoTE3imG;te zWTJkXJPKuw03{#13sW;@L45|-^GpByadByo55jK!ukVb{&yXb0NqvUwTel!@acLjq z-2=RnXJ*K!fZqMh8ImG@zB=8{aY*mEnG5cxIQu#tV+{p!Rr#QEJ;2fejhg#trqd22n@LVJ^^n9RaD0EXKZ%2CaweD_b%-egw z&K#FV8#!ac=gwK2+o{x*t0|cqoB=wca)YVVn^#j^`yu`2)fu2Q@{BVcV$heVb4N%V z=*;;eBpA5-ttW&_8}|1e_+u#6O;(V`yUBp>12$e<8y?uQ;R4uDpluaA+IpC_^>C}U zHrVh8#W~`Ea|gw_qZLkr4TmVskO$746z9%XI1M&DN^y>Q;M_%V?rMe8V8j2E-P*R7 zK&ItnK}LXMb3qU+w%4-B}Mk_G1nIe|7#}rso@-{b7B6 zb@qpy{o(b_5!-%$b@qq#`So$^4+VPOdwAPI}{fO?@*d?ZlgGOhvJDt?ISvOC@v1(p)}(hq&Rqo;)z4;BRY2|E)L$I zG~*niICzKRi9_upI`)a&_`S)8*vAL`hW90Ypj-DPtIqu4eeV3>{oRCz`C}6s=Fn4c z-)487P9Jo}!2kIeU_OtFG_r@d0ev?xFn7*q$W4PQgcBAi{o4mM9ixAs`siBV1S-#x zbqdaof`VRT0(EE-5(zOT#=s9ph~_P;Y@Qq&gG9cP78SiT_K2KObz6`GryrPUCkV?|xdsfJXRtvq_FNv1AYdGKU2$|X}$WKA+F z<+5&AAfe35xq_C1d~je>)0Lc5(6YJ&USpCB4rsb5K|QmDqH2zrhO({ZPNI$K);rKU z0@v&ipwpQ@FBNm`86qo9{M)I#-NY9i{`<9As?Uf*`7F)oWy(Lm7?l2=@|)PC&#CD9 zwPyaQ7X0NF{Avrn_}_2A9|p$}!i7nd`bD$${S@TA{PC^< z-^(Ab7jV4%@rbDDmP$l$Is|t{BIdM;n3v5wp})wt3?dqOMz&-kTI!Q&^#35##~CICkhs5 z6f~lMZ(~6NkK?RfEb3>lBq^PiG+lD$QG8;uEzdr!Qj*Z8vTY^wm~K}ots3YR`w=vq z#$}|mY#0T#y>c78S{quBjNZrR3Yvh5?6OuTEW|v(WFgZGA&k zDvq0Q+nV6UUI1+i*5G21pucRn;`kZY2TBvq83#?gb}igiEwAH?D$Rc7bN8Nl@IJOS z?1tj*lfR#~yFH;8U;=6(Av-dxvU#D{T{>QS=Ef53S7xY5>@3orF z^x6BLQS|l1!19^C%XMEyX8`fUGHfA7-oYU@`y!))K6PRiiFih+G^h5bDM59@z6 n{?7xC^*u!EA@6RW$cuH38)4-#_C<(z>HBbCK^Sk!Y}WrbLcbai diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token-dyn=False.co new file mode 100755 index 0000000000000000000000000000000000000000..9790f6f3773e24ead55e4df0a19f27cb3f4e85ab GIT binary patch literal 17824 zcmeHPeQX=Yl^>F;CAp+XnxZJ0rlb|UqGej5z9mX_VoA1JS5aD>lQVLL5r&c|EgOA- zq8vBysVj*#Pn9qy2#2T2JKput@VxV-=oE;;xL^w0j$9Im*( z%zJNVhFVd!L)U*^)>!>^-_HEz?VC5VGjBNl`iYS*m`$bufzZQ3?hq5ZMi^k%&kfRx zkqVT>i5>p#A#Ngq81oL$$Gh-<85u_#ONU}!CKK&|O_q=&@zBSZ4^$wa06q4WU<4j_mNxW~xl2h}U#Q5CeYAA7w^bccm zlamXnEct$HA^l>?_Ezj%a(Xe9cs@Nl@nf1KF}654naX?yf$dQVJda8ccvJ%UQ3;wJ zm7wiW3F41R0PgY`ez3}2<|Iuj7+vO#v9Bd`2P&o-c~Gif#HKQHi}Q(v)YMFBHk+7C zpHEFBP=OCL^3S?UgYy<2C#Ta>v$lWQCC3b-Z^ovRvr~)7sni=^nNQ6g8|8b6J_kUW=~%*;UzpeYA_%(|2;7(bQ$1>8-I&$a;I34KjV%hcrgzOD}5^>E{w zSbqlH(>)LDH@z0<#n|`bP7_Vm46w%{5{-4p15`}Tpd3C9yx-^jE+RjJEGX(%JrGM`wF?`*?d>XS^pl)&Z=&6CFKWV`Jm-aPv>MudwXwR z=e;;JDja3cH1^%SSJHYAe#`Vzk^=+s@8!MHJ&a5gRN_nOlk)GD=oE0q@D z11G_BfqdnfR2kA7TJ1`Kyu2=2%@*hDj;oHBts4i#+{UV4mMe(&_9{5E$zfJk%w~mX zqaT|{utK9Kl4bqgs!=qp2p-u^Jhzu2&n2%{u~m806^}>HXCpyr0P+DkFh8JklcLR1 z~4oSV@bSir)o$3{rOWEUgsaN1V_gKv8 zDhbP4SW)vzM9~yQ_BJTdyrM*MilS6CDyr9_EZqu6)O@5#(IOGKsv#21MKG>xCEKN&=a9Di~>Mb9JG~uwkr$LV9 zWz~1yK;B%(P9ztLsnxB~Xg(T^W_>+u9Az-Ueyjg8U}3r!^eShr^LvIIQ^hhgJWwxUnB}DS$3$ z@2f&{WV1P@=3`xo7K?#CJ+jslS8MwEqxt^+NUpzM2_EM9m`SduN3H3NMf0&(1awn^ zty~`qDKs}z8#Tuit+`qD_jN_{U0sn}SC>-L&2_R6e{XX%-`pI@H8(3YEuhnixDhVu zWYcwewQQYi575a@eg-<_AE=Lgw?3=lhEmi=()B5TKBHLwW!Xu3fKCqGPV>n1K)oEh z^uoHT;cXUd?{AH%P-mY}cl~OurGII6y~WfTOKd4fs%Reob>#1ER-xX&Uf>IQ{l%mC zcs!De$CY4TTn$;`OUAkjwlu4LOY@ROzOn8gSJoZ5$~EFSd#6ISIT1yLqP2;lsoCbt z30=xQ=&N3X4mQ}^r8ZT%HB1|9v%3%S>n^)H%CF;g_aT0L#O@{oTffx^378m~8Cn<$ z3@aFl46O`p4DAdhh7N{KhLsFm4BZSp4EHefGOS|gV_415&#;DJfZ<+-L5BMnh8Wf| ztYcWuP-fV`FwAg2Lxo`@!wACz44W7pWEf?5h+#9s7KSl~tqj{3wlnNt*vYVqVK>7b zhP@2q4Eq@NGd#@j2*bx1KF;tc!-4}oNzit`YD$OV4z_1SuWpmgb<_u;`f}8m*Kp- zRSQE3V$<=Kt7O*V1pAg{w>1D5A1AJyVEMVCnSDkQwcDY0FVEW5L0_pgFe8K7{EnX9OA3Ut1ASSCD5FOn>b)p$XGp`>uexgZz9mroZ+NLK~*P z@veaH0`d!8nEu9ZggBVD>xF#&ac_7$r0sp~5O>U#P*)%DEv?+U2nv)8GPr*6Jm zEDL=J^*nu(>Urko@AR_Jw@}|_Z&H2d-!TiQXZ9Ux-}CQK-7dUC?K=NH)hYWv)#?2E zRHqB?XNqN^kD^WIZ&UrUw_nrCLZ3yOp1)1?yl|W9IluL;fVyS3sBY)Cs0}Y{QJpgH zQJv1cM|FDfy>PKC^nJA5(tA|D%m+hyS=biPcIQ5zdcODp)pO|sYP-ybRJU^PkuwoVr>fR_{C4CeoLQxD(wCqc=-3g!`l#Tb4=c4FUo0~WAga6 zoVGb8KVtWA|9R9lx9oOn`wJ@@`q-9@SG_P^t-|r@N_edhq~ltB{OuXNV=~#aLWqum zS6~cFKvq!!vN}2*UV-r}%0v-lUOF~jfw3*htX7-mdfBlSc*(iua5}VQzw;9OTXQZu zuR2$#%m-uVY8X4`1h1Ua$K+>W?CtanXgc}|Uaty6JebJn_W>pkQT@NMPBaLu4t(@m z7)k~z0b#fpuv@%Rr6nlY8vxsbQo$RN4*%%w4cB033FkbYhC*i733~0S3Q3i+evl#labYv zCzZx8omA~`HiFG;pi2bM3D{^EhRF1{KT=$TL@Xmxn}%o`hF-e`xj;PcWssdw0e>jAI(d@|bJ1h#+4=TjQ1 zed?+ZisV9did64Y8~Jz{>QQ>ar>+PivNkfJg6*TLAxsxiE{%*Rt|vpPYhO^sKBp48 zbL@jEys!WDSLG|v?k;>R$syu2-Cf7_;(>RU0DXXVhgXsuKE3@(?Y_@!e^RYal1kd& z9>!N~PJqeSv*0nMTZl7&MArDNB6U9A;5j! zNqs(mvrPClfL&jL@s?QSRKXszY?bLl-c>@J_rST~)Fuh+;2 zJ}>r`&v(DNL7$H^{v(W^`@C*f@OiPX{Bn$T?PI);8TAIA7Y~=uXS8cS<9po52R<)8 zRz9E6u7?@lQ6nGtym%D(T)H1w2VK;UTrRWq1iiL8T)NL0eO_dJiIw#w1rPUm*3Wx9 z)aT&`2QPWd3XYjf^>996zOT=ln9pxnR?1nMfVBvxY-MrHi!lFz`cw6}5v(PMBxu7m z20+mQF@Jr21ou{0V?a5eb;nvm*=q~FNY3Zi*A^VOwg6*72fW)V@ynV&pymTXMGFLE zXLTT&4+J8)KtQS78&I8=z|t+hU(NgBx!*54s(sPC54n6wC5#m%i?7Ue0DnaD2i1Hq zq-en)#NF#;txi@e8@A*1E{N9`*8%G4)XMr`G#?B`a>1a2W6Rgp74J=X`;0YV|V4gh0jEfkVn2e;Q_+>Kl(3vnF?Me}UT?5+i!cC7=1 zBCnRM6RZQ=S0`8p0G;v=)W^PCpIz$!p-2Jr(boaWvJj>dMlP@0A%1-s_F`hf*|t`-K$c?VlMc#-ow<@tISCZX1Y4V_jg3d}xNoO204z+^lZ!9Y(LwIPBBc zDEBkhDEHZGlza3BjSWA2gX;Rs4a#%&2G#M@b#t*StYg&k^mWR0<~rDcwKY8!ew}ij zx=A@t-=v&pZc@%?Z)PYL^Nro@)4|)PgV#Ov6((2gMLG2qCU4r7Q(s~7X1n9h^_6A2 zU0Zxm+|_%JxT|Y-+|~862=P?wnrIavo@%`W|JJO_)~nVPD)WI4R%=#>r@G`E#$h3D zY9*o*4KVm#Gqalu^GD_QpfLhPa7dcfS) z4RIfRZtGeK!5qvAfg|L@cq;N?JhjNT5BMPVP%dAsfe+)U$cORNB3~`=LF}PizB&UR z##50Gd;UmH#DRy4-7%xYK*N7>2=Wnb%8NDjHfyPF`f!zJ;`D5OVWP8PQO&}4e4X=hQT2^_8uBC zH=Ll?4b?-EX%>5o1d)>G4z{6e2c=r&{3MF`g;_Vm#I25GBE3lPr4y589-H_<;G5)g^&` zl3)?_I=4U_As@z5kq_gkMZSH&cc^?mtaHwX@l@o)cxsWa7Wi7q=fgVZd>BtfK8&Xp z`RakMwR}FTbL4ZfI}gkF`in~ zdq41Xmd|IjE5=iiug=KV2z=e;^BL`m@l@o)cxq9<1Hjjdd=H4HS|#1*(2uxX1!mU@ zX4is~w-Xm_Cr&4ACj!i~on{5wl4%c|)pVY{)93M9bKzqX_`C={uMrJ?%eA}#3w@uT zpWEPbANbq{?&$-3&(Ixq;UM(DDI^{h6v(T%;Rg=r==I|q!~eNkn8V8f_8ht@{ZQFG zyWIxZEF_QP_6V8%RA27sh_%Jq;CJM5lbX%WFZ8#z!i~B(78{?NX+4#kNX^aa+bHH{ z<`=W6*6HbUGcEI(xv!?ivvOu`yrnk=401I~H$iOSboQ_4IUf^o_;4di#>S z@rhJ>Uwc>Q#CT`tcxQ5~Z>;+u8A;E=KBayc6&e{i&eV{bTBqk`r&5{3d@`%z7L3Hg zTq3K&7LCmO%tB&3Ih$BmoS&b|WFf`)^U107>=ZmtB_>j1i&Kg8?BrYmHbf+nu(e`t zApv?$rl(U2tqYm)9h&Pm525b11@C#K?;XHhS$sbnOGWF`N^X_N>-6VV@|_ZSjLEYd zJBxXo$v-Md|79i@Fuq_op4G3dMB8T&!sRTJzh07liOHQM@--%Jf*-~ikH2N|TP5j# z!sMGJ>DQUOtwjE7Cja*a{i68iznJ`FV^O4Ee9Gh>m!yaJ6C76Z(V(87_y@*`y4=}S z6zLbRmZ-~LD%lhP>n*zcR!RC}Ozta@KgHy2-I#Ctm|*fZO44VU{8uIEFEV+&M1Gyg zzX`u(G#+m<`DVXSO2{vn{5vJ-|5ca&v7qftP6|vuP$KWd-zbqk79g~=)3nW)3>Si3#WiNo;IkfyC%`(Af9_&B1PQ-xut5OV2FMKpN=TahDyzU1!z>d>Y zqP@GjGXeKIo_83YceZsV+Do2ymOO{_JMwpzJcsl3?X8}9y8D9rKv z18iK+5tfAXgFol_nOGl=A~%1|=LH-~^K<<<)S5&LwI@maDv|e9)J^N zJpTYwkfR3AjK_fS2CiG-#Qc1oR2*OP|9_&TBtwY;IGn<~`Y-{L^6z*L-oSD?%EG_?02IJy0~|K~x*ZBKKs fzegzE*dg5guGirCz4G5+`QPJ?&Um=qoc8|!uy5n( literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co deleted file mode 100755 index bb21deb0997861220cb75510125be0f8c87d8e1e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 18136 zcmeHPdu$uWnIDlWa=D^NilQirrsNg9TFbOdQ7`LZCy`}2RaK|wx;fWpxM3)X(z4MD z6y>-%S6xZ883&FOr*&Z)H7=J+o8AR|-t~~;aFWtPiUw%@aEky@+Z=)d2OJJ4aP2)% z9B}4+-_8uZqWqxG`R8z1gZk}$JM)|0&Nq*pj~svb$kE3wWo7*Wp@)@xNXpnbLO(md zIzUg3mZPnQIN)y;aT6PuF>gP;y$%1s$T*5D9h!NW%(MmiSwfD)LvLd~kU&5HJ?e@u z%C0$@@}XAJ&F+70E6%dZ0&h{iS(a~hf+pn1vlZ%$<+H$zQp)4y@_IS`Pnj{JUP{!- z%l!=GGs@$FN<=IRKRqb;4)`>45=VAP?2z`8a3KR8dSdVs$Nrd*#K)Y;@% z99iGh$=@0+653wjc49I$F)jXZn;kO@{bh7AF+DM#m`Hx>k7kq8hmXpKzh5xL&m_}x zshR1QE#%GURN`!Wd@_-VKRc6th6%tT1!%+5iK*mfbNp0W_tZ=>-aeNBzW78UlboMD zcILBj{_oiMXJheH>hGen>C_pxTabAB3CJ7Kxk4w3exF{<&88+NX}=<`N9VpcH8L}~ zxwq8fqR))I8J(X_jnAZ~4Ds>X>dXE*IzN}3i;sN~9w9Xv2j!#B{1|*w(D1X3@6#)@ zi0?+9O`J)Nr)Qw{u^Rio%|?R^RD3e?BeH&*eJ?YI+6pDX(=$wEc&`D%Nr|63Nl%(h`Dw%9iCnRGy73b)!e0(cE5V*d*VX*4fd~ z;n)4EdB6YVW02P3m)7@_?7HP{_p^c3Kt5pP3+RE>d>}yc1+MDLfxv~Y!nGw^w_2C4 zGt$)Qb*uThI+~{LnqIe5w;HGm=>7ndy69iE+>P%Z9Lsfe7rqLdmTYh}m=7A|1ohx* zJ{Y9s1h4DCrQm8^uueD1TC7_M)LqrBWNF<3W%}XT0@wbfx|N)jY+$$jsnxOpK#aP{ zc~(4bX}ye`UngX7-G9-&;&eOpO1*OFCzuX87p7Y$g1Rc$)T|)L7l>%ou~05A1aeg? zQkA4v=oL$Oa(T^Ww^-d*-K*{wZ0o241MPtJS@)&&ydHjvzMj^))-Wu~vSmVI@a=YS!mhE=iJ7Q6XuUKv#uB z)Ru`~28U_6Kv2;G0ohp-h~xqRH5&*hm30BlX$>sA;`eJgKiv2G<%$|#BtU1V^ujR?BaP1+rx^2A?+M7;ZTU)jGRox%da>0SU?dj|s@Y&raqS4gy>9|<8KHWB2lX_{ zk(?~6Sy@(GjWVQp9rBlvik<#Q&hJ;Ve!t?X^=q|@w)I`0B@bG#ZdNTJbtBZEUwyqTPNQRy~=)tk0w0e^6p{c^Cr`h|~Q^hu2=qoqxFHrZcoediJ>^{DW zXf@Wz!gjwkXl`r6f|pbq{qF+!$ki0mz;94r&=w5;HAiyI&1$x}S@G^^)_m6H1+(wG z^&!n=4K3(ob(tq{-Y56c5~KcWz_`?D5BsfKajWf;7@yA@5T3+i~!D~rJ< zey)f?m7h0@!QK44MGTVu&0pw*1j-m%7+M($49gkX7}^<%3>^$5h7}B*3@aJB7`hpH z7*;X#GOT9kV_3t`&#;zZfMFfOAj2IDLk#N~HZa`DFz+Ye}<#2T9C82wfOBi1~DKeJ%Kkw0;r$Skd!Q-dme)LWYR`Z@=hCSOnyT z>!Si(A53@!5wJfI7O*TAA;+9>J(y@0JeWQ;AYj>Gf6|BPQ^$n>rcaFss0HjNLzq65 z7LdO$3H+c9?5C0c(In*`NQMRejL83ZvRyFhg4I2i)i*UDU>T229T&JSu&jZp5dq71 ze2SKF?6O5b?k6u(?vt0p0&pW1g>oLhK{-#{pq!^}P)$$WNKH!y_By~lUMcVLNeBMw z&2{OJZvo#^fA8~Iq$BiPa{DAljaPD5T?--7dDFg=@|7{Zax>o!;M-d|U%iRX!uV`v zzIx!>S2|yViO}I~5z}H$j-%iFSk>B179eA8n05&}KIJsbRDGr!7UV!n-UhUF! zpq^%?_VNsCyB#gaMs^jgA3{kn8Wu@Hgxj zlo}fcq;T7ylpnZpaE(urtOJHG8o?J5@Hv350bQhhLI*0e1MMyl=qp zZR4Q<>f3<?c<}9`M}t_XRkA-s4dkt32AWg}!@>wH~dJzjuHi#ls$L$ucPGgM%8>ePo%v zS6>($RG^P&QngpZu>(TSAskD}U01*K1^GJkyK}!5LlOy=-CD!`=YeMz0eyh>3YR2S zc#Qrh?(y8W|A{poNi6DrRgAC3%m@8X+*>-Ix&L_?pWn;}{ZHIiI-j}!RWrU?GavLn zv9)wQ-v6A;=asDgVSNS|e~9rLeaQ(i2&~seD5tITa?Ime9pkMx>)r)?9i{V`>ov&u z8q9o6z}HnepSfOlFut8;zTLprjeL^Pm&y-Gv@c1LrThpzFLz7O<|L^c+R@$WHSc8W zRmnyBiPK5@3DoQJPEovZRe=)BiD9oMp9W1U`w>vV_KWNH@fd}=p#Ui;_OwGn(ioY#ZSo>GC-i|5w zA*Qf>?cbtqv_!RBv{TWeQP8tr*7x^lwY`0jTwkA>?dwy52e=+s`|sbc)pkcCxoA|) zMx#owmFt1Ee@hFkqn4b}F@9T$7dfyIUf;mKHVJ(xTMv15Ml3 z{w?ZDC2NAU|6Mh~+8;FK?y1MIUC*|)e~X$2J;w;F{Y$DRSo_~q6RiC~(=2k`Q&+_{ zU8|NBwU+l;sC$2FR0E&g$9H{Ny|r&)yPl|4YmF`h`5X*B@^`gp;5Vo*XbXn_dLp@= z9yQz3qXc_z81}IZGpA_g*6+=t=Y07UHi*+2)}%y1V&g<%sz zmEmrN%?$T2j4<5Gu!Z41hEayC3}X!27`8L)VA#p9i{XBT-3)se_A=~ac!1$ShL13O zl;L9x^ER6i3*qhUSr!ZN!VGGxzoG4MTQ$Ttcst}aAAXSlZ4=uhw|OA0fi`TPhOK{X zvn~cP6W;EiZE|}}C&Wy+9oi|k2jB|{bDQP%s)GR$G{;H zFz_hpzqJX={$pgwz$eKu15c8Z2A)P9om^aVKn&$q7(*$l7h2A(&00d9#u&%3MTi+d zS#UjO#32@09AYsHu>`}e8S#ik7LT|%AfOgVKZ)^;Coi(N#KjQ-wfPB|HDVGM(}Ilr zmzM<83igbG=`X)5Gy|Tz{HlQ3!G5*{u1{b7fe^#=*M2PEvw;0vC#JvlbD;;*U;9lV z=5i3zU;Dja)Fs4R3?GB-_<0&jIdR@=#6ilz=NJz-X$iRMWY)sHWL>sHV(2#|vd)o5YyW*>|YEbMK^$vapR}-Ok>mIx{z^&a*dP z6;Ru`n^fEE=8px`l-Z=3&Tdjo=QgRP^xKv~S=iRm|7YH&`ks3`Y?OsGB0 z!uan*|3CKu)xPin^E>d+KF88^ica3QXrE*Dp4)cX=a~JV==`Yq+@iyw@48nkDtL!j zRA_rFD&IighmG&KAULh@8qZ9-P`BE*c6~I?}at91;#dd9bonljagkVVpbTd!kAShAjYgPURCJ` zN{&XrwxE>vhK#k2>rjZUbwVMF>j*t}RfnX?`k+*4^)AQ|H*i$KPs4x*V^+wAF{=XK zF5v4Zov+cvhcPSU!FGT@a+b^?$Y^oF+LZpmF2^aLu~%H zH^ZO6n3Y$lWHBqp4ll&4hP1_@VY$gZ99bS7RzBLT3NL>vh2llUgd(tqByKz+hNf#W(8lP%Qyy=+i?B3Y!Gql}pw}z&26J+q{O49EZHrM_#YRafF^b>|P1QSa9rcMF=mCl7_%zq-V1yO zO6N1zE5@vluffc>5BMG_ozGma7_&k?j9C@*wF2K`$ajyJm0dF09JV7~uL7&ra#pW- zKkp}A+E3v321Y*-pr80f#ep#^=pWc8N@?>l)2g7>O3+woZu6ySHT$NK@im$G3T?hL zKC^Gaj88T57214hd}iP5Vtl*Je1$e&8sFV~gKfSUcyaD+Vf;p$2j4gI3br;%vX}Znz8ex1Cph3m-Z;_>|KnDE22cOM^24-j9PoqUJ0GxF zN*=}iB{JLZAh)+iW6>D=j$dxpGMU-AzSdT_(B?;?qcc;j#}i}8nQ3Dm%FNX4d?wjC zIeBJk-)ws33(3)poSqro*Bu20xqW}EtyAtE>50V>ZM~iQ_qVtAj`VbP_a?e~#*(ld zptECaw4-CRBQerD(zS;iO-;i-tv(qA9X&e8WXR2}lQYv3$#i@+kEiFj&yd?pUNO5zFFoiZ~Q2c6@o z$>dz?TzYhia^vCw>n1T{}QvGg@C*Hc#+xvoeD18y20#D2;7^GZ!^1I z#Pj#e{`I2t?=riu$o>JdcQzO&g+IS#_ScHi|2MOf#zIo#1mX&C*h$N-t#()|8FsNK zJ-kmE_6&X>Vmb~n`@d2L74Ce7^Xw~_jFS;&KUc()X7;N^>6e+^Q)K@Nv&W)a`Cnu9 zSBldA4YLonZ%zMGW`DD5tNoYEezYk4zZmx4<@K$}NeE-9+)4-LGs*^=p4<#E^)IRY%fer%}+rZm$#`2^cW*pK;v*yR>6y4On77B;=>6$2etQ&l z?4_n*qwV<2_^apK0k@ja{v0Mn@kohq1wTZZw_9jB2H4g+T##&>9k5#t)VaHp{0VYX;ubCMOAQx}BXj_9+)yLy^t!caiOZoLRcQ`|N0q5B|xg4?Xe8 z!&^u0|J@Z%pZE%HOlN?_|G&Z0I8T_4qDvz$zB}^#e0{+2X_gn)2Ur%&IL5)(4X{sy z{AWHt!}Ig?0Y^Sh^ZPtMqcOHOBR9VkF^u^c5A$hS48F+rm zcKdH}+63Gxwqfd0^Pge);U`2}4xS(4i(3wU$??l@m;2tF@O$tZBRq80(6GMo6_{5i qZ~r;4@w$i8!V!Cfo?a_h=m{^En=ix3o$|lK^8buCbe5qw|Nj8Z`UcAY diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=256-quant_type_w=QuantType.per_Token-dyn=True.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=256-quant_type_w=QuantType.per_Token-dyn=True.co new file mode 100755 index 0000000000000000000000000000000000000000..bb3079a0225819600a8297343cb2625039a04f3c GIT binary patch literal 26184 zcmeHwYfxK9+GZnkNR$;{--WHPpsOvSYu+sRa_CbjFTN%AMIooqRo%KrOmW-_%~vsGV3 z`@Ve<2iahG^K&DYcuv2k`|0kd`@DTFua2MV-Tk00CnumM?2}7wksQ8_5a8S90k*N* zfVOs$2Y-u65ix=p%Ldr)+wh;9OrV+PL$j!po0h?Ko>3sNu-#Y=nn0id>{FS6dw5!) zrX04FJjVCWEXN2>5{YkQl#B3kk>hN?Kv68M884@U9of_;>J_@|`zAI-b;+`wqTYw0 z9Ia0@C`-h;@Y{|Jz5~9_2T34L5)av(JRYuDuP z@KiWPe(9PTJsLLu#x)ZfpAH8PjZO^xmSqVJOb-u-qjw=Ne_sO2_a&(Kz6A33C207* z1n%!k(Efc10(T`iCZ^1GmQ}BvGH(n#6QmO;oon}MxvscIqLb5+;8b|zaCjmX93Gtw z4+U`|-ndSFPvH}h@Whtg@|H_!Q*b66 zof@5-IHx1;yAFqDgTv#YSn$YX^bl_VR;h=BU?OxleA5|x($jW$G92_y#egq35{iYV zBYiXbgG>MVhWERJqoe=X6^V|{!2Wb2m-;|{M&g{Oi;&pg;no(fKQz?Z?)MG1 z4|d$CF&)24_r>}8d8e$+V07gEjwC=pv0mOO-@Vx4ImYWWzs^WcHR6D@3Ax7W6|f@?C}Re_ z@p=QM0o|B37_YZu8ZZDD$kAQLDRi9ZU<$;q~nVHbH(Z*(N| zX1?Ds{~`JLRYIP>RuVT}%Oi{Gejc|it~>olBe}kgbJ~(sr>m^ewH)$O;90n4J83&z zXse1B#j6)j8=;N7j>;1?rweLo;%nmfEvCr%t45P9*Y+~-K5x3VHa~Igm|iCvi1pKB zAVf~Sq%KO5LgIaYHjpY~oS}&<(6l+u(2z$gpDsZD<5sJZQ=(HBbvjxmhg3;2lmW~S zFf8b2xCXhvUU;5i(RGHFTMTn@ORVZ5@DWJok*eYVLu;I2$#sT#k)l)AQUwK$R4PSM zg@p_)7KV~UaoxYzPD?g>JU?Csef~9!iw@@4>sRy0FRvO-IZh`@gCo)4R8AEXC}nF3 z)Ki6pN?B2%ddgx^%8D)ODbSz{v^Ykba@^UZCYxMJys1epZ)|cVo0=SnrY2==bCX)0 z+cbaP=~9y}w-R@`*|Yorz7t4s!6Y3iF>_rLf{M)Boob5w z_9{BBL~E;B+2(a7ys4$%FYJ38cyq{kr(aF_yOp@#FWWjcI+GhWIuaWW^CuwSHYyzcB)lidL&-rX(NbZmAeH*a<%Hg8tsEt?_F zB`B*SHErF_WOuhC(cP`c4|Ie5GL+3FOIkpA)S{!^nQU)&B-+~*xuYHOy#wWQNm-lI znRGfG38zz$?{}&d3&v|(L5CFRfS&n9eXAqY8c^e?YhbGq4+KDu2j%#K+f{kTPG@rG zPDf(rPQ|`U=%FWx2Om`BZ2@O85O5>{0ma@U^e_b*m!x3SCTO-L_U|vbEI#W2?w*pv$81 z+UB$_W~$2>=u38*m{-vxk6hI7+*yyjjQ$8cB&tW>UGXCoD9>=uv zq#!*{&S=`Q`aCIEMUyqOUl%o8ch+T1M!$qEg|z&iRxB^(Nnu8Lp-U0fqCRu$&0nrww*kPuuihaj+Rmt5xubsR0wpMjrZtHxUymHk>{?k=k!hV`q4&E|0~(R>Yj2n8n#FdvROb@g0jX|-*xoADxyAN&sm>#| zcT06%vE3)t`NZ}kQXL80{5(!bMGi+D$6Stjjs}iKjwX&~j(Hpp3=X zY~<+Vct6J`j?Elh95--ubM$cZa`bWZb8O+*%CU`OJI4-=ogBM3c5{4y<3^5~I0iWO zaNNvs3&*V-dpSPHaT~|&9CvWs$#ECQ-5mFDe2C*-j(r>-=D3gJevXfDe3av393SU+ zfa4P!pXB%y$AcW7=J+FyL5}?#LmUS<4ssmg80I+4afGAFag^g2$3q;)IUeRX!Eusf zgyWAnMmbJ#jB%XiIK%M>$61cga6HQKS&nlYf5LH|<8vHSHd`Eye-&VgW;Y>PKPc=~eS~OzqOg}eLWtHsit8IfzPnjZ$N^%ec#v2q z_LB;VLnQFsO@$DZY^OL*`Y1-o0g5rwPjQy06z52UVw}uUOkh26^4it9^U}3rIdBe` zF)pkf-`$i6IYHw8_*}iD(_>p8eM%4M@k>^{88C9muE(~4{Y(L*W0$;o3*hXf&3bGr z*#D&z(sP&g=_>%^mj?9McDNpUxeC&WOHsWHs9ssrV_(4jssicpE9dkLfRQT~^w>AB zpKpS6?8;@m8*ui@M|$il*kAWUdhW_+`gXwhm9O>Kcd)}=B7n1>d}I)FJqGEyPd+ou0>(f2 z+As&0_~csyYCiU!PA|CdSU}w;-ebB)-?Qsc@0s_!pg&vNQTLA5o7(&g*K3LSF))OBvt)m}+*1(0w7HU270n_^E2OsHCTxW1Ru@o%q;F`-sOR>Ky&pZa{r!K4$tJ{dlthb)ElsAJdB;v#9an$4ukn zA2W@|ZY&yZKTq+C^AtboK5^rm0kw|aU|P@IU|Ju&@ll!|kA2j8{sz-~@dnfT_>FH3 z&34Y$E%{OVmy2`tsQ0VIC3K#jFRr6{zh3NPeTVj<&p^%JDBeOff2(*uo#Pja2lXz& z7m)m@|GULgbe_Lld5miQYUR)9{61g#TAB}i4eI`S+P_}wqjUR>+ARjR;M*?waU9;N z-B0KD#o9qSzu&E$qVxOZ#$#zd^lhm1tBpUSTAy!xjn3=W8{aW_1mAASk8}Ku#y=QP z>$e&|r*r#a<6r39ez)--X+HFcsP!vlOzU5iF|99@)#<&0uTS#hJpWahkNTB2%b3=0 zmocp`mkp-%Ltlzo|GtcA{Yo{{`WMwd)B6P9Ba*)X@?5BXO^;gts+wv2X7wNR;y4z* z*AM&OuKt|*mCMzCq5cni@zr6U#=fC2w#2>`RkefuQVSs2;Yv{|Du^`f1&v`gYfm^$lv@9^(}8U z|G^-9KkEN>Gt>Wa^Ixd{gEeDr4*8q^LH!=2(`&*)U0wXy?-sAodqw>IPbu90DZ~3e zi;mMNJG)Or?{!)FZW#=D@sz^uDJ{Z1Ap$m|5o`{2e`yi!524Lu0-KfHYg&YRMQAgd z^Wy8Cx1Fv$S$o=6Yl|;b)E=)*)}F3isC}_^k=e@OUesE+7nRUk8hE}Yp3lwt_sUi1Imrur@j2YOsVD}7+cOn;`Bteo z-!A1f0`9O&DXSuNz4`t%YmYLI_cXs%6rHt~ZCgteskq)Q73W&#{lx!iUJK)c8{d=< z?pH4H-4A@bvgd2j@aZ_8QOnm1e0#FzYt``Oaz2xmZv*h{&7QA~^I6Hied<57NJ;=( zCl^W5XjO{gdh$OwK#Mx78aGGOg-BHHFh!loXw;F2MwNv~#BnSVQ96DcQS;#20ot3P zEuDY`fF1c!cmn;(7wuL;I`d;vhbJm^J{*%$(N)G`N|8G270H?#rDM?nV_^lpJm4z< zTmxv$SEQ1Bhh+5u?srHjB}&J(qbJJ7HX7A+^s?=a`lwVg<&a8pm3bTsE9a9qU!j(- z1^6D#o)5=D@a1#9A}wDV@I9VAAC3j(%NUD%3RcB9(X&;N}I(yl6XMId6nZ8|nE9WcM@;v~2PiN1k)xL!Dt<~~v0={7O zd@E^x!W>f z)YUU1b%$o9)XXZ{A5o-ks{IVrzKd&L$@^BV?b{aU+aUIBhH4M|O#7J`T~{yL?y8@W zN{-yFy^ZtLX!&}9FPuG}R{M3F?>;TxHsBjUK8d!^&?B+-Ns`Xc%eD>m5*)*lWPsyx zCGAg{C3Qi{&>o&~9E0Z*UGRJ&55B2VD|_}~ov~oN%gd#uGHP!C?N5}JDqUrzYErK_ z5{g5S)|aYX;=UY=qjaQHJ*JLgCFaKmrz5wgZ+@B?X4YB8ZcQsWm>Mv#WmzT>Y%F7jLZK;a&VI3!C%hmjP#hFyF z9g32K{noW$t1GAc`K$2bx+a6R5}*Q@x5g?-Yf9<5ChaJNbzcGHdp_;DX6JkkEg!6F(w^-3wCh?O=UcDkgLO^Xn?0Xc z*EVurx{0rAsAmJ`Z{qwoU)nbkLUr{(Jr8GJkM?|_k@Gfd+wB9s{n_(rb#-z+mzJ*u z_#VxkPpj+woNt4cuMPMf&z^5JUEQ4D&-qc;4xwuY)bm94^=NhVaNZVeySsqzsqFc* zx_UWZtCsHp;CniIKCP}k&ex{p+XQ^U?DSf#1ZvYlhWmN!|8~y5oAXmYUjWxLsH+F+c{uxe@VF7}-obhIXxr@rzWv$rX?5Mn`5w~p zwE*9v+4E_2-NpI#YWdoL@A2&UR@1eQ^FPA*sh=0Rc0fH(WM7X~*M~XpquO?N0pC;E z^J#V6$N3)9@;v~2PiN1k)pbATdtA%63HXB9^R1@q0nUGr^HV=BbnSt9LfO}&)%6L^ z`?R**TYzscdp@nMPjbE=Y597AFPuG}R@bLEUr@`p4fsZouaNpZQ%@oDdxeEMQ!m># zttkW_Styyn?+IOv+z*(zA4pYYoS#=!uX z{Cpu>Git2z3AmqSDk)S=>*0H(aQXSX#?PBL2l}Oc=%)#A2>V-HAeo8_sh>CPD1`o6 zX@B#2tgOFQt1hpXZR5E=)J6NN9p4b=3v2nn&zp8-&!-*V0nRt9c6WcUOaGXYYyANyI z4SwFVKl^rTbsgh;6Iwp-^QK3$=hNzXi1STq`M}Sc9?za{HC-dXi@L@Dg{~b!*Yx@P zz3cj8-tK8_yTQ+!p31)6T3w@@Z$`@pe%|zS_Iz4hr#RmcEg$%KQ!snJ)pVT&Uet9C zQ0UqzbWNYn-@C5Q@OJ-1+ivjlrorsnt=07?=L1KywN3QDUWgY`-;pg)*Kc7cGUn(hhugI_-zDK0xe%`Wl zK5r0yUdNt~q@)7o=XE;y1jP6nb9HLYdiWmEE%)=%5=Rc__^-yrY+{QICLVY8Q+URb zQ?x9W*epZLFT@Pz1sF>G40CcJR+m1LfwYOo8Uvb^#Re~n1+Icv;OZ4(fh!z|3LA|D zUW2i~aBpo5XmyO(MFG`w`%^p&88-8u-|5r z*Hl(GlNHESp%hhCs0Fzd>A2l0h}*4RF>cr9h}&w^WR0xEYib}))h@^Fidt0fa3&oN zN5bJyEbAQ*`^w{X?RK@OuEv?Hsc|G~Y818DDK{u_S(YvJOEI#g5F;z% zb}eIcZl1yJ5u&0HI7_O5K|2?*9pB> zj(h!TdCO94Yek#T1LAhQUbWn}6qjAmEcAf5UEn&e;abiiG%=9n<%K2?w+rR3uUKB_ zV$3Knbb+{C7^_=Y-VVCh?Q(I;Qv7$hN9bZA#Xh?;X}3EPcDqtu2fExAw`+Htfwp9) z3B>KLs0qaFYIyFfM_xvMgdPyL3uW<9w`j-m@^p;&$0Q zIiqRI>hlERc30E{;&!257d2eV`z7Ydnv8x4T_A23%KvG_@?zsBh< z#RIbW_zaKJEUbb94(6m0=BBU}$jpZGl-lI0H%lO}L_0@?y{G}=FU35A`6}#XO%Q)6 z>@atQ-QtG$OJRq3EbMC`?o2zUg}t;L;x9#hnBT%~h1f6cTo?8=0f@g8`C;B`V}Ky` zZ!5+;0&XYrcQ@g;jk`$$#XjPu_y}pIcmQL1VE;jk;Q{Q&m>s|&jL`v9kv9%8`Lz&l z`FH$nNjlz=>C5W}Ps0S1@L6 z=9>6@#0xdzFOabl0Xbb-c+J^2#v zC)AH~75n+4msmdsUiwV$66KmCJN9+>CDzxmmv~>H9L{AN)7VYc*V&s^{RUCaE!nZZ zb2nLk<2RT3i*p_Oo4CpPd*bFky<3#?OSFHdZnFNJxw+IoJQlEjv9DPFX1_Y8_lR=s ztbcP~vHr!sTIwGjE7-rpSFC?0zWPk>73I2F|4x0y`gi85rT*cug#DZOlJ)QCmsY(` zlnb!_&3|c6&mVj~g2x!n+r=-vG`4OTSkA8EG&U;ArJiz#KqYvk$h<{rL@o$B40%PeQuFXVj1sK-`Xu-HPz*3BB1FV4Gb*AIr zAYH}t$vmIJ^EI&eH~8IcoDjU9NZ`6UA5f3i+4+T1etx-Rss!vPmr{iX==F6|&jEIQ zec*tush4dx6&;ZBpD35|a|`F2;Qp5he%y?FIh@a+K~?K;p-V@V=+9e^YyZAUuanJj0{Mg-2QnS3peM(_&4Ok__s7) zJ@EBr&xd0n`0_a)#=oWc)&t+R?D=pkC||}{B+SF=!muj)O{z1gs*Z%JDhn_M$6yTn zBg3i*zNk6IzqJ5j{2S_x&rf`yeM|nB z{w>|N`=M{Uux~2W9>$evuc|tKFWdHqRLL`XyY?9WhI|1rH{`?k zw=~}d;MG0EnItyf5X0E{9C$jKIq$i?AsXCzNKf3 zX+JimYw2a%Euk^VbLe*MG5!ttYP5VU!1rkOd|K@>{tfvs{w>|UHsE_4`BsU43rTPc zWBgnH{7Txh__qx0dHh=o+@mvL{F{Zp1E6q>-K(@%WYpdO+Mj?J?H2ewe^TG?Nc0aV z=0S_vBCc;?9LdrAH5$8>zcX40_#vYodfwj-+Wxny2zk<1zE zS}o_R*Yd%-X70+KPrI&R{2R8}umDSJM#u3=mR zu4@<g>gmb89_{%8#=l{^G5#&x?mFPxl0BbRSB!r{ zK8$}$^VI`iZ}xmzUGL|782^^$TMvBOvgcb(SB!td_G0{-(A6h&ZG?JuWM7X~SB!r{ zUW|WBxBGtJ+m$_^R#%LFLq3duOY=1Y-=6IGw7O#a8}ecNTbgeJ@a@f>Z#7*p{tfxN zIX_)ngsvW_=i%(@(dvrvZ^(=BZ|QdXfNy{Hd|F*G{tfvyYWZ4#@6qh}w7O#a8}ecN zTe^L1!1p-vt@3;jV#mxh{;iO%EqFeUaS`YTFfIc9K&n(cFDqw$Tf|NC=Vkf$ylfD@ zhuHJ7^cp~)m$5a#Al87@o|g^6^Rh$id0E=e)8}Q3BZqVR_1_lZzDoQmzRL$LK7aWB zpurDw;Dj~#WU6z+21uyW1Fpfz!yER6hQgB*^eu{$ha=Om z@P_g6nZwPI=;T;Nhddqr)>f~#W1!vN))8uJ9}0Uq zJbvHMpwBnx3k`G(v^0|4qZ9C+(oPv0w0rkf-VC{6!}#RHNH`jdgkmbbfiXBW8H}m$ zhQ?^*@KkUxG!dMdjzlJ-F~~7^C^RxUF#`L;!J+WL^hj`YVt6tL??ViR;LVDYQ$c9w z@aTAWYQt1?a9PV~;vSm2q~IO5a;OzV7w^O6x!8D`$Xtf~KA!(#hJ6FKpUSXr;`V>f zumF*?uU~G5^Gj;~FQMi3UT(j*eYyQHVc)jgKFsZZ+_Bsa@mcUOk@qurlHC5q&gJ=E z=Jr2j@Vvq8f7`V@{}pb(mBI4~w~zNN&;KR2x5Mw(wV%Iodn_Y=ircS;w7KEkoH|VZ zJ@x)_GwJ2_c!vEMZileNWuMF3zRU=Tzph}vKNoZUJ(Zb1xPtxF73}|g1$!xVsJHV! zyn;Qvg8i8l>^gMtIx``{q4SQs0Qv9qe$@BDey8`N{!g^u;r*z@H8B|r6BoP&7TzyQ zTq6_HE;Tfz68;zMu_$pxCx=3@5OKxAvoV?*Um6SM@OX3*9PQ8$xLM*Fn3^Ik_PS}; z;1nytllWR$c$x0#;pxMW2UZSzi7&yI+QvcyfGSkOwiuG&*k}w&j82f?5fxl?a3Bo6 zJm_g@^#8xS*z0LwU0PO!C!-EeW*a?FgO<>f*+@@&Mh0M9UIjE^83Po^>;^nK z$mj=Jw57Z-o=ZIyBcAnpmR%FmhlBVY<9m4nFP6bqq2FVffp8SxLVfQA=~NvG4uqo7 z(eORhs(FbteFO8ow3xo~d1i3%p3Bg8U{4K3L$N{iUJ8RR!Bm}!$!wH!*-8DdoOLg%S)ewTc3ESXU~o; znM>FIQyQ0LJmsg$Bw*(MC-6F5f0n2-lC&(ItBUgCdPHD9FN(_*)&(mT{fO(7B1rx3 zRBBR`4`5^9Bd~!b)BK`5H`CK*x)gGQxPB3sSzfeXAf6855-#c&*GmF3mRH&bQ8yx< z24cTN`2cUAz&PyFegfJAY~y)5mKWD^>FWTvfkZ!hI10ebnh%}_VOe?_f_Bm1$1s5b zIqV~@2ZeqXZik2XAcJTZe)e~$Ua;D24XBkEL%>0?@eULrVh76)S zV_W*0nKuNxGHsYU+w$05EbqobnvWk+l0CGP#EBrZist`o@dMs@?&5Ih=^!yQ_@ zffv~h-1Y1(Bie?rih>%j(kOA-I8N)tu~D>jP+)i4=k`H$9xT8pgcbq%&;m(s=tCj) zKQrg7W~r6LtzLZ=v)?)YIsbS5`OlU7`TCi&PcjZiibYm(qT9$p@)%Miug9&-Sr?Yg zBMc6rRt5y+hF(hN_59@tI^gepps;5X=-}M>M!!*70NBgCIIrr;E&h%%GcG9u> z2I%4b_L^{C_?ai~-HkscgG7-O5f3`+P>=vmK7IOk7k>|-wGY)&POq!Ui(0L$73fK<_2% zg>Px@50aPEVpCHt7RtGwTTPTqGoROL409^HGJ{Wyeg7HTsn157) z)I$l@=#+WNYRcND%uh3~D&_>*ZguuSt$vwYs#Tg*rLHZNv~okq7hchF3Lb>NzkvSI zoYF9Px2x4+VX5r?i+ki)VewCsMYX)tRF|}$K2z1or_M^JKHe57m$X{FP$_?xL7yZ` z>MKgVs5X?BE47Qn0Nf=Di=eEQv|9<~g^9URMN=m04d7Fj)P~lqE?#<8x%;=6e|B6c z6#hC{trado{kD;J-vND)tZ%QwgQxjX@K*!#xJ3?lr0|zzz^&-$B!@NUs;%*q+1Vlu7mF5us`jV@V*&v!Y)S6k7H!t zJ_hoAP7WDlLrxLp7qA=&i0=Dz-^ECu2x*ICw*$+HpWM8ic;?yLQ{Vd4KjVE9KiDi> z=Y(h1)Vupp?@eK(kfP{?s*-IMa>_(DpHpY@b8~rhZgxSN+|`iYgvq5sCJQajvn9*WmxD~WqlHl`fsj4n{SJv%=vx#s?TS(<50w%f_8wM(=Y0) zP>=O_6!Bm=AYb)g+3W~HywmBR4#Suz{-Jl>?Tr{7u&aZ9xPfOjhrYkTqZ=EpE3x%9 z3dUQ(kbFfDWUp7yuYg@%C!?>SkYt3ydOIAEjc{1<4TKZza5&xyhvmKlVcq8pFTWcK z>Fp5IheDEfAed+eF;`IT8wl!xGq~JAe`59*LGNxHL_gZ-pg)FkKMEP#<}hk)8tBIx zSJ_b92u1XEBq|$`h}2?Zk`a^izTuHXdt@Zu8X1wr$42xHdT#?S%vLO>_YFl7?MNiv zibQ1bUvN#yhLo3|o zV_-`MY{9;{%0}Z`(U{(j$+8iPfjx1_h>z<2qlrX2k%+ev2|4gMwTD5icwF}%i6z>x zSR8DV14GmvC+b9_`URLT?PyFkqESgaDks{q9B;|8>_1Fxaw73aG|`Sm~^_8$V9 zR=LfQZJXStP2cI+Ciequ@}Q5vrU%>Ox!0aGZd2a2M=^U18nk|eyTpy zPu_d%!eiy9>jiu-4#jjBvxkgbLLYD@mhT<6nC^GRmIG+Nx&AK07>S3YI*c3m3+943 zexr%@=xDq(Iw}W_j_N_@=(2t60*9iy=!`BK=<)`K+8bQUy^i|N-*$z0Sk~unJEA<_ zVr6+h>;vD0O)79i)@8THz;yx1?Kw>IAdJ)esM~Xd=8w7I?gR7V1VF$fngy7h}>vJmalM%A}pUV5; zP_d5%m3v{2wvP)w6A*d}o;$Y&5E@5r6Q_}M=N5d_okyc4ei@CM2p`Q&JdO1X^yY@= zU19T0yvDlT&*Hsv%a71g$oP*>=T(Nq+)zHtLix*TfOP|=)Daf(teWAu;{u3JTgKZ!`kNKa_EdI1M!eZVhHH-H| zDb3P%#=K9I&atL%$XMV$(xnWGxt}ZnH(du%UcAPznESbF7WeblEbjByEbhhYX%=%n zcipn}`Rf+X`RkUA=iX+v`@&<4xxV?f#r4A5;0LnSu)Qz7ZE>EvVR3%*hQ;~94U6-| z8#RlI%#C~3rbJell@*W2Xm(aN&2yfd z6F!3 z*{5(MhdqFB?1p)#D0rL!fj=$*zozhoQX=#{GC)U#Gcq^TjS5m3?S}G+E zpG@gIJj397H`p=)=mQ*frUit)_n8F$N1UgH;o-C}GI3hyq#rOAr-H)BKu{2zX>%-w zVJrmT^8lY0un!RKdV<#(5`@P9r$Ry}m^R0D_+;7|+jN>4K4ayF2hxJ~R0!G!m+@E# zgij!RqMh$B@XhqjhsT2Qc?qB2&UXa(<}e>)+UGdQSoSdt<2YmG9Rm#5#t06u508b( z*EJR`H>0mGZi(fN>m88}W}f&2Cv+?}0pcjn>g^mEodoOAcX$5Hf7zX&JK_dat# z{tZ}DtlzD#v0}U>_RH)(w=U3o2i%|80k^)&vXa5FI^!1;Ya(tZ%9mMI=3OA$&+2&Z zfF&5kdx?vC=MP?z-hg%Y+UIQ4hun@k8%`LTet35lFbL@Masumh3(f<8N8Lh)6HOoR zCq>H#qR8-PtUS+)0@x?8PR<;2CyWv1+ei2Y?0g4-@A2OG@R(D+0O1SS`3?c!@!t9H zm{Y#}gm2K!Hw1iRz4PHQ$9z6A=6z)S;r9;{{wU!$*OCv;AMDp*=x3t$e(=6V?LI(w z583U04EUyc=d=4YLil2KzQe#b(>tHtuLlX=ke%-c@XcX9!JJ2~lY%vm1c7m#vGOin zFy|beN7Sz@@vDpYwd14fM6}k4&u6U@7S@T6k?~q`?1N{3*=5e-v*xwW9WduPnDYiQ z;4|C`+`)`5o-cgx#T@uz4s6c>8gsi%GG4g8;$(h@2x!JC0l$L-W{}{RHq#}7|8=KR z!Pyk4hwaHsIw_OlU4USll*Qo%iG0;9O-?4qljHC`tTd`O8rAxVu`wv<%}g>|DUF>| zb6TZrh9N4YYO|q@6^oZj$E&r_buDKBX?rK&b` z9KulQ6{Vp=h@)05)s?JTR_e`awNh(9i|j>psZd^mdQHh`ndXvGDCa8*1QZk%LKBs` z0(RyLMXf$ouVr^wZWi`n?p+V=n#148!wq;l9zYve)YVzKfv?n%jHlyXHe_mj!#X`~eL3jD%AQzoXy zr*)cG}ykBso5z7OtM5;IY$L{n`dVB z60R#ZOA3yZ?o|i>)?$x38Lfun+E!#9h{(Dt~AHEboxdEqUv50W&yV^8=GO#^A zx@6|~cgglRw0p7fAyH-J^zVQD$N8cB-eKrgMS5=0da6%o%%2%xZdy<8cNDwzsr?l3 z@d{h|wm!X2Ql$4u+Mjt0(gNTSc(9E2pCSfQG@#C&QuYGm$Kb*G^u9}fH$i`gxBYuN zs@G+}4AI5!fD=96=cy1)#W0S z=QlAzM;wd&jjxK-|8)MhK}LN~G)(x-uvNNl7eb!)OU2h9vzz{#BUX#gyDGc&{|^hL B-Rb}U literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Token.co deleted file mode 100755 index b0810dd6b65dd8e332fb5439617011f8e5833838..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11272 zcmeHNZHybod7fJ?xyvPQNu78UNm0DL(~3G#7ImMxJ0*1~N^%rHb}q7OxQ3F|lH9wc z#V_zfwu|JvcgGwZL%p*MJCKYBand?Y>o~Cyq;OF5c<7HJK$`-B|5$)q=ugq2KmoUD zi~K0WzOy?+@u)9e^{asAu=mWoGw(C+zVGbJEN5Rief9~auP?zOE7{O3)JO6d5+wiY zgI4CO9m{5s6aMc)9^?Qq)=gN|cj4~{8AYD7!(!Se5p9D6X-E-yt!k_XCLl1tO2O@z z7_`b1JN2-wi3q8Gd^?t`BHE%i-_ak|`;h2;_>|R1_;6YdLNv)JH!-~|#Silm4*X!}(tNPs7vJoT0NKR{^pXKFs9SJe1L ztz6J@l`H><2HuaWi>0!XF4hWF>D{ZDgn)lk0`H>|1Rj+DT=i%^7=2WN zu}3ACeJH^yjd;Rp%63NlC#jbeGyd&XrykVmSMkMiu~t$l+G1WSRF#G7OIk+3KK#uE z^cUucgt6DIR&&|K0{_dqwzYCri}Nh_Q@E1mqPEmAIN1l=j%`ZGX zrew2!884Nym!P(7;+;D{@5d|chbaD$Rj8D*xt#S}q4(mIm-DG&?&jfMyNh4Ud>pS8 zvJ1s>-ZWmhV}0K*;dubxM}HiQr%MbWb*C0)yAl+^U(%+%EMOlEc>eN0`LC04{2v(@9X88tON zotm8Z7u1Y8mr13jX6~{a1Q#sf8CpmDNsFDh2MS;7 z`#GwEGw6ENS^xhrZ}d5~OuHG!7QQC~&|Jc8j;#PL14_67>)0B_Wxz39wmY_P3=)0J zRsztlGh3$s*AOH1A*0!6k1sV56Mbdta?@Y;Z}?mO?@dikPWlc1M$7MiV;g$1xKr3MC1&qK(D-f{s0#}XIKw#}nC^PloMz9q$+XRi^Mk^S!+61o|!Ij`f zAQ&+G0qAwvzroy(?*SaE!Qk4Pz{%7@8=+Rn>?dS|Hd>*O)lcYcBeW9Q2!?`&+1GM# zJrKNV*w9Lofj<3EW}xg}39dJ6XbV5KzqT=30Lb*sns?pn5t@DIJ57X^oBmfl>u!(R z=r{UTnuv{Vu#RY*Wu-O5n|@{Ok)=SxyDoSI!)3TuTIlsn2gle}8D6rZfg7uF2$hT1 zS9o4V#{6e2a@!3H*%ihuamagP8QOeD6lI6Ut*^M7VzPQwYd{9qHJ3j4nAZ*u7Rru}kcy@7nuy3a3P z76jSl67`wVDv^HBpZQ%ivGn zHRy{qe3;88_YeAX!RA|Pp+7PEhytJ~ASU`$zN^dUq4C2-PDIy?-baYlK44dMG4|dqYt7 zW8m#WmOa3O?HQJ04M~dDB}o>CC1~>===UMlkU!S&`=fQgUl#ZH^*zgu=62mB4AHcGNPlp=TI!xh{dAySWF%`OzmM%JsQtblM{UiHa*xL=iTKJy1@_=MqBlPY53tE)`l<9#Ke_I*3&+Ys zpBM1GI26$#W)F#7OdqtxmhO&QMEBStO9N=P`TVUxjKl+B9pVQ5g0W!6Z#32z9gWsU zN9BP-qq@&Fy3`rFfqh|Jw1t-pw6^I$jZH_LUq{~ax9kB99_#bB*f7V{8Cl*9^S~Q0 zNe>*9b(wb>xNbn=od;+hgi)Fw<(&s<{xKe2Yj1vHAQb2$$Plyyopmtx_#3Judis-#4n>U6X7Qc6Hj411ATYX`Hs;1E`G+k z?$6@Aeba-`8D#wSlUbEvF*lUYvQYlAI>7RP33Y_UeTfLoyPrV*|K-HpOCFK>l19x6hVY%zs=1ey|PX=Q01|n#G^gMp$}}nEwfFf;D|Xe2@D& zo=>v4kH_=pSQ-o5S2CYsaUW0Qtv=?jFf8VN=8DDr+?5d)bDqCq**t&E;(6wpW$Sa- zQtiHQOflE_YZlkJx4>TVY+=r4-?BKLf6LR$9mvZZr#B-jO9Jt8vLzu%iIRH zV#$0CpD}0*uGAS(s+;?a^RU10xf6zo-j*opuvEgP(k%OkJYd*wr z0>ZKz;GCl1bPfny5b)4|&=P&Zm;U_Yrf|Zyg!iNW;qx)VX)7;ye1dauKycc`r7)Vh z!S!2weT2{6$+s8y4)@NtuY->ve2z}OeZY65cfLpmpN;TwoqR*UceHoDA;Kr1f4dR> zQWCiUBfp;6GAS=764BK}LLNSm&^fq< z!S_7aG6LuZ9CjuJgx>yCg8w7VQ^N3YQWzOOCA5+ch{Z{tFf!;91Y6RK#W2J|06usb z4Y&aN0R^W|a5?>gupe;JFSL9~Gq%Ggl2&Y!NoM%8l^-5V3a*oW!3Fz691DT)34~AV z2aZ1AZR}2@m%qib)!ndcB zZwUB~_Rfc6j``dq=KbXP!}lK`{9(dxK1*)%eNM0qLqFra_k-s(YIl(E?(4LBKk!ZV z&e!SJ5aEk-@*M!a>E8J|{koU%4R!Jz1il%}Cz#{NenPOuksvVk(^lTj3Fer?pjXMacZVOla1 z{7`rA1c!}M8lTTd%Y+_!uskzkjA#^rdfOF2{5;o}U*;B*}Y4m8WSXk7`N=dEi_zXy?6qTwD zXGG;vzM`bnf>NoKO2u*&TBI+ki`l{=)N4vcOVt*YY+<3Oz^RO)!s$-2qJW(X*_>86 zS}CWuS#B0~VD7!W*_Pt75ZL*)&sAuJ6;BSG$A=^xX|I{*P@SH?le)u1u6MORPUNq5 z$)6W*N|~`8JXNXIJ|_CUU(?ex1nc``Q#{<^v-CpRV?Q zMdV?4Kj=)qA@a*zJpVxC{~~p^M(A@Q_wC;0oPjf>d-4C?edJ%ik6gQt{9m!0 z;Zfr8?RW4p$nSN2_#=?t>-_NVvHTwAhbUesRy7odLsB?C~#V}1^TxoN#b_Ky_1^{M?7hd>vm`nEpZuTrG@Rob7qY|;YY zLAbDt_P<5?r)WT3XG(MyAb%7ttWWpT^!pb2on!lZMyl6kz%YlYaL&-_>7N5k1+h|CB{c^}E}5m(M}-ZW(5v`WD-r|J`j> zsOpyC*1hUqBKl)ksUuPSxh`^Ar1%Qx(AamVqtEfcf^>#B7W-R*i~66A{|3mY?;+bx ev-_>=wYJ2{(0-}-3S{o3e}m}%wyUyR|9=6!oDX9F diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=256-quant_type_w=QuantType.per_Token-dyn=True.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=10-K=4096-N=256-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=256-quant_type_w=QuantType.per_Token-dyn=True.co new file mode 100755 index 0000000000000000000000000000000000000000..8fa7076ea812ea9db5c0d5e24995586b611a37ba GIT binary patch literal 14224 zcmeHOeQX;?b{|s94=pK*mKB0dAUHa+l`w&?1Kdk{a}n7HHANDT)X<7xn@Evp|yKiWF$W zzS*6TRF<_yq%d{jbEBL^e0-4W}HFF)q$=d4M}4dC+W$sk~pNp zvQg9tzgti@(t{Z5jw{7C;kSg0qLI|YV%jDVt%GG!ks`7x#aIuzfS?1))wdocPRUbL z>*2midr0}C>#?k4k%TMZOVu70eMR)XdR8t)6qOuW^BK|8LP0z2(RQg{4&SJyv0vKl zC-w)vo&-JGbw$$NFK!FpqqxHx;9F#nD3UBPK-~q%aDe+CJNWSAGYFmfATga5s|o+H zOr@A9R4@Do>Uqzfm?~FntL&0e9wP8QJBdH$8yE=`$`odIg`z1Dt90--YtRkZVBvnOTgVNf#+@s z0(VO=dbb1+E_aB7Q#52AQK~ZPka;)xf*=RbYOO=JtMxnoRHZai7OI)4=}fUEWOFAn zX#scOpD&}okwY4SceOZC$W0ZEe|?i2D@^{{Uq}?EW)f4Gcb_O{iU$sH2R>dE3CA;) zYOYkgs70UprxPcHY#~t-o-b97kq+P{8F&baiRsLWS9mTsGF{3Dp=u5IgsDU=GgF>C z{*{?tEH%wvDD{WH}}RY<=8X5>-=^gea$GZ>%MUi{O)eXQiv-T2V| zeByW}TPeW^U{{X6Po|U_I6qVS9pt7`#UTKE!(1~oG?hIu775X*hdNh7<};X{HoWk> z+0y~D7+F7_b}H3;09!irNFo#RxFSZ+VL4(~=Y2ZwVx+f&)a7_igJp$JKfmUE;;C!9 zuDFd9q+1L4tFA{iPRiHxK}u}CtR8cn34u~;}83I<2U$QEKf@nt1DNu&~4 zKi5}May)>e<3m&j=h5|Yqk5Z>NgDmKTwkk~w;B?l+=UMPvK|UEvG@!l13MpT|$ierRB3rXBh1=6rtA`y}QmBNx6IW@6hXu z2)(+rRnjkYqWaPS{qSeF9@gp8YG-xmi^C~(OWWuBAZ>OG zNL|v7`6hZ}QE$+8oMN<`4)uI~3JlhmSaF_Zc_dB#UWc4|NkN?fSwEkH@|P_ZUem1= z=e1h7js`i69Ml2q99NKyE2!z{wutjkA3@%LoF-fjXef7|U22-m?q;)znk*}0|4*F@ ztkWr(ppQ-Tm*6igjQ!=sPV^Uxy0fl@2C}>Bb_ajfZ05VV%;H(-V;A)G6moEq!znhL z1H9yPa+W@)x8ZcU>rN-%z1=BVI-IkY9S*VKfO3a}>*}+68+OcP=ezstqPfF9+eGik z?K#m~i`&uLi%s+^$Xk&e#%mnLOG581zQ#D*l4C$@3~;Ir7bm%Rv3r->+i<(x zb+?y7~^`wj4F&}9cFHaHJ2aU5se<@Gka zUU%K=<$L#fq0W1tuR+#9&fDNPcb((--kluC|4Q!Tvcn@bJU(9Xc(~qOes9C?ch~)X z-Zt!qI`4z77WEE#ybX`XUH5o++a3?dKLFhh)C51oAx0Vw(8s>m#d{k(@2>MaZ`;K~ zy$?aZ19c5LybXuLU3WNmTfalJ&*_&Af*+dT2RysI#<<)~mrs;1moLCeJ|Fla#7Uuu zXp6?YjabZGkHvWV2=xbp>Y$|`&E*G~Y8t^s(0e3wR;B7(b7X#W7c6l2vmm9{Gw+(_{ z=JiVtt@_0%`{g{?(wmwD{#(wE)v5We}BJ2O_E%j-fSX%U{U%F+#e04{^ zbl>C`{Jhxc{0XtYWeA=f-<{$lCiZv4W^aCOe4?$xH)}^*<@p%ml;09`i4eD9!v)Wx z9KR87BNB1fBN5&njfjqp$gDbc?Sn3HONVP#LcdsaqF*jL>wyJiJ$g;=G;YVJFHHwJgFc#$bOwiLdbBg>r|Es2 z!C{&n?+l{xl`kY{N<&afu!A5&P)ATt&_K{gu#+H5&_vKou#2FDU^hW4!7T)P2yP|V zOYk0oHiFv-+6nd%bP(((=p?wE-~hoL1UZ5`3AzXl666W)BIqW#o1ll_9)ezidkOjo z4iWSd93~hb7$g`X7$z7Y7$q1ZI6`og;26Ps3EoF=AHn+xK0xq6f=yWaVf+z|#}{Ca zfY9QK1@G5MY*Lw-Q)DgDS?}g2wI=-pAXCFf9Sf}!?afH-yPxpEVeYXPpw}Z$i<0FXsZUx>m4k3?>lPDnL z(`Z!2BWPU4qv#PC<7iUGG&&-qh7oJkgxYb zK7QsfgZl{i({{+G&m)<@BI5yk`ofzG^^p(q zM=pHA$i5`85dhiI3!gC&!1#qL42_jh$fqy-f!PNrUifBJM~)qkJ$_EhVEYcA>ydSE zEMfb8c+SOOyPi82QgldcVcULu?jZ)-lstEsxp{sJ(fKii?aZD_GT6TSxymXJjzw(a z6bGT6@NE)hGSeH^RU&L3Y=>`Y#|!g#5kuQP;g&0bP$ z&0iw6f*y`#Jf^i3#nuxmJg`kP zJ9}NR@BH;O`|z`b?K^%|vG2vJJ-Jgyh7R^+&c@*~*h<2UBzxQF~v9KSE#xG2XxoF|U%hdK!Q;7bL$UYbjc=H_U{LC6&M(RDP2}CE z7v7ET!S6=%u7##ed54nU+pLqzh#0zV0Ke=~)|V>afiEP~>w1d}zRY@DWDlgLmgOzJB1l5A(%kzQF#t z!WWNg0~1O*&>d$@359P+#iu2FdNtp6;Jd$lKEH~ugYX&Dd^>>e!S?xv310yH=jXmZ z&7&ZoU&y0|-ol$;pY{_s_+pzyl#FR{E}iAVhOD=d&ARK^EI*e{yHBOld^nR94RD9S zXGZ8t1kel^Hf33a-uN;E^F_oYvf-UsHnLx2o7vm=ZYs`35^*-rk(I|H3}X=hzE0o^ z0(JogOmQ}7O0t1LzynFP8PCdN5#FCw#v+^5h9{JCIFV(8Q%N@15ue3l5g>dl;j^gu zb^+f+`+RsTD4&V&b*uSy1K-2#^Wm|O`C7)JZp?~vS&@qvL~lbB-E~pq=U@y@!5BoQ zvZ4WQ>~kx$9R=(K#Qw&<4}ELo`+Qy-olw%zM4k;E zyV>^+!Z)Diivr&>?enR9-%t2nDN$V0UJi{!jgXi*A zzCUYZ#W}Xc_wZfs6nvwL!Z$ht+)b-R`QB&ATjBFM`kB?rVc+Y(_ouB^KDxy!Hki1( z9#8VdlvRw<_gff8<1wpviYahXp(u_d%HBq~?5>x~e4|+8M@t~vRuu8tJ_1{GyjRp& ze)ea1?iaA$ocz7fH-Ma)?-rXqw0*16eoqf~x~GRXZncWIJ>16WLXX&)hbcl zfbzgrkZtRc`8#jJkJlj%d}{>c0S#Ri*4Sm0*EQpq71kk}>|@h@o8n`eO>3G^(xz@3 zYusjKjVJZ=X-St_>2t-D*Bq*NhLg&qvoa zGnq@f$hwC8=^^|+!Y{8;X84-`_Ule)XMg+csP_wd2=9=(--Ez+pnX2IU%iCSujbnY zd=u^Ssr|Z_@C~c^b_3tT?epEvuK~gzA^h_CL;bo3+Ih77cGP|i65gn~-+O`Y2krBz z{Td>CF*V;1@IBT(pW3fs!Z)Jk8wS27+UL8SUq=c5KEf}rE!3|;Xy?iH+fn;@?bmw=-vesCDDXYgKA+mJ_YuAa)qEqs_an?_mghbFezP*~nax`Lgp$^G zn_-SLvwE2Ks9*JD9x#x3pxJ9x)(xvonYVj;m30FS!JO`8c@wNHx)j{U*!63G6|Vtr z$ZLQO)&K`y1GLI}Qj;|+Tv{!68vc%@@6d{x6x_$O>*slPjYC5?%>OeF-&G{|JIEY- zpOU`6zxToSG4Op1+%X2|nZ0WyVS-Or_`+uo@CAS0m#_2qz#Uw$MV7DQ1poM5vxJl5 zq?{pgk>p8s4dAdIm%_)BIC4~m3x)gve*n&La2~N%D_8Fw9)^rKlk}%b)5C`o=}f68 zA7d&_muG63;X>i~^ia7{%4bqFu2M=3jrf6q3&jG#2se@(4FnRwu}CZy3XLU4BO_yp zkcrP_78)1n=c=hYlS&NV>s!h6|FsJMLKlhrU<=QiM4ld4U;h&#Kib0c z8Ihk<3}02cLga-Oo<9)zw}Y!y<-{L}{O@Fj^y$$oXm98ou6w`Wv06`xlUJbe``2uzZ8_bjV*U)iTHr z=LX?GBJxiaXZ&KKDkAcW<+Td(S4!zbErI;C%!!&@8=o=+ai&ly!NibG!?b|>$!Zn( zmE+0&R8`R+S$qx=PG9DxXQrVJ%yjtFC&H&7Yl$SF2+b%d1(A@?)j%UxM8fkTOesP# z19OiMjK)Gj013IQ07srPI6oo;k&sLZ#gZUTV8LhzrQk$kt_Y`@v!z0z^gQm}`YI6> zMK}-^LM`R7mhx~*`DjZy@Tt2HQ+E;bM_bCFJ+)ckSXh826;Y;`7KmUSCvdD)ppIz! zysE!AGcDlrpqn|!h;{IZ@=fX_GZlQAck_z!2&RQ(qEg9aHr1-?Fs^)*bu&GdkIo)X zr8cc2pCzuQDv4T3+>A0jV)&@8+)?loKCHW0^-3l+BOh$utRfum&w+9c*7iaH;S;{) z8TnxJYJ9ZFNZo6Qo{AIF`6i=s!#uJTiTB9@O+V{WQX z_n{P9^=bbp;_U$}4XgTe-%FA1due;}@=G&-cw2yFwEc0?L5dQTsjqQ$2GV%{h4tya zoSqM-|KCXBpiq6abO3KNur9U%Uv!8mTk9sEs8xn*x2uoG7wg04|8*DDhi_i% zE}EtI5|q-hS62+|RdR9SSnThea8v)&_-}xW`d-mce$V?Ad|fSsG;NoPFF;}={XNRY N9=T|s6NOg&{|7=n2V(#L literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=128-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co new file mode 100755 index 0000000000000000000000000000000000000000..9954e2437a6bb983acba0a7f1a0055a99fd08093 GIT binary patch literal 17688 zcmeHPeQaCTb-yH^`0@BDlA>surfBAwrfG`$v_vUxDy4Rp74E9JX^?qF7)qk_7)zo? zQjVK>>XXETb>`T0FmG`S4_TMCX|jH8*pOg=qyz}C49y>I5uiew!6-Ihf3!f?V#9y| z=FYhv5=lvpWd7McgZjO9-o3wf-Z|&qckg-c`b($Jd|I#5jYx!T26C6^_&&l2-~S2U zIn#o&6tTeHR^lW^IFEJt>23JGoJ^n@%HoD%aZM^{__NLn3M4L80PBGY1QcM~fqK*u z>{wuJJ=9e@!Owr89_QI1(PX|;b3Ls0D_-wcS$19^FTG_mUQZ7v_HsROU168A|52Jm zyX;j@;j2G{dU(D4Zg?&HrBLA=@HOrvf&7q|pzJj`2m_9O_SEl%uG%evc+P$IG>rwesg*zKc1QS zRx&>|Rm_&iH3plZf^2BtVnu%VQMN{_!t6nvjnbY3H;3xgqtOZ zHA~RlEJ3PS0tlCniGvjpGLNyMQf)+Va=eyIn+&?Gh3%QGMepAVxDVNH{KD_Q*F z?09}=qjT>UOkS9LE4eV2o5~kv=?$c}?m+hs$%SIJn4bJ1+(vF94cbp!co+P$sl^Z1 zf547v)%afW+04c4R3Q%@KvRxk-D) z@%{|DryV!!H{Ay4#rX5%Qj8VD$49HdNHpFdk1#PghjJnl(eI0X7g78Iv@Bfs2+^Tz z=B+z-6@6G9Pb(G z?MwCcq^2@EDYHjeaynNU&h+;6oamY8>F!GnWX5}k2Kxsmdk6Z*$0t&mi4y|@eN(+X zJ%dAiJ8^1EIL6Ou{Q0?4(GC#)mF|0_3%bm*YN2ckFY>1&ZuH&UOKtB=~jO$@M z2Be`O8P~h<7%&A$T8!%>03$m6`eT4E5Peujv^8BzVsVW~(J!rET{BDibvQ+?OU7tf zGV}A1MSZpXFW-FQjW^nt+OMy)`(M`kU(wo^4_vDl$U252)@`piuQ^@vnvOibM#$1y z`xWOkn=_!<*%ilh&> z5ci!G2yC5IuP*EL3em=XrXztCjiHe&(PNXwP`50(!WQDXvjlZsaJv<=$E7a2T(q8< z1mqE@2WZ9mfVOppW`oDAE<V>v(vmIHKRIY7(2^XghvmZQ~bl~ipuh7Jcq zy`CRiUnay)%vPIb(d=+5x zsaG8irPb+Bufn~y8uaQ4iG;O?qE?isqA5z)eOQTC6eU_#6vcByQQZb*@zqFFtwdvr z7LA5IhokXIG#V{Oql))vRP`95i|dh?T8YIKEfx!V569w_SS(tO#gw*VG1Y5`Emk8D z^=a^3B?4t4k#OtbaJ&*m{lZGy;jrp5gcqyiTl5}mw7RD-y3nBGGar zqWB()sJNRDnFM_X!rDMeZ66wrSB8h9<>6r^@UYNF zPs#%WYWrX^UP&gSpqmot68acOHIZO8N+cC6kqG;S`s0=U{%E=otBMjkxiX!RHv`(Str|lbh4B0gHDzE>SN!o&x&zP+0@5D^{IkBW9a`q z*~xJqot)H8^T>5yy`09SUqGIUfk}tq}pysE(VB)^#kA| z--(0@egk_!EKvWY;+0e?T27^uz)(sJ8d8h3z6*3FRG%TSsF6QkwUVo=*0TK?ah<>0 z5;ofqjS53kjG-=Jww0xR9tYN13CQqm84Tql2T9ql;rJM>j_gM=!@Vjy{g< z9Q_;*a13xf$T7(A5XTV54vt}thdD+#KEzSsc!Xn=<57+=j>kC0IUeVj;MmDA$+3%L zH^&~1y&U^E_H#VJae(6>#}vmQj>8-u=6I6hCpbRB@sk{@R(LEy+Y#~hEIexnS>14l zLC9lI$UF*@VH^sX7a#Ue4r5ZtTsSBM8OA4-jjV1QfcUJ9O(AdVC!{t`h1?H=hT51F z^461t)W$D9(+PPIe)l#G!SI5ZDV`!OijNUL#WQ5&-bRp+G4dG2$H^GQC&`l(&m)gU zuCF>@mDhfXeN}^9*mdr0v=j0)(SGyeu_e8P*Mj3o3639Ma!Y2wCzm1;UK`}94UW$* z^-3-*f8|jLbpiQ|7t3EcEBUefm2nAm^AR#1#PU}PQW(qMSe8&%ke3uJf8%8-hUIU3 zT|(VKekpE8)F>{Bl2*zx5+2h2?MkTsn#6Zv9$9z0W?cmxRqw?k{ht)J>-2+1u}KUJGLh^?c$s)AQWzpVMn$Y@xnS-DdjEzoVB>&(b^0zL(x% zx?O&U*>(PXrc>#CrqiYOnNFAAFKk{5V-#&Ve~0N;y7Mx<7RD^v^wJ%s=jA(0&-sma zCDg67!F0Q{!EAVWgXvUwkLh&rJ*Lxh??pDRg|UyeTYQh{SNPyjdM#`VXuFFaFg>69 zfa$sT0kd7atWhRqZv%qt5 z89D(%$JJgou7+nV(Fn1R{N%?l1oS%qVVLK)8v?S!5R~m6Ku=Jv214?~Z@jhU91Shv zyye%SkluNk?K=;KWJgC3$_Ex*r2mc`8aeX8q?YovG{M(e!>8wb#wPgOHGBrnXKI4a z!}%QKmv>wb#UaD}%RBC)aq@yOpm@RFFGj!?#{#Nmj;l*?7{i(p@fAK+U4k+A3Lk^p zVI1xQo0&nEc0d~-jD;Y({MDbdIS7HVmF)8*&^F=VIcbsPXj?SRm zeh7V$puX^dFVLqJ;Bx}n0nx8c1NaR5>hy%AG{J|y5PVk7=iHG`PxaG{>Y09eyb-}AwI59Y^e)f-^*$UwqHY=pex^@h>*Ivd@`fA+O`)fFiiEeAu2 z+~HMyVyp}E4!P5-F4MUejDrsa<10Zd7gSz=akZl>s5*`XRqQW-j`kldfmirwwMfY zCff~O>USCZgnX@>udN9_^t<45b3R`aeCT(<=iz+qP4J=Lk=Ln+!20%PkN` zyVYKE3rc+qYVMGF(fR4Kj`U7Kj7nJIwixG{J}d5quHO7j1$M{Ui7u z;(SM&;6wi)pUmUl%HtmIKL)(mk0dxhjeB^S*zEyoSK6P*UTRl))XVJZ_3Gu*Y@hZ$ zZQJd7jO*RmgxMcI30s`!$Z**!|jUdgE!fZ?xJV=4`SN;%GO! zR{bLGZ7lBneirv&*Q%dABaCe})z|^QPTQ_=Z{+3w5%(`?^lbt54^=wH zf%%J(1k580jRwfw(|HaYo8Wx`$^lJv?+y0#p1>O|dwujhffe5qz&Ozg_qRfPVa?}P zEB=6@LEE>r`QsJ8KU((t75f3dYBTs3U-kLaiVx1iGtt`SjaR(L);%N8nkkb!F*MlLo5>gZ`6asysVJ#X{oyWG`Be;(X zeITbFjjGP0p?D<}ik3qm#oZzFft-FY$a4BHhSq|?u;bX)yA|gVp_73)jt1iuK8AK4 z0-d(y^n=l_>{%zs>F=r&S|0>pIL&oTANu?HwH<6@2zFzKg4EhWO%keL^bC6BpfLo&)c{97lpG_zmm@u|WN| z^=`|}-)+_U&V2~xJ}@`a$P24RQdu>Y%`B(yH$&5g9BI&ODogNQjpy*)M-??@w$Pl7 zJ8ZEW6Z?wA5*Pb1i{-f3Pgvm7vfu6;e${ByG=krcG5ap?9JUR{<1~K>wiYrO^FQ3~ zR2E*{XbnSdTG$h8F64H|QP$dD$XzkWO^fp23n8~c?yuG_wRy5OLb@?u4A@V?_cpxn z2^on|JW0AK!e@dMPhmZcTv@e1F8T`Qq8sjy@!s2T67m@4bW<5U%|*lU8O)WQ%&@%V zNG2lTwP4OYhB?Vo8J3ew!u@+_;3gfc#0!>z>LLXwG;z3;duP z$j>AHBU#4(XqM$-Kb>W~!?TZW=9ZE7k=e5nwGHH!(HEHSesq@QJwH7Q+?ZR&b^!9R z8!V^#_zlMWiNV?#&zx{*n#_; z=CyA!&a<}}=M%RX=egUA^Qqef#>Hb}d;7GC_GuNqXR*TNie*#IVuj0NTXGgFT%NF4 zA2e2$EEa8HGl%*$`VQJnURm{!TaZHyT&ubpbErnG>S)ZN8Z}c*Zq!7)Z0rDi7Uo7R zFs{VhD9%qYH)=7sVZP~=Elxlu%un6-nV)*Z{M5Z@gK@gW&iF7lihP(GeNaAo4Ik!4 zkq>jD56b7L;ltc0@?mcDLHV4V59g=0gK>Dr_RBkt4$O_Z6&KHqS`ND*HyTlwBJiDs zNr|uU?<|(!8-o@2#=s5Z91Hy1pbO?kJ%E@Sg>NuuZnPEt!uJ-k+o{MNXH>2#_n9*t z@yZ@Lr&Xvg+~5nmN6d{{0Wmj(+4>>g^r!wqY4{4D~jHIn(e#W6xn*j z2j#ts=nI+iVQv)pFgN<3eCP|ohq+PY!`$e0zWd}x+w`Ct=0?4X=nKl%;0u-;HHRgW zF}`Ai?~aT{Wr^oT4KU`#aedWgWVunJxqkeQ^In&Wj{l`rGvr2H(Xz{}NFK8)i!lR? zeWf-tnz3zDRmTR?VLqH_j4ygI^upv8zeqA%0jsALNgGvr1M zaQ~Pal>jj}YOoq*$!eAjPC%zwt{Pp`Pv%h<^OMV^H=kzvW|K<>{bb2tq<%MmpO6o8 zqsWK3(Ff&2zY9LhjUpfBMjw<9{Vw=0H;R0i8+}kd^gHsox!*m!4dVS{ZWQ@3H;Vi? zS8jE~v!hA7Vr~?9F*o|4de_<&bEC)?YJ#uUu9zD|KFp17*Y7^LQIkw#4&zAJRpNGS z;dZUsL|bvNwqmov=nCE=NYGYnY_4p=xv~d-ZEUVwANTmox%@L7#Jv&XULzVj)0W`< zzV^G{?`-h982BxZ0kLN1F5j^dw&7F{w-yHEE4bhWHt5vpc9G+M->c?vcZ8pZZi#Lx zKjw#90E>0P6SzDf%s-V6_x2{clilzgcQ~e&O7q3xt}ZxG7sitl`Pr_snaON^jxJ-! z&(1HDvRyMX7iT-?3;AcV6Qyt=KhZgu1cq?$KzC1ncyK(`-JR(f>K_>B?Hw9V^$!kZ z22+#So}r%pzR8KczKOof_|W)?W8_S34%P_`hf$$3XHIc7!m+NI{M>Z5ke<(!R9pg) zF6PrE6_!X8=4XrPiOgKOxG+DTFO;Cf#D&asZf+XRXVa6}@rCJhZf+`{h6M@f3@mNP z7t^5URBk3)>?#%}>NKYZ`%rhwf;;ZP>txu#=wdw@FU9<6BCj{dL%jTJ4f0Mdf2@}i zaQFn5-)$&=hRY!pTeqF(@~a2y#N+~(zt&LxPr2ORAb*j|`w!Lge3i@puA%(5x%`I> z<=^J=lMV8J;qtqO>CWb#pK|%*M>a)t=NDZ5pAF^zkIUyr>&wH~9X1pB_lCs>FlM0g zryJz({));k^&@U=kdKGWM1Ijw{xe)|KCx96_x_m6wFaJNxct?I@_)+Zt_JywTt1Vk z=l^Rie`UB{{(UYlHI#>i1hD;ARjV&fe#7M>4f3Nfh{5uZ0;=1d=JGlr9NgYTzOjqk z4r@4&zmexNyU4HXB7b)md36`Lhjx(LuX}VC`S($-Hxum(^_o5m@}1TXO@Vx;^+SJw z@*UO>k>p&ylqE@6Cj|?zNOF2^A*p7HD&c=oUMi4eAwQWZWk|A=y;P#5aa9$FvonP} zbcB*6n^rgG&Pp4{5 z_jUKBd+JXQHJt9PJ3R=cH_5SfU;Sw~tUKLbfBFPTKdVCjpB~S`gCgBG(3jp6W#=Y^ zpyqA{xw>#t+)UjCdg|{k)u>-j{q3b173`_MzZ9rb&tHFosln}c&;vKg?n>MON^0(5 z2z1=@vccrs!fYCsThqxXcMu-EzqwqtSs6WCL6sWYfu1(QCOD_0|3SITl~KY_%0SVF~1V_b1^Y_1Q2DS{;&A+ zU;IB1E-?|~(9qt;;V0}b;=ckiVS84C{nZ|3c&kex_xHex|-`N^w&;wKErRH_y#n*YmlV$8a5tU9PW+pDoGTfY+Y8!b|7kdy)`8el2v7Qw2m8G-`C_D_Mf!;1bh zbLZR-iJ~l9HvbxyXHdWQ&b#+_&pYSd`|dg9^{0;>f67)@*Dn!v*vb2(j-Mm+^Ydd* zvXkTWC`%C+{B0s$;s7z$?Ps?);2$`dK$(|Av1pTv)5Kw>}ZR>G) zAG;P__2313dh(BY$+_&j^|SMrLftoXQyI zvib4s=+w1;BmUQo%*bRuJvcEvUg-alk;#ut-70>=$P5kTvr|*)smaV>_RAxq69bvi zuNV`1xof*xIjLYBKAjb;FzGjSO#z&?zBiXP1$z*o?BgeIme0x=tKA+7` z#lI87A5W%pxqmPw^SSeIcU8r;Cm^pIQ>%l>_%^$kn#_%kvT;SeVoW_hHZU=| z^4M0JGtLdYVN8$bh9~l4RQdFp_I2Mirl+z~>7nQ05psiR(0uUR55YI9I{ee}x7k&t z65lkQ%bd>+=O@7WXvY4p^Vy&PhR+s$09PY}NttRW0r$EPBg zSvDRx-}&t@`|x@H#$Hw?4$x$W8Bn}8cvYO{n}|Zhye;NSMDYp1xkhJxn-67Yw*N)z z>F=HT(f@S)%Nd;S@MDJ$-n-X9_1&uF$yN)3i%2?c<|fmF)48Gaa4M4=$PNs2XEWIY znVuw9#2CpH`cgxgfsUSmR8O)u)7{hAc_7u%)6v_})7#mb>>V7=3}t$UpeBEXt$zfI zT^AdKFot+vZ7lVFOJdbImZ`=z$1*;X1xR1vcE@rMuK_hIAUT%X z@fwiYzTUB%g6sY|+j2k3>ut+N051`nR!6L*x_V<~iP&PFUcR!V7S#o{tbU=ptE)@3 z)P=IDzI+18+EisJMT$$d&F&Y23&C=b)(cv}g>o>+>IH9D^TFVy&%?E?*tXDCZlh(| zthR-6TN^9WcGGH`Yg-7m1uZoQtwQ^+zxLW|>a4n8+l==bPU5>W3uRvPcyx!?ZO*yfw48&u<$fp!=&*Y|<{aF|`cC3@p&ZaT>AkX4 zR+Lz|Tqb2%X6W(0Oh}N(UfFV4F3|077NG~g8u5#ZPI7y({z~L(i3DQBfT~|n6uqHA zF|UAb4KCT7C#nWcG)uvdZUuvyyCoPe1%t6-FsL`S1x>d-IP;3Cnk5zPtE$$}5{Q=q z$Q95VTLPwH56qOw-_iDD^2(x%{Ow|yyb9Mr(rP)Dc0wPt$lot6*wmP%hRjkZtXrXw zRLqq=8T)GU)PEfR^4VkBZV?g+(8p-`+C3hAD0A-MNt;H@J} zAK*cKc53mGrp1byrh9g3Q06O8zm7EQP~#<4jTKc@_q3{J>#SpG7wA$3UC=fQws34Y z95G7~UAH0;&?lx@v3AqDHy$s=`*PPJ&$^=6n0UWN(XzehLq zwvdJQ{F)rvEzWg06cgv|a%hh@PskzCzw&bn22mYH8%H}wiDNxS2S+DInWKxN!m)v) zn`0wK4@WOYAIBz+evZu?0~}j8svKK6206BI3~}7XG0bs0#|XzA9Lrv>1skFI}H|m5`+F!_h@B*&%g^)KS2&wc7z8?tr zEPU>+v_R-iWQyHHySoyAP?xk*JVcTdA)P_-2-dU6t;NtQ%F<;VV^z<`c<-)w2{}fr z|N402J*AW|3sGYKbeh6;u(?uscff2ZNY7iZS{>kDPbE=jGdH( zFR-mA#|9*96-(qZJ|$*>+DU& zb@Dps%f}XSp1RIB&s=Am&s=AmCvP*(Q@0uCncGaKXKv>iH}4x8$LBUNKDP0*W|3PaCt)BChYjIePuQjvZl*(OY~jO-{pRImp9|Pd@gde>}T(Ny1ww8 zcwds7R@uYW1at6yCm^e@cUqRsGvk@_UJZByR*Tg#$JP=7wuXSYi&&8A>i@;h;guRx z0O7S6bh-SB;tD7(FQ7M|l>Msm_-k)0H5^uFaP9d!Rkbx7W#y#HC4 ze_T8LrX@wq*{H7hoqBwpuL)*hy)zH%9Y3shT=4OO?PbuV8PE;rcj*ctw|>zCf16z~ z#qZUXW^YU>>jCNu<#2%cA`q}CN7=cuJD@bTpf7al3qSY*ed+?f2Eax@^lO6)dsKrD3!99=ucOt61^p0ABB#bUa$2iC_s z1164tpmXDVQ18F-m!HyZ!PvX_OWCcEP~F`{8T0HEC9Q(O&xMfL*1I4t`# zY6st8joCrdY`&xH+}ETj(9aZ?-ADZ{gP)MEiSxD8z=wVpe16WS*1(5;7ktf}ueAm~ z^gHF_es}RP=!54E0xyoIFrcqNW8(>e@aBe?;`3^3JZU~^SNX7y+12N>$w%3_ywj&N zH~AFVu2H-CxLw=0-rH-?yV9;9&KIeHuhOpDINy$Y@+tIr>kljJc@@Q0f0UiqH^H>u zpeXgwk2bSw+0Ell#lyym+s(!b*tP7I=~!{Xn37uIvs+%@@0GQFFY$hlG58i@W(S$2 zF|^ZS2r;xzbMbiKlHae(%`gUiJ{l{_#I0bg4A@{XbT2cM?ah934$9+u&_&#hSPsyG z=H&uEp_&VYcjFi>cMe;%jj{=v0aV59-sAjF*zhSTUK@ z)sEz99N)BmEkB^byK-Y3pNK6d46|f(>y}}FJ_j`GK+0_G?TeTC`eMbtK0Wlf&fl z#Y94H-3K~th~pEn&uv*Jh~qca3F3IrsdQg`8aL{*A&yVP%An5)0&)D7>;!TAraD0! z4?0aE*M0T!ZqRGNmWZ{AaSOIr_Zueo>;b;(Gq>COW;W_$n5}kWCM4o`@R8b+Fu`wN zFX#)@f2nvWm5LQpDLvGiGQ;-NOr`Ha`x2&VPe2@hY0*JSi;kkq;&{yoDY~=oJ9LK5 zc7{^I=`7m1^=&+^4;}0_b=gJZhEP&=?H1?VvMVOeQ?hH1I6oxASKI%;<9dxh|4xok zj=MPO9Cve!aooePo#S4PagL91OmN)C(crkBW0GSB$4-u29J@L8a6G{AAjcHPUXFbn zALn?87EHB;-Ftzk|Lh^O-@ieO-@rhi#!&&yy${_$iHGfq{=R=ox3ZogglG+ ziW9RC|3h1FJw$T~vplCT8-=_8m76rTFw1ibm!FhS7bri9xr(PR^BlwF0SR?e37Mq% zh0A$KL;h=X66y-_f{x{{y)3l@p1$^~gt~+LLISSOUi*%e#PTZ43K4`v3e}OurZ2iqf`l zZbtvlyv6j)zr*xA{|>X=3-2)9X5L}8%fHKXI{z-y>4kThPBZU*hqcAW5c)d*9@Fpq zdq0yl&i`&P|GUxuFTBTepLvh_9eCK7o=O`6tKKfMYHHuW#2=kSG839seH8KZW!F&p$Qp4MF}XZqCMGE$d9g=lPm- z7QWw_hwry=E#3$p6?DP;Q!60mpHx?Z#>*{`9By?PirSn|TAK}}oCs52cn*h|FT!D) z=O{b(>I)TofzOEfCodr8pU|&f7x)bP>TQlF<#2-XsfQDcFOjgRN7=c$ zJ)wB_MHDZ5&w{@2a6ZgGAs^*D#RkS=YLO^;af!uN>McKDvzX45R2&8(};KiO=$Vh7n?n~C`+NMreE{>g!H_-APT zN%AQ~hWwKqo*(m15+LTE>l|^H0c!`KL$aL%$=R%KhHT#~?mG=AV!s^H0c+K5c=; zLalbi{1ftG{^?QmuCy!WpO7z717D?GG5>^on19-+-+l5=PKEY4>_@_`61QtTw`*Ay zVOX1@liZMsQqgrQd5@Umf*rg#O-G+3&ZeFTQ(3zuL+9cGtjH z>7OX)i`Bqa>7QMkZ%++;oB0R(eLL{t_YMipPy0RiM`;Yu`9x`k@01+(nOB`MBqBG? zCwsZx`)bg;(ynpNXVkz~Y1hX%-~Jl-HnVFvDEhsd_4^=yM_|7Xuzv4&$bO&X`gWtf zUg-DU+WLJ5=j*9~uhKuAobNyle3ky`;(P~d;M>eU*zZ%o>i|5&`9;5n-*?e|?`U>E zu-}6ZH@ruCx!xbDLGMbt_Hn)^YT&E1>*Jj7!!_`2X4kT>s^9xOYxBJae)a&18V9VQ zY}??oUAo77k1@u@U)o@ul^=XQu)zm?;DbJJ-##GrJl)_Y4TK%| z6Er;P8IWVRVF?acb0gzV`u}owPyRAENR73R-8Uku;L< zJAAF(EEFcE`u6XK3v+tF7@Qc}e=;+aofxP4NG8T6rwiHrqoe1?_D$v|K9wCTX!(i3 zeFqI-&^iw!JG!-l1F2*()6v^~;6P_*??9^iU~lGNYA6do=kM+s8tm#C?8*%E4)p9L z$8+Pb&#F&Dg^nLT!qw2)_m57Dk7V=d$xOk-Jt*m^iFCn)Ju3Ohv8nW6W;{JLJvlj% zFF=XGbD5Fc_z2w3riZcv(!fu)AbpDh+xL=SH(r`=|1Q>olhq525b8 zt+^_N*907E`#Qu8=1(UX!c7_#$2NNIBt^0RkKc=P`Gu;8>@5+zp!sT&+}aVo2v4k;c}m`zWhxtx2kx)%H?|J`ttwC z%#BTtK^b@x;I>498}$mA0YDR)VP_0U`ct_A)X0sK zbZ$5e+h?=sj*f%9=_E-H45Y^=(sVLE*vW2>4~e^ju;DH@4*O(>Cq_pno*?-EcdWmCu;O-CvLg-rmCdE9Zg;J_odV@nmBrfK>u+;q>9gQ%(4~Sr1*M-e;q8(h z$iizV-38hm5TYzRAs19usaCgl9b<2`J{{|=*ITVn$9ik@f=26}r(?Z2dRNt3)3^gn z!k$8?jFJkSdqKx7?gBN&r^nK`&+_4F;1=tLsxgqw$=j7sOJXABlRGt-&lCpDtyhJ405`o-6~Q{V?e*a*=d**;baU{-RfGNQ zIjCBI#o6d6!F{rm({yX`YHz5LQGZv-9?F?p*Y5$FP}4^~`NxMp_R){5pR6C@3SXDF zEx+y1vj~;0~yZuU1iC#47?tydv78%L{P~p{rn%Ru0OKAes%fk>yuEtT80dwea5!-x4LWwZdJ>$ z^j7Pi=k?(yLF*1tpRJ$QZi%)8ej08G->;Q|AJD*WhVZa>M?-t#l2@2h^#2mbgze#U j_4v&pcFN+FRq=*ry#}}LRsZsbSe+%&(K&_CTQL4FshjGh diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor-dyn=False.co new file mode 100755 index 0000000000000000000000000000000000000000..e84334bca7b4fa69107359afc92a5ab48805edac GIT binary patch literal 10848 zcmeHNTWlLwdOjQuhod1y(iBC}G)<4SV_MnvukYH=C%oTH^HJs(iW1i&&h)k6o`SK^-JH{hwWvNC$s;V za}G_>vLls`%@Ic5IsZBTcmDbRb0ZI5Jvsjf=WxvNh$JVvi5xVKVUFhI1j)?1uxtkP z!T(*zj|327-8oWy2YyS*7>cwV7PCI7XcKJEh73`FRAW6b0f7M|?e4-5Ei>%U!?vQm zwEn@(Sf)kfq?s5uix{Br8wY8l%v+q9GQ`yAOjwI{KTIw{5e9aztIaBv#P6SjY`qTS1Xiom+Hlu{GO^;velnAexvG{OvR{HwQ5;U8}DcHrIeokKrJmUR*f2ZPp#(u!Vo`J z&*}NPp`FbYGruNHv{Zd@(Wra}f!HfSpjU!OuLR(#-h439E5T^51T(!7%zY=pDjPD7 zlBRs;kohq6l4cK}cB}b&wfbDmR!a4D=b`#3DOAh3e4b2A^nqG^sgNq=Zyefwhw9nPXKKBeTP#%y_6umY-@x%Z zwO%!(gg{5AkVR@rCs%DRr9hTlEF~oUvFbK?JZX=YCC>lo~kSsC^El zY`S;|06(zS96FR;d~tSioGm@fxemIX!Sb|a!s}+x39A_0Kfd*oX7>Z^atg?z3-TNh zqi3-kNrFjv)C2`*-gpj+`jwK z_M1YakfxcrvX-vrGTM>Uw4O;%%^pdO&*-!ICL?xOP_wz(QGI-3?8sPpY;7c? zPESr}#-}Dzsq~DVJ~A~mu{b_9Ha$DB755n;T%dIp-9NW#It0Qe9KS^ka3I?+w`#W& zndK135rY#paMu=K1QR%g4H=gKM{t=JHb!w7a0ZuM!p0oHoP*nV6!1LaWCya=9j^PA z))Ak0b>qUi$a5P|h2G$WM1vP;{qPdk8v3h`KlQagi(5(8}(MO()vG z!x8Hao%dh#2c&fe`sq4C%j-k0`7e6?QLEqDvl3}~FG^mo<+1u#TIlsP!Ob~Wc}{kr z;Pq7)zzz>*u5g@!tc7nJDC)8ZI#zhE>_&m>%h2YhKA$4^z2=J7Yqt|nRGfo$fP&NS zGgqJ<>$_3356c1FW&efsmLw%wtrlu=96_JwErcRS^hB*bu&srDxrRr%fPS&&M%UI{ z7vdM2D3oY~!pa3nQam2Xya2X&oSeCe!m<^Pn9WF3u_6)KI~YkeBauWS5>fhhM@+9X zvh+?kY&OGC9}df&!BDap!dxMxe=uZ9&d^c|{jJ?!6uq;y2mSS03;hk0BPeVM>-$k- z-9qoKy}^eQRyb-lqcO#bM&$+{m#w&L_U|7~Hiw53jp1R%_k&@xh2C4kle7_!oBjKu z$!0W~XhfrmZ%-8J-UnU>BK80ewr9VbY|3(?AYAaVj~ten{h?4;&HGiAzO(N(|=zw*-R!Ajbu^@{*c+jp++KM z`VYpF&3HTkwkg4V%pNCd#bV|e7&FaST(M#?*>|6kY${5kp(u*~0JF)7dAf4my)dcg8Jl`knEmAR4ge;CYCV??B9i zxPiZ5EZFfINj67D5{;1&C3xS68FG#+b;d5ZH)i^ru_X(gUlUMsO=yT0QQ-7VSHuld z`}9pm%ub`wp`F{UUs{yYZC;lgL4l zqv)i_Q*==jD7q<%6#FPj6g?EZ6#FUqDEcV|DDI*dq&Pq^L~)Q}nBow{2*uqLqZId0 zj8WW6F-~zG#g-sg@cyud5&za<-9u>Yh8J%P)YnWV#fS|1-ImE~H+IQz1YrIIpEG&C zf>5XbnLGd+OlK^Z+ym=$XI${UfY9sk+`2J{&?pjZoJ8`i8}M;=294PG02;LsKBn7v z0_$1mXKQ`$Nb5htbFAzA2=A>Ma0Gf3S^xFbjLz|x8_M%MlpoN8ya+g_5A(P$A3_UW zD4)>B`2cR8JI>?2LH<+-x6hs8Be;Do#baAQ{!9$F&sBKLf7Aeeunpv=G5x=L()<|^SieU;dF>N3~v3&$99J$;#QJ$o7aK<65^_qof2^VBuM`Sdly z`Rp~q`P{V%;i6;X&iUzK^V7p(PsR$BD`Hzt#tM~>+?JEELgn|19_F{5j+N!UKC9kZ zS-1Ck^mBW^3Zg4(1L*3SYbAQI6(swxao!LalvAAPp&ITl{RXR7}U zhdIwllJ^XRC21%G?So5RGg1pOf;rd*Jg^J_-HD zb?=@e@&NkwCDC~ys06^@uMdMS_6JQ%95$DS6*=fulB@JAungy%RXFDa;hfV4PY`@B zf-M7pUcjJJkq~s|1Fz?qPOenlGaC#04VvST3~3z1laLY#Dxlr204JU#dC?t^ zz&?q03U=I`5GTyHi}DTjz=z|`_=1!#+yftuJL4Omd_z6(;kaYIAdUL~orC!OBa}Zz z`R(}>gjn|I*WHwNZx42N`ZY@V;yv(n`gITG+qWg3WRD})F^P;LN#a~5N#5m_>@kPO z5%Vig{pzBAZF$*T@sYXW^^&>5!(8!l_FQqpoN^4nGstZ*?(v@U@;43`_X3Q23t6zI zEyEeg`u_Re2j7o@|L0?X-I?8_84ny;ak9Sy1hnrO0pG+IUXWmmwo{$r|J-VoaCVN? z!?I*2la^`mHo#?>oW|D!GX1Jr9v@dn)lv8!RUR>GwQ}|7;lofc>nSx|DjYtgXN*$O zzD6h&%JrIYIG;aPI8?5bo;T7pxl&3WnpS~99-kT=o0O+hGoz#W*zDxg)cE*pYG!hJ zR-c~97-O?zlM|Wr#6)^RPtB%|+=u3KMYyLsDr1A@=TA^GJ2Spr0Q8MS6nP{)g6?&c=|3Gb5xwv}6n&=>(O-R~ehqD^$Ko2!(&R zi~Nswktg9Wi~H^7$=*f&{$1q%dKY=jzLdF>fAlW$6L*n+f#sZt5E(FSbDshER`*S- zAm8e~=~FD<;=T!~#Zt{cD%`8Vg&k6}#ky+hRTI(wSYE3jwNlFHH65un(n6^kwRjWuP*Y|3=N;GH@-_ybEWUf#zKpR+z@U1Aqw}dr41!zJyBu5Zw z&*y5Okt?D!+}`DiaC5&{%I8Zj;GVQpN-LH$dp(($Kw555gC7VO+StVCgf_al`bbyx z*yif#uIlm4)w5mI&{;>{lR)-@2@8^zGGOJ>CZ;B|NhUIi87Amdf%YXdvVaRAHy;8{;JLF-HFBab?~`#ddE8U{dhH9(Q9dQ zC(1A_@b$B8DEJ9qK<`w&Vx;T#W&KVS;U_COP_Ds#na?A9YhA9}m;LSW(IvCv-zD2& zXm?`cz3+py6F>g5V~;=faC`Oozi04rj*mk2vJ2S#JEM2mfr)m}RkU^S9?taHIgQ~N zs)x^MxGz|F?ZnP+eklF1)haQ4c1~l+_A6G;^eK(~^8n1v>e;!DVYfcBpCR5>VJUCx zvvVXvc8+BI+1np003L(~%UJ(&)If$7)ODu0&I05Q!-MtNIhXz3q5bf(HN+jCR~S8xJ{Y@ScZ0k-W} v>~9Jpw-xEXPxh0{)lMPgS-(tt2{K#hKQ>HS{Bu`jxBmYE#(28t literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_2stage_gateup-weight_dtype=torch.float8_e4m3fnuz-TOPK=8-K=4096-N=384-BLOCK_TILE_SIZE_M=64-BLOCK_TILE_SIZE_N=128-quant_type_w=QuantType.per_Tensor.co deleted file mode 100755 index 9efb71a67d7bc9d719f1852920b0d50fb42828f1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11264 zcmeHNZ)_V!b{{R5%jJ?HDT-Q?G)=FSHLdVsT7NCc4FcJ*Q(vJZsxJ-F)XjQHuC$CP zlHec5Ii$);;)!>GVV#XjVdu`M@7i4M65L&a;))y!NJ3Gx_o4k@mQ5fx{NIE8NB}X`og&o_;jbwfLy@+_V%8@WZGtJksRFO!Yo~fiz@D)gJha>Tyu9lm1x0%rEEuxtg{8 zxkEddxxWi~xWC;d+!uZ(u)$m41sWuVw1`~L_N!1(08c)D`nP9(5259s>hoE%qNgqz z<${r~T>D29{3xZ*mC9PCSSwVO_fvX#uJT^}rzt&~EgO}JRw?Njow~zp2^rZ?!u8s8+v7&6SI_l2$S1=8ZyCo6TJ|vKsc`-(Ns~ zVUI`{d#!3cpPMU)KYvJ$70&)RmDdY%HGR(b(TgRc@a(km>?bXecF8DLa>c@R4t<)M z*Dq_cdA+K=Q7m7i2H-Av7{dj9-ncuUy*e~LUo^DgN)`CDIlXGsN;8+frm=rBvtJw3 za=E`umCCtGP}?%`!5g5DQkB*uN_|2Kl~OLBC-Vw@kgB{npDyO_er2cKr7mVaP1Opy z*S?Ue_k}+E@g7>i- zr+z|LgDN=QsQv;z4p;%%zCl%A#&RSfwqCRK5+i$}kiURL*ikrJ_UeH@-1o}QU;Z1{^@SJldWRou zlLMZulFhkO&!e3-gb*Q3Gjk;^Q_E$wk=YSrTp!9z4j(s0#|L$4NNO%uJu#8h(?etF ziLt>+eROPi`1r)o*wEzA*yQl!;ACc2&+3!ebb54rE8{N^;aOHk{mF{$xDN_1cKr<1 z!4Yh~YOnwQn76uw4cl%`*ud{(0oq%*L)Zx8GN6JR@WRFbE&~qYvQyZY0GR6HHl_eA zC%17La0PKn7qZs7oT-I%#3kO|xUw$SuE$ z`g(XZ+zi`#VJp1a42Owc_=dF{4zGL%%3M9N8fixCHW4eb+KfaxfUR@??REn4ic`=IP;mHt<`UFn zeHZe&u^iA<@?TkRN>ZZPY@((p67+emBNRrWSG3%g8|?X$HN1KV=zD7}bZgCdC4RMm zLWz1vRhVbC6|l|Y7R_ZOD-bNR5ss=>IIMX4!pTNBoT!JxYIh`TdL7|~+p=sn zWT=;A#nTr`HbR&yq;~g(Ovw>iXre#0`xDXaH8=YHS`+;M%3-w664npFShLWdt*vr$ z!jhwABN|h!XjG|lam9)&X7{1~WTU@7QSa|peNXh8P4xa6;t{IH<7W54XtEKFChF0s z>f0ZMx*q~>7b5lm54Pu!l58kSqOK^a?~np*J^=kL375zf~eP?gDVgt-{F`EaRYzBSg_+a zkZcSLB)iVlTxI#XgEM#eEdR6eAR)6!%k%Q9M8~PVpec zrXX1G{;-A-|JGpLLul=;7jKZ%*GwkGhz$GPmMLp@dlZD)ea__FDnjl4XYv4SwC%BA za?eu;wZ{eT7YMxr&;7gbvwskYHjX0Y{@oD#q(uWZK8*%#gg;x@cpB?j=v!;<+tT{C z@Eq%SKf-(et{gQGeXh)7{u2i9gKZ!`kNKZ52>&UgpJ(rg`JXd}dD|D% z_qeYU^QU;+$20TicoqxX*Qxn5kNbFTp7b$ujpH%*OVrte%$R?S#-OtT61aL-s{nK?foi< zZmjj9n`_Rc=+$PB?ECilA~180=LM_jBWHpo*xwPzoKC^AINySANw^vkLRO#Ew?xho zA##R*wF_DBC#|W^KZ7kbECIr{8FssalH?9aZa<(uBsGJw^jjZ(y6!nCFW@ugf5#eb zr;rEGckmQi5rS#}*8O+l;EO{+)8hNh#eP)@y42({JrgX#d1o2UJ3%<_xZw$c??tes z7tjkBbgL3V?|t3_|M$8RQqZqTz5ayMR6}+wq>~{Mi%^J@&XBxxI3)G<;aI44EP@aV z98)*&c>ucsaa=uah#ADy+c@-;z(X?c-01#6D5v_%kHW_lfW>qQrxJI2JZvM=apXZZ7ho!n=gzvH)jz zK~NWYKC#U6s>8{fZg`pjft;ZQF@U@X-3SC==XmdP4t(DX1OgWP(B^yKoXiCh^*~VN zdqq=XX9qX~gB|7)$0-)anO!}BfcPO0;KVZ|FS-H}*av

v_U(ZW#~t$pY21719K`P*ru;F=Z_lTo z{kv>WevMGx1ADN$-LFx~7vBS4yI=QHzJpuxN%lB$o|MQqk|fS~hUA?t$sTif95KK0 z)UQtJ*QS@v6(5-^UN4y|Jj@j@97AA`yI@Xnz3}w$Ta0_W=e+d=2jgCVac?0D_O?Yh zOIcq&U;5z7G4SOW*wGkZcaAq`#)C-0KMirR?`i?h;tNAau$9}XO!2?&H;Xv?YpM^+ zm!04bb6Y34Y*aG%enz491`c01q{mY@xh@{WjsAGIH(UzjvhZgJUp467#*L~ z$0xD|{5gGeB%2u-$&Bdf$@JKhXgXJbd(IOIHfVbKG&MsRIF>IK=8Upd(yJ!E1JWu* zt!ly@QMoi<(K32LtJF%RVz~+}G8gr^TwxCC4J~V=YjawzFk95%Rz}m|cBfd;z|PrR z-l!a_lrx(ww+p*4cWZC9q_AB;dT_78Do8wCD2p#i+R}bI??QEU|4!?YRDQXm{Unv& z?vPJY`S&{He?a9`{P%>mbkWYc(65LAt(vb>xp<%@vNP{c`CA=4H>vzaNBbXBd7wl7 zDU}bxZ;AHwODezJ(f%K({C|}8)(HIzl@C0zS^gU;=MQa`!x+sJ>jjr=&=Ep65B?`$J~Z5#PNVL2zF zsb@C7K^f#*-5-7z?fiNXyM?@EL}o4GoP?YJ*5ir?o;+vlpe25mH^qvf2z> z+~x}Ky~S)XpD(_F?b@Vpyp3;Ua7Y{4Ts_fIJ-n%U6nbx2gSDaiHfu(q^``34&DBtR z!-Qo}OB=9WY9pg#+87fV@CsDW&N#mFi`GJIUc;B$yVJnmwd_hGZItmR3A@PBQrv@Jt+xM88f9>?|zi{&TuRgnZ zW&6LC@bZt3TJ~}b*!dmU1$H2#^c_W87oQQBK0608yg>EvIS}_H(lR?A!r#=a;& zrq9lS4B0-$>X|;Ju|EUB+^n9R8yR-$Gy54H1YKC-Tl(yr%8;E?S%3C+$qIl+;K4H1 z{{}s`GPIzsJx#S2Ab$)Vtk2HT?7J5BePio8MyA(cz!=rV?|>8A@7Q;a_>K=Jb`EF$ z3lK5F+as1S{g3H>$-Z}->FBSch@S7%e}ND){m%BC<#W)yQ-&FsK4E+Ce`i}Asyb!3 z^-lFKQGK`?+?1I9WCuAbGJG4fSnS&geuW1Xq_fnq*xw2~%>QisH$cXG57}0lc!Fff aIkhEz=v}Wt)mHjTud{lwBjy$k@=-Fxk++*upF^$;i^~2w%p@r(*&;F7)XdB{&B)Nu+|qdR zMMmk(92_0|jCzwNii=CeGl0Oy{{mndK}?u@P+Xkx*yN8ul3}u*=!iG!iR6k#fe)ie1aP<9JYR*;#Efq?@e>j0q`?oSq! hyv<}{I$42TW3#B#C#KCDvJaS<002|(Gq(T$ delta 278 zcmeBh>Tud{lwHFjB{9j+G|9r$z%tRq)X2!p!qC*v($Lh>$kM+suflwBjy$k@=-Fxk++*upF^$;i^~2w%p@r(*&;F7)XdB{&B)Nu+|qbr KqwppU4tW60gAaWG delta 55 zcmeD1>+suflwHFjB{9j+G|9r$z%tRq)X2!p!qC*v($Lh>$kM%k)?rUa#~_aqGd`_l8O1m LMv={q9PQEo*U=DP diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co index 7a79468fd45e7f13ba8e60a653f12fd37703c0c9..954345ac16a2c6437a9cc8b51c892caaa8d586b6 100755 GIT binary patch delta 110 zcmX@%e!_hND~Cp+k+GqvVX~otv4vS;l98pkiFt~VnMqPovPELDshOE^nvtQQxux;M zMv={q9Iu6$j7=u%$r#BQn=|C6q{L^(C*|ZPXEVenCB^6E$LA&Hrp6l@TTZT%(PJ_( Onk*ozv-y_HJ2n9MyCGZv delta 110 zcmX@%e!_hND~E%k)?rUa#~_aqGd`_l8O1m zMv={q9Iu6$OiU;1$r#C*STN+Lq{L^(C*|ZPXEVenCB^6E$LA&Hrp6nZ8ceQ}(PJ_- Oo-81%v-y_HJ2n6qeId*M diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Tensor.co index 667675008a59b20f7437b736829a6f42199e00b8..0f8933247eb47b49b3c52700fca3c0494337fdbf 100755 GIT binary patch delta 57 zcmX@$e871_E4xObk+GqvVX~otv4vS;l98pkiFt~VnMqPovPELDshOE^nvtQQxux;s Ni;U8nPq6=x1OWNv5u^YB delta 57 zcmX@$e871_E4zk8N@9|sX_AGhfn}nJsgaSHg`ugTrJ<>%k)?rUa#~_aqGd`_l8O1` Ni;U8nPq6=x1ONcu5)}Xd diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=False-quant_type_str=per_Token.co index 4f8595aa3b1763e34a3fa15e7c8a64752235cf18..ae88ad1399e44d26b8f08214f28e134ffc9aa4c7 100755 GIT binary patch delta 83 zcmeBh>2TT5%C3=UWNc_^m~3cZY+;s|WMpY>VxD4TW|EYYY>}93YG!7fW@Kn+ZfQLE nBBS)?6YQ+QOh#st&r3-$8CgtzEv3g~Y&bbkR%Nrj^hq`V>5LcF delta 83 zcmeBh>2TT5%C2FNl9*&@nq*;WV3}xQYGh<)VQ6Y-X=rL`WNBcToR*l9Xql3fWMV$~ nBBS)?6YQ+QOvdJu&r3-$8Cy<%Ev3g~Vl+8WR%Nrj^hq`V{8<=H diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Tensor.co index 3ce116e50612a9202ea69f635b3f9ad379333aa9..b7670ee71bb06d2310623ff9aea8d4af51e10bd0 100755 GIT binary patch delta 56 zcmX@$e871_E4xObk+GqvVX~otv4vS;l98pkiFt~VnMqPovPELDshOE^nvtQQxux;M MMv=`2+5bxd0O)ZMN&o-= delta 56 zcmX@$e871_E4zk8N@9|sX_AGhfn}nJsgaSHg`ugTrJ<>%k)?rUa#~_aqGd`_l8O1m MMv=`2+5bxd0PgA$y8r+H diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_batch1-weight_dtype=torch.float8_e4m3fnuz-with_silu=True-quant_type_str=per_Token.co index 5026c34bdc99797711555c07c62ea77eab7069df..301183d02ed16e0695f32be74073ba36dcfb90dc 100755 GIT binary patch delta 82 zcmeBh>2TT5%C3=UWNc_^m~3cZY+;s|WMpY>VxD4TW|EYYY>}93YG!7fW@Kn+ZfQKR mQDpN$b{=6SL-Wb!rKFe)EhoR0(ql3*nj9#rve{nxBpU$FUl)7; delta 82 zcmeBh>2TT5%C2FNl9*&@nq*;WV3}xQYGh<)VQ6Y-X=rL`WNBcToR*l9Xql3fWMV$C mQDpN$b{=6SBa6xBrKFgQ4JN;q(ql3hianNzFg$5{LGBYtXF=H_>GdMP3Vlp{5G&f{2HZ)>lV>n@BH8wUgW->7` lH#swt(E%H?$OgbF0ysC5iZMX}F_Yj0CX-My1he)r+6L{G8JYk9 delta 79 zcmV-V0I>hianNzFg$5`%WMN`4HDWk5Fgal~H8L_bI59ObIWaXkGC43gV`gDwVL4=C lVl+3C(E%H?$OgbF0y#93iZMX}IFsN7CX-My1he)r+6Mk{8V~>g diff --git a/hsa/gfx942/fmoe_asmjit/moe_gemm_final_reduce_bf16-TOPK=8-OC=4096.co b/hsa/gfx942/fmoe_asmjit/moe_gemm_final_reduce_bf16-TOPK=8-OC=4096.co index 2e45d25f3e7fc121ce46b68be8981da69bc36c72..6cf62637fb15e43e83ee94c2465182acf4ffa3d2 100755 GIT binary patch delta 79 zcmV-V0I>hiWzc1?g$5{LGBYtXF=H_>GdMP3Vlp{5G&f{2HZ)>lV>n@BH8wUgW->7` lH#swt(E%H?$OgY80yR04i7P+>Gn1e#CzBv71he%k+Xmu286f}w delta 79 zcmV-V0I>hiWzc1?g$5`%WMN`4HDWk5Fgal~H8L_bI59ObIWaXkGC43gV`gDwVL4=C lVl+3C(E%H?$OgY80yZ_1i7P+>Ig_9*CzBv71he%k+XnLX8I=G4