Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 45 additions & 12 deletions aiter/aot/flydsl/moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,34 @@ def _row_swiglu_limit(row: dict[str, str]) -> float:
return _parse_optional_float(row.get("swiglu_limit"), "swiglu_limit") or 0.0


def _row_optional_int(row: dict[str, str], name: str) -> int:
value = row.get(name)
if value is None:
return 0
value = str(value).strip()
if value == "":
return 0
try:
return int(value)
except ValueError as e:
raise ValueError(f"{name} must be an int, got {value!r}") from e


def _row_optional_bool(row: dict[str, str], name: str) -> bool | None:
value = row.get(name)
if value is None:
return None
value = str(value).strip()
if value == "":
return None
value_lower = value.lower()
if value_lower in ("1", "true", "t", "yes", "y"):
return True
if value_lower in ("0", "false", "f", "no", "n"):
return False
raise ValueError(f"{name} must be a bool, got {value!r}")


def parse_csv(csv_path: str):
"""Parse the CSV and return a list of unique compile jobs.

Expand All @@ -93,6 +121,8 @@ def parse_csv(csv_path: str):
experts = int(row["expert"])
topk = int(row["topk"])
doweight_stage1 = bool(int(row.get("doweight_stage1", "0")))
hidden_pad = _row_optional_int(row, "hidden_pad")
intermediate_pad = _row_optional_int(row, "intermediate_pad")
cu_num = int(row.get("cu_num", "0"))
block_m = int(row.get("block_m", "0") or "0")
act_type = row.get("act_type", "")
Expand All @@ -101,19 +131,8 @@ def parse_csv(csv_path: str):
if act_type.strip().split(".")[-1].lower() == "swiglu"
else "silu"
)
q_type = row.get("q_type", "")
dtype = row.get("dtype", "")
q_dtype_w = row.get("q_dtype_w", "")
swiglu_limit = _row_swiglu_limit(row)
# Cover both runtime bias choices for fp4-weight MoE. Model configs
# share kernel families, and runtime bias selection can vary by
# activation dtype/model semantics.
bias_supported = (
q_type.strip().split(".")[-1] == "per_1x32"
and dtype in ("torch.bfloat16", "torch.float16")
and "float4_e2m1fn_x2" in q_dtype_w
)
enable_bias_options = [False, True] if bias_supported else [False]
enable_bias_options = [bool(_row_optional_bool(row, "bias"))]

# Detect stage1's fuse_quant from kernel suffix to align stage2's
# a2_scale shape with what runtime actually passes.
Expand Down Expand Up @@ -143,6 +162,8 @@ def parse_csv(csv_path: str):
"experts": experts,
"topk": topk,
"doweight_stage1": doweight_stage1,
"hidden_pad": hidden_pad,
"intermediate_pad": intermediate_pad,
"cu_num": cu_num,
"act": act,
"enable_bias": enable_bias,
Expand Down Expand Up @@ -200,6 +221,8 @@ def _precompile_to_cache(
enable_bias: bool = False,
stage1_fuse_quant=None,
swiglu_limit: float = 0.0,
hidden_pad: int = 0,
intermediate_pad: int = 0,
**kwargs,
):
"""Trigger MLIR compilation by calling the runtime stage1/stage2 entry points
Expand Down Expand Up @@ -554,6 +577,8 @@ def _make_a_user(a_dtype_user_shape):
a_scale_one=a_scale_one,
xcd_swizzle=xcd_swizzle,
swiglu_limit=swiglu_limit,
model_dim_pad=hidden_pad,
inter_dim_pad=intermediate_pad,
)
_run_compiled(exe, args)

Expand Down Expand Up @@ -723,6 +748,8 @@ def _make_a_user(a_dtype_user_shape):
b_nt=b_nt,
xcd_swizzle=xcd_swizzle,
enable_bias=enable_bias,
model_dim_pad=hidden_pad,
inter_dim_pad=intermediate_pad,
)
_run_compiled(exe, args)

Expand All @@ -734,6 +761,8 @@ def compile_one_config(
experts: int,
topk: int,
cu_num: int = 0,
hidden_pad: int = 0,
intermediate_pad: int = 0,
**kwargs,
) -> dict:
"""Compile one MoE kernel configuration and save to cache.
Expand All @@ -747,6 +776,8 @@ def compile_one_config(
shape_str = (
f"{kernel_name} "
f"model_dim={model_dim} inter_dim={inter_dim} "
f"hidden_pad={hidden_pad} "
f"intermediate_pad={intermediate_pad} "
f"E={experts} topk={topk}"
)
result = {
Expand All @@ -769,6 +800,8 @@ def compile_one_config(
experts=experts,
topk=topk,
cu_num=cu_num,
hidden_pad=hidden_pad,
intermediate_pad=intermediate_pad,
**kwargs,
)
elapsed = time.time() - t0
Expand Down
50 changes: 25 additions & 25 deletions aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,tflops,bw,_tag
256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,182.57,11418.01,
256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,356.87,11166.78,
256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,703.24,11016.73,
256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,1052.15,8262.71,
256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,1508.93,5955.64,
256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,948.24,1890.61,
256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,1384.38,1408.26,
256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,1665.09,880.78,
256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,181.84,11379.53,
256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,346.54,10857.58,
256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,643.71,10110.43,
256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,956.4,7549.71,
256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,1246.25,4969.6,
256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,1520.68,3093.83,
256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,1769.69,1872.23,
256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,1943.37,1107.06,
256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,144.56,9070.37,
256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,266.72,8400.06,
256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,473.78,7518.51,
256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,699.93,5639.12,
256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,897.88,3726.57,
256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,1082.99,2379.61,
256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,1257.9,1535.52,
256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,1397.92,1023.87,
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,bias,hidden_pad,intermediate_pad,tflops,bw,_tag
256,256,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,208.4462,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.1431,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_bnt2_persist,5.0%,317.5893,0,0,True,192,192,182.57,11418.01,
256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,215.3665,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,109.5792,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2,5.1%,324.9457,0,0,True,192,192,356.87,11166.78,
256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,208.4256,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,121.3742,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_atomic_bnt2,5.1%,329.7998,0,0,True,192,192,703.24,11016.73,
256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,289.0776,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0,0.0%,151.789,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,440.8666,0,0,True,192,192,1052.15,8262.71,
256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,404.0325,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,210.7843,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.1%,614.8168,0,0,True,192,192,1508.93,5955.64,
256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1319.2527,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4_bnt0,0.0%,637.455,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic,5.0%,1956.7077,0,0,True,192,192,948.24,1890.61,
256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1739.6561,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,940.8593,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.1%,2680.5154,0,0,True,192,192,1384.38,1408.26,
256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1991.1671,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,2466.0812,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,5.0%,4457.2483,0,0,True,192,192,1665.09,880.78,
256,256,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,98.4171,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,61.0182,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,159.4353,0,0,True,192,96,181.84,11379.53,
256,512,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,102.0754,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_fp4,0.9%,65.2416,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_bnt2_persist,2.0%,167.317,0,0,True,192,96,346.54,10857.58,
256,1024,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,108.8592,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,71.2891,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,2.0%,180.1483,0,0,True,192,96,643.71,10110.43,
256,2048,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.6562,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,92.8451,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic,3.6%,242.5013,0,0,True,192,96,956.4,7549.71,
256,4096,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,229.8446,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2_bnt0,0.0%,142.3561,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,372.2007,0,0,True,192,96,1246.25,4969.6,
256,8192,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,340.00350000000003,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,270.0618,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,3.6%,610.0653,0,0,True,192,96,1520.68,3093.83,
256,16384,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,597.4298,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,451.0174,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1048.4472,0,0,True,192,96,1769.69,1872.23,
256,32768,3072,1536,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1065.1228,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0,0.0%,844.3695,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce,2.0%,1909.4923,0,0,True,192,96,1943.37,1107.06,
256,256,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,37.8528,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,28.9955,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,66.8483,0,0,True,192,152,144.56,9070.37,
256,512,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,41.4756,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,30.988,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,72.4636,0,0,True,192,152,266.72,8400.06,
256,1024,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,44.1622,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3,0.0%,37.4254,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,81.5876,0,0,True,192,152,473.78,7518.51,
256,2048,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,59.0778,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_bnt0,0.0%,51.3745,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce,0.6%,110.4523,0,0,True,192,152,699.93,5639.12,
256,4096,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,79.0359,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,93.1675,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.6%,172.2034,0,0,True,192,152,897.88,3726.57,
256,8192,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,124.6681,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,160.8729,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,0.6%,285.541,0,0,True,192,152,1082.99,2379.61,
256,16384,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,206.4827,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,285.1907,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,491.6734,0,0,True,192,152,1257.9,1535.52,
256,32768,3072,512,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,367.6363,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,517.2156,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4,0.6%,884.8519,0,0,True,192,152,1397.92,1023.87,
Loading
Loading