Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions aiter/ops/mha.py
Original file line number Diff line number Diff line change
Expand Up @@ -1299,13 +1299,15 @@ def is_fmha_v3_fp8():
ret = ret and (
q_descale is not None and k_descale is not None and v_descale is not None
)
# support per tensor and per head quant scale
ret = ret and (
pertensor_or_perhead = (
q_descale.shape == (1,) or q_descale.shape == (batch_size, nhead_k)
) and q_descale.shape == k_descale.shape and q_descale.shape == v_descale.shape
qkptph_vph = (
q_descale.shape == (batch_size, nhead_q, seqlen_q)
and k_descale.shape == (batch_size, nhead_k, seqlen_k)
and v_descale.shape in ((nhead_k,), (batch_size, nhead_k))
)
ret = ret and (
q_descale.shape == k_descale.shape and q_descale.shape == v_descale.shape
)
ret = ret and (pertensor_or_perhead or qkptph_vph)
return ret

def can_impl_fmha_v3_fwd():
Expand Down
6 changes: 4 additions & 2 deletions csrc/cpp_itfs/mha_fwd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ std::string get_kernel_name_key(const std::string& arch_id,
int hdim_v,
int mask_type,
int bf16_cvt,
int qscale_type,
bool mode,
const CFG* cfgs)
{
Expand All @@ -42,7 +43,7 @@ std::string get_kernel_name_key(const std::string& arch_id,
}

if(cfg.dtype == data_type && cfg.hdim_q == hdim_q && cfg.hdim_v == hdim_v &&
cfg.mask == mask_type && cfg.mode == mode)
cfg.mask == mask_type && cfg.qscale == qscale_type && cfg.mode == mode)
{
if(arch_id == "gfx950")
{
Expand Down Expand Up @@ -232,6 +233,7 @@ float fmha_fwd_v3(mha_fwd_args a, const ck_tile::stream_config& s)
a.hdim_v,
cfg_mask_type,
a.how_v3_bf16_cvt,
a.qscale_type,
a.is_group_mode,
fwd_cfgs);
auto it = fwd_cfgs->find(kernel_name_key);
Expand Down Expand Up @@ -374,7 +376,7 @@ float mha_fwd(mha_fwd_args args, const ck_tile::stream_config& s)
#endif

#if FAV2_ON
if(ret == -1 && !args.v3_api_check)
if(ret == -1 && !args.v3_api_check && args.qscale_type == 0)
{
ret = fmha_fwd_ck(args, s);
}
Expand Down
60 changes: 50 additions & 10 deletions csrc/py_itfs_cu/asm_mha_fwd.cu
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ mha_fwd_args get_asm_fmha_fwd_args(bool has_lse,
ck_tile::index_t batch_stride_descale_k = 0;
ck_tile::index_t batch_stride_descale_v = 0;

constexpr int asm_qscale_pertensor = 0;
constexpr int asm_qscale_qkptph_vph = 1;
const bool use_qkptph_vph =
q_descale_.has_value() && q_descale_.value().dim() == 3;
int asm_qscale_type = use_qkptph_vph ? asm_qscale_qkptph_vph : asm_qscale_pertensor;

void *q_descale_ptr = nullptr;
void *k_descale_ptr = nullptr;
void *v_descale_ptr = nullptr;
Expand All @@ -104,30 +110,63 @@ mha_fwd_args get_asm_fmha_fwd_args(bool has_lse,
if (q_descale_.has_value()) {
auto q_descale = q_descale_.value();
CHECK_DEVICE(q_descale);
TORCH_CHECK(q_descale.sizes() == torch::IntArrayRef({1}) || q_descale.sizes() == torch::IntArrayRef({b, h_k}));
if (q_descale.dim() == 2) {
if (use_qkptph_vph) {
TORCH_CHECK(q_descale.sizes() == torch::IntArrayRef({b, h, seqlen_q}),
"q_descale for qkptph_vph must be [batch, q_heads, seqlen_q]");
TORCH_CHECK(q_descale.stride(2) == 1,
"q_descale for qkptph_vph must be contiguous in token dimension");
batch_stride_descale_q = q_descale.stride(0);
nhead_stride_descale_q = q_descale.stride(1);
} else {
TORCH_CHECK(q_descale.sizes() == torch::IntArrayRef({1}) || q_descale.sizes() == torch::IntArrayRef({b, h_k}));
if (q_descale.dim() == 2) {
batch_stride_descale_q = q_descale.stride(0);
nhead_stride_descale_q = q_descale.stride(1);
}
}
q_descale_ptr = q_descale.data_ptr();
}
if (k_descale_.has_value()) {
auto k_descale = k_descale_.value();
CHECK_DEVICE(k_descale);
TORCH_CHECK(k_descale.sizes() == torch::IntArrayRef({1}) || k_descale.sizes() == torch::IntArrayRef({b, h_k}));
if (k_descale.dim() == 2) {
if (use_qkptph_vph) {
TORCH_CHECK(k_descale.sizes() == torch::IntArrayRef({b, h_k, seqlen_k}),
"k_descale for qkptph_vph must be [batch, kv_heads, seqlen_k]");
TORCH_CHECK(k_descale.stride(2) == 1,
"k_descale for qkptph_vph must be contiguous in token dimension");
batch_stride_descale_k = k_descale.stride(0);
nhead_stride_descale_k = k_descale.stride(1);
} else {
TORCH_CHECK(k_descale.sizes() == torch::IntArrayRef({1}) || k_descale.sizes() == torch::IntArrayRef({b, h_k}));
if (k_descale.dim() == 2) {
batch_stride_descale_k = k_descale.stride(0);
nhead_stride_descale_k = k_descale.stride(1);
}
}
k_descale_ptr = k_descale.data_ptr();
}
if (v_descale_.has_value()) {
auto v_descale = v_descale_.value();
CHECK_DEVICE(v_descale);
TORCH_CHECK(v_descale.sizes() == torch::IntArrayRef({1}) || v_descale.sizes() == torch::IntArrayRef({b, h_k}));
if (v_descale.dim() == 2) {
batch_stride_descale_v = v_descale.stride(0);
nhead_stride_descale_v = v_descale.stride(1);
if (use_qkptph_vph) {
TORCH_CHECK(v_descale.sizes() == torch::IntArrayRef({h_k}) ||
v_descale.sizes() == torch::IntArrayRef({b, h_k}),
"v_descale for qkptph_vph must be [kv_heads] or [batch, kv_heads]");
TORCH_CHECK(v_descale.stride(-1) == 1,
"v_descale for qkptph_vph must be contiguous in head dimension");
if (v_descale.dim() == 2) {
batch_stride_descale_v = v_descale.stride(0);
nhead_stride_descale_v = v_descale.stride(1);
} else {
batch_stride_descale_v = 0;
nhead_stride_descale_v = v_descale.stride(0);
}
} else {
TORCH_CHECK(v_descale.sizes() == torch::IntArrayRef({1}) || v_descale.sizes() == torch::IntArrayRef({b, h_k}));
if (v_descale.dim() == 2) {
batch_stride_descale_v = v_descale.stride(0);
nhead_stride_descale_v = v_descale.stride(1);
}
}
v_descale_ptr = v_descale.data_ptr();
}
Expand All @@ -139,7 +178,7 @@ mha_fwd_args get_asm_fmha_fwd_args(bool has_lse,
false, // is_group_mode
static_cast<int>(bias_type),
has_lse,
0, // qscale_type
asm_qscale_type,
false, //has_sink
q.data_ptr(),
k.data_ptr(),
Expand Down Expand Up @@ -312,7 +351,8 @@ std::vector<at::Tensor> fmha_v3_fwd(at::Tensor &q, // [b, sq, hq, d]
// H/t Daniel Haziza
const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k &&
window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_q % 8 == 0 &&
!alibi_slopes_.has_value() && !bias_.has_value();
!alibi_slopes_.has_value() && !bias_.has_value() &&
!(is_qkv_fp8 && q_descale_.has_value() && q_descale_.value().dim() == 3);
const int ngroups = num_heads / num_heads_k;
if (seqlenq_ngroups_swapped) {
q = q.reshape({batch_size, num_heads_k, ngroups, head_size_q}).transpose(1, 2);
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
62 changes: 33 additions & 29 deletions hsa/gfx942/fmha_v3_fwd/fmha_fwd.csv
Original file line number Diff line number Diff line change
@@ -1,29 +1,33 @@
dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,ts_qo,ts_kv,knl_name,co_name
bf16,128,128,0,0,0,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtneE,fwd_hd128_bf16_rtne.co
bf16,128,128,0,0,1,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtnaE,fwd_hd128_bf16_rtna.co
bf16,128,128,0,0,2,256,32,_ZN5aiter23fmha_fwd_hd128_bf16_rtzE,fwd_hd128_bf16_rtz.co
bf16,128,128,2,0,0,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtneE,fwd_hd128_bf16_causal_rtne.co
bf16,128,128,2,0,1,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtnaE,fwd_hd128_bf16_causal_rtna.co
bf16,128,128,2,0,2,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_causal_rtzE,fwd_hd128_bf16_causal_rtz.co
bf16,128,128,0,1,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtne_groupE,fwd_hd128_bf16_rtne_group.co
bf16,128,128,0,1,1,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtna_groupE,fwd_hd128_bf16_rtna_group.co
bf16,128,128,0,1,2,256,32,_ZN5aiter29fmha_fwd_hd128_bf16_rtz_groupE,fwd_hd128_bf16_rtz_group.co
bf16,128,128,2,1,0,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtne_groupE,fwd_hd128_bf16_causal_rtne_group.co
bf16,128,128,2,1,1,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtna_groupE,fwd_hd128_bf16_causal_rtna_group.co
bf16,128,128,2,1,2,256,32,_ZN5aiter36fmha_fwd_hd128_bf16_causal_rtz_groupE,fwd_hd128_bf16_causal_rtz_group.co
bf16,192,128,0,0,0,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtneE,fwd_hd192x128_bf16_rtne.co
bf16,192,128,0,0,1,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtnaE,fwd_hd192x128_bf16_rtna.co
bf16,192,128,0,0,2,128,32,_ZN5aiter27fmha_fwd_hd192x128_bf16_rtzE,fwd_hd192x128_bf16_rtz.co
bf16,192,128,2,0,0,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtneE,fwd_hd192x128_bf16_causal_rtne.co
bf16,192,128,2,0,1,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtnaE,fwd_hd192x128_bf16_causal_rtna.co
bf16,192,128,2,0,2,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_causal_rtzE,fwd_hd192x128_bf16_causal_rtz.co
bf16,192,128,0,1,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtne_groupE,fwd_hd192x128_bf16_rtne_group.co
bf16,192,128,0,1,1,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtna_groupE,fwd_hd192x128_bf16_rtna_group.co
bf16,192,128,0,1,2,128,32,_ZN5aiter33fmha_fwd_hd192x128_bf16_rtz_groupE,fwd_hd192x128_bf16_rtz_group.co
bf16,192,128,2,1,0,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtne_groupE,fwd_hd192x128_bf16_causal_rtne_group.co
bf16,192,128,2,1,1,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtna_groupE,fwd_hd192x128_bf16_causal_rtna_group.co
bf16,192,128,2,1,2,128,32,_ZN5aiter40fmha_fwd_hd192x128_bf16_causal_rtz_groupE,fwd_hd192x128_bf16_causal_rtz_group.co
fp8bf16,128,128,0,0,1,256,64,_ZN5aiter18fmha_fwd_hd128_fp8E,fwd_hd128_fp8.co
fp8bf16,128,128,2,0,1,256,64,_ZN5aiter25fmha_fwd_hd128_fp8_causalE,fwd_hd128_fp8_causal.co
fp8bf16,128,128,0,1,1,256,64,_ZN5aiter24fmha_fwd_hd128_fp8_groupE,fwd_hd128_fp8_group.co
fp8bf16,128,128,2,1,1,256,64,_ZN5aiter31fmha_fwd_hd128_fp8_causal_groupE,fwd_hd128_fp8_causal_group.co
dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,qscale,ts_qo,ts_kv,knl_name,co_name
bf16,128,128,0,0,0,0,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtneE,fwd_hd128_bf16_rtne.co
bf16,128,128,0,0,1,0,256,32,_ZN5aiter24fmha_fwd_hd128_bf16_rtnaE,fwd_hd128_bf16_rtna.co
bf16,128,128,0,0,2,0,256,32,_ZN5aiter23fmha_fwd_hd128_bf16_rtzE,fwd_hd128_bf16_rtz.co
bf16,128,128,2,0,0,0,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtneE,fwd_hd128_bf16_causal_rtne.co
bf16,128,128,2,0,1,0,256,32,_ZN5aiter31fmha_fwd_hd128_bf16_causal_rtnaE,fwd_hd128_bf16_causal_rtna.co
bf16,128,128,2,0,2,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_causal_rtzE,fwd_hd128_bf16_causal_rtz.co
bf16,128,128,0,1,0,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtne_groupE,fwd_hd128_bf16_rtne_group.co
bf16,128,128,0,1,1,0,256,32,_ZN5aiter30fmha_fwd_hd128_bf16_rtna_groupE,fwd_hd128_bf16_rtna_group.co
bf16,128,128,0,1,2,0,256,32,_ZN5aiter29fmha_fwd_hd128_bf16_rtz_groupE,fwd_hd128_bf16_rtz_group.co
bf16,128,128,2,1,0,0,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtne_groupE,fwd_hd128_bf16_causal_rtne_group.co
bf16,128,128,2,1,1,0,256,32,_ZN5aiter37fmha_fwd_hd128_bf16_causal_rtna_groupE,fwd_hd128_bf16_causal_rtna_group.co
bf16,128,128,2,1,2,0,256,32,_ZN5aiter36fmha_fwd_hd128_bf16_causal_rtz_groupE,fwd_hd128_bf16_causal_rtz_group.co
bf16,192,128,0,0,0,0,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtneE,fwd_hd192x128_bf16_rtne.co
bf16,192,128,0,0,1,0,128,32,_ZN5aiter28fmha_fwd_hd192x128_bf16_rtnaE,fwd_hd192x128_bf16_rtna.co
bf16,192,128,0,0,2,0,128,32,_ZN5aiter27fmha_fwd_hd192x128_bf16_rtzE,fwd_hd192x128_bf16_rtz.co
bf16,192,128,2,0,0,0,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtneE,fwd_hd192x128_bf16_causal_rtne.co
bf16,192,128,2,0,1,0,128,32,_ZN5aiter35fmha_fwd_hd192x128_bf16_causal_rtnaE,fwd_hd192x128_bf16_causal_rtna.co
bf16,192,128,2,0,2,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_causal_rtzE,fwd_hd192x128_bf16_causal_rtz.co
bf16,192,128,0,1,0,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtne_groupE,fwd_hd192x128_bf16_rtne_group.co
bf16,192,128,0,1,1,0,128,32,_ZN5aiter34fmha_fwd_hd192x128_bf16_rtna_groupE,fwd_hd192x128_bf16_rtna_group.co
bf16,192,128,0,1,2,0,128,32,_ZN5aiter33fmha_fwd_hd192x128_bf16_rtz_groupE,fwd_hd192x128_bf16_rtz_group.co
bf16,192,128,2,1,0,0,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtne_groupE,fwd_hd192x128_bf16_causal_rtne_group.co
bf16,192,128,2,1,1,0,128,32,_ZN5aiter41fmha_fwd_hd192x128_bf16_causal_rtna_groupE,fwd_hd192x128_bf16_causal_rtna_group.co
bf16,192,128,2,1,2,0,128,32,_ZN5aiter40fmha_fwd_hd192x128_bf16_causal_rtz_groupE,fwd_hd192x128_bf16_causal_rtz_group.co
fp8bf16,128,128,0,0,1,0,256,64,_ZN5aiter18fmha_fwd_hd128_fp8E,fwd_hd128_fp8.co
fp8bf16,128,128,2,0,1,0,256,64,_ZN5aiter25fmha_fwd_hd128_fp8_causalE,fwd_hd128_fp8_causal.co
fp8bf16,128,128,0,1,1,0,256,64,_ZN5aiter24fmha_fwd_hd128_fp8_groupE,fwd_hd128_fp8_group.co
fp8bf16,128,128,2,1,1,0,256,64,_ZN5aiter31fmha_fwd_hd128_fp8_causal_groupE,fwd_hd128_fp8_causal_group.co
fp8bf16,128,128,0,0,1,1,256,64,_ZN5aiter29fmha_fwd_hd128_fp8_qkptph_vphE,fwd_hd128_fp8_qkptph_vph.co
fp8bf16,128,128,2,0,1,1,256,64,_ZN5aiter36fmha_fwd_hd128_fp8_causal_qkptph_vphE,fwd_hd128_fp8_causal_qkptph_vph.co
fp8bf16,128,128,0,1,1,1,256,64,_ZN5aiter35fmha_fwd_hd128_fp8_qkptph_vph_groupE,fwd_hd128_fp8_qkptph_vph_group.co
fp8bf16,128,128,2,1,1,1,256,64,_ZN5aiter42fmha_fwd_hd128_fp8_causal_qkptph_vph_groupE,fwd_hd128_fp8_causal_qkptph_vph_group.co
Binary file added hsa/gfx942/fmha_v3_fwd/fwd_hd128_fp8_causal.co
Binary file not shown.
30 changes: 17 additions & 13 deletions hsa/gfx950/fmha_v3_fwd/fmha_fwd.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,ts_qo,ts_kv,knl_name,co_name
bf16,128,128,0,0,0,256,64,_ZN5aiter19fmha_fwd_hd128_bf16E,fwd_hd128_bf16.co
bf16,128,128,2,0,0,256,64,_ZN5aiter26fmha_fwd_hd128_bf16_causalE,fwd_hd128_bf16_causal.co
bf16,128,128,0,1,0,256,64,_ZN5aiter25fmha_fwd_hd128_bf16_groupE,fwd_hd128_bf16_group.co
bf16,128,128,2,1,0,256,64,_ZN5aiter32fmha_fwd_hd128_bf16_causal_groupE,fwd_hd128_bf16_causal_group.co
bf16,192,128,0,0,0,128,128,_ZN5aiter25fmha_fwd_hd192_hd128_bf16E,fwd_hd192_hd128_bf16.co
bf16,192,128,2,0,0,128,128,_ZN5aiter32fmha_fwd_hd192_hd128_bf16_causalE,fwd_hd192_hd128_bf16_causal.co
bf16,192,128,0,1,0,128,128,_ZN5aiter31fmha_fwd_hd192_hd128_bf16_groupE,fwd_hd192_hd128_bf16_group.co
bf16,192,128,2,1,0,128,128,_ZN5aiter38fmha_fwd_hd192_hd128_bf16_causal_groupE,fwd_hd192_hd128_bf16_causal_group.co
fp8bf16,128,128,0,0,0,256,128,_ZN5aiter24fmha_fwd_hd128_fp8_gfx950E,fwd_hd128_fp8.co
fp8bf16,128,128,2,0,0,256,128,_ZN5aiter31fmha_fwd_hd128_fp8_causal_gfx950E,fwd_hd128_fp8_causal.co
fp8bf16,128,128,0,1,0,256,128,_ZN5aiter30fmha_fwd_hd128_fp8_group_gfx950E,fwd_hd128_fp8_group.co
fp8bf16,128,128,2,1,0,256,128,_ZN5aiter37fmha_fwd_hd128_fp8_causal_group_gfx950E,fwd_hd128_fp8_causal_group.co
dtype,hdim_q,hdim_v,mask,mode,bf16_cvt,qscale,ts_qo,ts_kv,knl_name,co_name
bf16,128,128,0,0,0,0,256,64,_ZN5aiter19fmha_fwd_hd128_bf16E,fwd_hd128_bf16.co
bf16,128,128,2,0,0,0,256,64,_ZN5aiter26fmha_fwd_hd128_bf16_causalE,fwd_hd128_bf16_causal.co
bf16,128,128,0,1,0,0,256,64,_ZN5aiter25fmha_fwd_hd128_bf16_groupE,fwd_hd128_bf16_group.co
bf16,128,128,2,1,0,0,256,64,_ZN5aiter32fmha_fwd_hd128_bf16_causal_groupE,fwd_hd128_bf16_causal_group.co
bf16,192,128,0,0,0,0,128,128,_ZN5aiter25fmha_fwd_hd192_hd128_bf16E,fwd_hd192_hd128_bf16.co
bf16,192,128,2,0,0,0,128,128,_ZN5aiter32fmha_fwd_hd192_hd128_bf16_causalE,fwd_hd192_hd128_bf16_causal.co
bf16,192,128,0,1,0,0,128,128,_ZN5aiter31fmha_fwd_hd192_hd128_bf16_groupE,fwd_hd192_hd128_bf16_group.co
bf16,192,128,2,1,0,0,128,128,_ZN5aiter38fmha_fwd_hd192_hd128_bf16_causal_groupE,fwd_hd192_hd128_bf16_causal_group.co
fp8bf16,128,128,0,0,0,0,256,128,_ZN5aiter24fmha_fwd_hd128_fp8_gfx950E,fwd_hd128_fp8.co
fp8bf16,128,128,2,0,0,0,256,128,_ZN5aiter31fmha_fwd_hd128_fp8_causal_gfx950E,fwd_hd128_fp8_causal.co
fp8bf16,128,128,0,1,0,0,256,128,_ZN5aiter30fmha_fwd_hd128_fp8_group_gfx950E,fwd_hd128_fp8_group.co
fp8bf16,128,128,2,1,0,0,256,128,_ZN5aiter37fmha_fwd_hd128_fp8_causal_group_gfx950E,fwd_hd128_fp8_causal_group.co
fp8bf16,128,128,0,0,0,1,256,128,_ZN5aiter36fmha_fwd_hd128_fp8_qkptph_vph_gfx950E,fwd_hd128_fp8_qkptph_vph.co
fp8bf16,128,128,2,0,0,1,256,128,_ZN5aiter43fmha_fwd_hd128_fp8_causal_qkptph_vph_gfx950E,fwd_hd128_fp8_causal_qkptph_vph.co
fp8bf16,128,128,0,1,0,1,256,128,_ZN5aiter42fmha_fwd_hd128_fp8_qkptph_vph_group_gfx950E,fwd_hd128_fp8_qkptph_vph_group.co
fp8bf16,128,128,2,1,0,1,256,128,_ZN5aiter49fmha_fwd_hd128_fp8_causal_qkptph_vph_group_gfx950E,fwd_hd128_fp8_causal_qkptph_vph_group.co
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading